gists/atttt.py

#!/usr/bin/env python3

import sys
import numpy as np

from misc import *
from basic import Brain


def uniq_rows(a, return_index=False, return_inverse=False, return_counts=False):
    """Return the unique rows of the 2-D array ``a``.

    Every row is reinterpreted as one opaque ``np.void`` scalar covering the
    row's bytes, so ``np.unique`` compares whole rows at once.  The unique
    values are then viewed back as ``a.dtype`` with ``a.shape[1]`` columns.

    When none of the ``return_*`` flags is set, only the array of unique rows
    is returned.  Otherwise a tuple is returned: the unique rows first,
    followed by the extra arrays ``np.unique`` produced for the flags that
    were requested (in ``np.unique``'s own order).
    """
    # the byte-level view below needs each row to be laid out contiguously
    a = np.ascontiguousarray(a)
    width = a.shape[1]
    void_dtype = np.dtype((np.void, a.dtype.itemsize * width))
    ret = np.unique(
        a.view(void_dtype),
        return_index=return_index,
        return_inverse=return_inverse,
        return_counts=return_counts,
    )

    if not (return_index or return_inverse or return_counts):
        # np.unique hands back a bare array here, not a tuple
        return ret.view(a.dtype).reshape(-1, width)
    return (ret[0].view(a.dtype).reshape(-1, width),) + ret[1:]


class ATTTT():
    """Ask a brain for several candidate replies and keep the best-scoring one."""

    def __init__(self, brain):
        # `brain` must provide `.order` and `.reply(item=..., maxn=...)`,
        # which are the only members used by reply() below.
        self.brain = brain
        # scoring hook: reassign `score` to plug in a different function
        # taking (reply, maxn) and returning a comparable number.
        self.score = self._score


    def _score(self, reply, maxn):
        """Default score: heavily penalize replies longer than `maxn`, else 1."""
        if len(reply) > maxn:
            return -999999999

        return 1


    def reply(self, item=None, maxn=1000, raw=False, attempts=None):
        """Generate `attempts` candidate replies and return the best one.

        item     -- forwarded unchanged to `brain.reply`.
        maxn     -- length limit handed to the scoring function; the brain is
                    asked for `maxn + 1` so that over-long replies can show up
                    and be penalized instead of being cut off unnoticed.
        raw      -- when true, return the `(reply, score)` tuple rather than
                    just the reply string.
        attempts -- number of candidates to generate.  `None` selects
                    `2**12 / brain.order` (truncated, at least 1).

        Ties are resolved in favour of the earliest candidate.
        Raises ValueError if `attempts` is given and is less than 1.
        """
        if attempts is None:
            # clamp so that a very large order still yields one candidate
            attempts = max(1, int(2**12 / self.brain.order))
            lament('attempts:', attempts)
        elif attempts < 1:
            raise ValueError('attempts must be at least 1')

        best = None
        for _ in range(attempts):
            text = "".join(self.brain.reply(item=item, maxn=maxn+1))
            candidate = (text, self.score(text, maxn))
            # strict comparison keeps the first of equally scored candidates
            if best is None or candidate[1] > best[1]:
                best = candidate

        return best if raw else best[0]


class PatternBrain(Brain):
    """Brain that merges frequent adjacent symbol pairs into tokens before learning.

    Relies on `reset`, `learn` and `update` being available on the instance
    (presumably inherited from `Brain` -- confirm against basic.py).
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # not read anywhere in this class
        self.tokens = []


    def helper(self, v):
        """Wrap one token in a 1-tuple so it can be concatenated onto an item."""
        return (v,)


    @staticmethod
    def _non_overlapping(found):
        """Thin runs of adjacent matches in the boolean mask down to every other one.

        Adjacent rows of the pair table overlap by one symbol, so two adjacent
        matches cannot both be merged.  Within each run of consecutive True
        values, only the 1st, 3rd, 5th, ... are kept.
        """
        positions = np.arange(len(found))
        previous = np.zeros_like(found)
        previous[1:] = found[:-1]
        run_start = found & ~previous
        # index of the most recent run start at or before each position
        last_start = np.maximum.accumulate(np.where(run_start, positions, 0))
        return found & ((positions - last_start) % 2 == 0)


    def learn_all(self, items, merges=1):
        """Tokenize `items` by repeated pair merging, then learn every item.

        items  -- iterable of strings; newlines are stripped from both ends
                  of each one.
        merges -- > 0: perform at most this many merge rounds; a pair must
                       occur at least twice to be merged.
                  < 0: perform at most 65536 rounds; a pair must occur at
                       least `-merges` times (and always more than once).
                  0:   no merging, items are learned character by character.

        Each round replaces the most frequent adjacent pair of symbols with a
        new token.  Characters are stored as their code points (>= 0), -1 is
        padding between items, and tokens get ids -2, -3, ...  Merging stops
        early when no pair is frequent enough, or when the combined length of
        all token strings exceeds the length of the encoded input.

        Resets the brain, sets `self.padding` to '~', calls `self.learn` once
        per non-empty item with a tuple of token strings, then `self.update`.
        """
        min_count = 2
        if merges < 0:
            min_count = -merges
            merges = 65536

        # use numpy so this isn't nearly as disgustingly slow

        # sentinel for erased slots; nothing below writes it at the moment,
        # but the decoding loop still skips it.
        empty = np.iinfo(np.int32).min
        neg_lookup = {-1: ''} # default with padding

        alignment = 2

        # a lone padding element goes at the head and the tail
        # for easier conversion from sequences back to all_items later on
        pad = np.full(1, -1, dtype=np.int32)
        new_items = [pad]
        for item in items:
            item = item.strip('\n')
            # round up so there is at least 1 padding element at the end
            size = (len(item) + 1 + alignment // 2) // alignment * alignment
            new_item = np.full(size, -1, dtype=np.int32)
            new_item[:len(item)] = np.fromiter(map(ord, item), dtype=np.int32)
            new_items.append(new_item)
        new_items.append(pad)

        all_items = np.concatenate(new_items)

        if merges > 0:
            # one row per adjacent pair: row k is (all_items[k], all_items[k+1]).
            # every symbol is stored twice, which costs memory but avoids
            # having to .roll the array to see the other half of each pair.
            sequences = all_items.repeat(2)[1:-1].reshape(-1, 2).copy()

        for _ in range(merges):
            # pairs touching padding would span two items; never merge those
            # TODO: eventually check for empty here too
            has_padding = np.any(sequences == -1, axis=1)
            valid_sequences = sequences[~has_padding]
            if len(valid_sequences) == 0:
                # nothing left to count; counts.max() would raise on this
                lament('no more valid sequences')
                break

            unique, counts = uniq_rows(valid_sequences, return_counts=True)
            best = np.argmax(counts)
            count = counts[best]
            if count <= 1 or count < min_count:
                lament('no more valid sequences')
                break
            seq = unique[best]

            new_id = -1 - len(neg_lookup)
            neg_lookup[new_id] = "".join(
                neg_lookup[o] if o < 0 else chr(o) for o in seq)

            if sum(map(len, neg_lookup.values())) > len(all_items):
                lament('preventing dict from growing larger than source')
                break

            # replace our most common sequence in the sequences
            found = np.all(sequences == seq, axis=1)
            if seq[0] == seq[1]:
                # a pair of identical symbols matches itself at overlapping
                # positions inside a run ("aaa"); merging all of them would
                # drop symbols, so only merge non-overlapping matches.
                found = self._non_overlapping(found)
            before = np.roll(found, -1)
            after = np.roll(found, 1)
            # don't wrap around truth values
            before[-1] = False
            after[0] = False
            # the neighbouring rows now end / start with the new token
            sequences[before, 1] = new_id
            sequences[after, 0] = new_id
            # remove the "found" sequences
            replaced = int(np.count_nonzero(found))
            sequences = sequences[~found]

            print("({:8}) new token: {:5} \"{}\"".format(replaced, new_id, neg_lookup[new_id]))

        if merges > 0:
            # reconstruct all_items out of the sequences: the first column,
            # minus the padding element at the head
            all_items = sequences[1:, 0].copy()

        self.padding = '~'
        self.reset()
        pending = []
        for code in all_items:
            if code == -1:
                # padding closes the current item; skip repeated padding
                if not pending:
                    continue
                item = tuple()
                for sym in pending:
                    item += self.helper(neg_lookup[sym] if sym < 0 else chr(sym))
                self.learn(item)
                pending = []
            elif code != empty:
                pending.append(code)
        self.update()


def run(pname, args, env):
    """Command-line driver: load or train a PatternBrain, then print replies.

    pname -- program name, shown in the usage message.
    args  -- positional arguments: the input file, optionally followed by a
             state file to load from / save to.
    env   -- mapping with a `.get` method (e.g. `os.environ`) supplying the
             settings COUNT, ORDER, TEMPERATURE, MAXN, ATTEMPTS and MERGES.

    Exits with status 1 via `sys.exit` when the argument count is wrong.
    Returns None otherwise.
    """
    if not 1 <= len(args) <= 2:
        lament("usage: {} {{input file}} [state_fn file]".format(pname))
        sys.exit(1)

    args = dict(enumerate(args)) # for .get()

    fn = args[0]
    state_fn = args.get(1, None)

    count = int(env.get('COUNT', '8'))
    order = int(env.get('ORDER', '3'))
    temperature = float(env.get('TEMPERATURE', '0'))
    maxn = int(env.get('MAXN', '1000'))
    attempts = int(env.get('ATTEMPTS', '-1'))
    merges = int(env.get('MERGES', '0'))

    # a non-positive ATTEMPTS means "let the tool pick its default"
    if attempts <= 0:
        attempts = None

    brain = PatternBrain(order=order, temperature=temperature)
    tool = ATTTT(brain)

    lament('# loading')
    if state_fn:
        try:
            brain.load(state_fn, raw=False)
        except FileNotFoundError:
            # no saved state yet; the brain is trained from the input file
            pass

    if brain and brain.new:
        lament('# learning')
        # close the input file as soon as it has been read
        with open(fn) as f:
            lines = f.readlines()
        brain.learn_all(lines, merges)

    if brain and brain.new and state_fn:
        brain.save(state_fn, raw=False)

    lament('# replying')
    for _ in range(count):
        print(tool.reply(maxn=maxn, attempts=attempts))

if __name__ == '__main__':
    import os
    import sys

    # script entry point: split argv into the program name and its arguments,
    # then hand them to run() together with the process environment
    argv = sys.argv
    pname = argv[0] if argv else ''
    args = argv[1:] if len(argv) > 1 else []
    sys.exit(run(pname, args, os.environ))
. 2016-05-24 20:15:26 -07:00			`#!/usr/bin/env python3`

			`import sys`
			`import numpy as np`

			`from misc import *`
			`from basic import Brain`


			`def uniq_rows(a, return_index=False, return_inverse=False, return_counts=False):`
			`# black magic wrapper around np.unique`
			`# via np.dtype((np.void, a.dtype.itemsize * a.shape[1]))`
			`return_any = return_index or return_inverse or return_counts`
			`if not return_any:`
			`np.unique(a.view(np.dtype((np.void, a.dtype.itemsize * a.shape[1])))).view(a.dtype).reshape(-1, a.shape[1])`
			`else:`
			`void_dtype = np.dtype((np.void, a.dtype.itemsize * a.shape[1]))`
			`ret = np.unique(a.view(void_dtype), return_index, return_inverse, return_counts)`
			`return (ret[0].view(a.dtype).reshape(-1, a.shape[1]),) + ret[1:]`


			`class ATTTT():`

			`def __init__(self, brain):`
			`self.brain = brain`
			`self.score = self._score`


			`def _score(self, reply, maxn):`
			`if len(reply) > maxn:`
			`return -999999999`

			`#return len(reply)`
			`return 1`


			`def reply(self, item=None, maxn=1000, raw=False, attempts=None):`
			`if attempts == None:`
			`attempts = int(2**12 / self.brain.order)`
			`lament('attempts:', attempts)`

			`replies = []`
			`for i in range(attempts):`
			`reply = "".join(self.brain.reply(item=item, maxn=maxn+1))`
			`replies += [(reply, self.score(reply, maxn))]`

			`result = sorted(replies, key=lambda t: t[1], reverse=True)[0]`

			`if raw:`
			`return result`
			`else:`
			`return result[0]`


			`class PatternBrain(Brain):`

			`def __init__(self, args, *kwargs):`
			`super().__init__(args, *kwargs)`
			`self.tokens = []`


			`def helper(self, v):`
			`return (v,)`


			`def learn_all(self, items, merges=1):`
			`min_count = 2`
			`if merges < 0:`
			`min_count = -merges`
			`merges = 65536`

			`# use numpy so this isn't nearly as disgustingly slow`

			`int32_min = -2*(np.dtype(np.int32).itemsize 8 - 1)`
			`empty = int32_min`
			`neg_lookup = {-1: ''} # default with padding`

			`alignment = 2`
			`align = lambda x: (x + alignment // 2) // alignment * alignment`

			`new_items = []`
			`for item in items:`
			`item = item.strip('\n')`
			`# assert at least 1 padding character at the end`
			`next_biggest = align(len(item) + 1)`
			`# fill with padding (-1)`
			`new_item = -np.ones(next_biggest, dtype=np.int32)`
			`for i, c in enumerate(item):`
			`new_item[i] = ord(c)`
			`new_items.append(new_item)`

			`# add an extra padding item to the head and tail`
			`# for easier conversion from sequence back to all_items later on`
			`pad = -np.ones(1, dtype=np.int32)`
			`new_items.insert(0, pad)`
			`new_items.append(pad)`

			`all_items = np.concatenate(new_items)`

			`if merges > 0:`
			`# set up a 2d array to step through at half the row length,`
			`# this means double redundancy, to acquire all the sequences.`
			`# we don't have to .roll it later to get the other half,`
			`# though that would require less memory.`
			`sequences = all_items.repeat(2)[1:-1].reshape(-1, 2).copy()`

			`for i in range(merges):`
			`# learn`
			`most_common = (None, 1)`
			`# TODO: eventually check for empty here too`
			`invalid = np.any(sequences == -1, axis=1)`
			`valid_sequences = np.delete(sequences, np.where(invalid), axis=0)`
			`unique, counts = uniq_rows(valid_sequences, return_counts=True)`
			`count = counts.max()`

			`if count > most_common[1]:`
			`seq = unique[counts == count][0]`
			`most_common = (seq, count)`

			`if most_common[0] is None or most_common[1] <= 1 or most_common[1] < min_count:`
			`lament('no more valid sequences')`
			`break`

			`new_id = -1 - len(neg_lookup)`
			`neg_lookup[new_id] = "".join([o < 0 and neg_lookup[o] or chr(o) for o in most_common[0]])`

			`if len("".join(neg_lookup.values())) > len(all_items):`
			`lament('preventing dict from growing larger than source')`
			`break`

			`# replace our most common sequence in the sequences`
			`found = np.all(sequences == most_common[0], axis=1)`
			`before = np.roll(found, -1)`
			`after = np.roll(found, 1)`
			`# don't wrap around truth values`
			`before[-1] = False`
			`after[0] = False`
			`# or remove padding`
			`#before[0] = False`
			`#after[-1] = False`
			`# remove the "found" sequences`
			`befores = sequences[before].T.copy()`
			`befores[1] = new_id`
			`sequences[before] = befores.T`
			`afters = sequences[after].T.copy()`
			`afters[0] = new_id`
			`sequences[after] = afters.T`
			`#sequences[found] = [empty, empty]`
			`here = np.where(found)`
			`sequences = np.delete(sequences, here, axis=0)`

			`print("({:8}) new token: {:5} \"{}\"".format(len(here[0]), new_id, neg_lookup[new_id]))`

			`if merges > 0:`
			`# reconstruct all_items out of the sequences`
			`all_items = sequences.reshape(-1)[::2][1:].copy()`

			`self.padding = '~'`
			`self.reset()`
			`np_item = []`
			`for i in all_items:`
			`#for np_item in np.split(all_items, np.where(all_items == -1)):`
			`if i == -1:`
			`if len(np_item) == 0:`
			`continue`
			`item = tuple()`
			`for i in np_item:`
			`if i < 0:`
			`assert(i != -1)`
			`item += self.helper(neg_lookup[i])`
			`else:`
			`item += self.helper(chr(i))`
			`#die(np_item, item)`
			`self.learn(item)`
			`np_item = []`
			`elif i != empty:`
			`np_item.append(i)`
			`self.update()`


			`def run(pname, args, env):`
			`if not 1 <= len(args) <= 2:`
			`lament("usage: {} {{input file}} [state_fn file]".format(sys.argv[0]))`
			`sys.exit(1)`

			`args = dict(enumerate(args)) # for .get()`

			`fn = args[0]`
			`state_fn = args.get(1, None)`

			`count = int(env.get('COUNT', '8'))`
			`order = int(env.get('ORDER', '3'))`
			`temperature = float(env.get('TEMPERATURE', '0'))`
			`maxn = int(env.get('MAXN', '1000'))`
			`attempts = int(env.get('ATTEMPTS', '-1'))`
			`merges = int(env.get('MERGES', '0'))`

			`if attempts <= 0:`
			`attempts = None`

			`brain = PatternBrain(order=order, temperature=temperature)`
			`tool = ATTTT(brain)`

			`lament('# loading')`
			`if state_fn:`
			`try:`
			`brain.load(state_fn, raw=False)`
			`except FileNotFoundError:`
			`pass`

			`if brain and brain.new:`
			`lament('# learning')`
			`lines = open(fn).readlines()`
			`brain.learn_all(lines, merges)`

			`if brain and brain.new and state_fn:`
			`brain.save(state_fn, raw=False)`

			`lament('# replying')`
			`for i in range(count):`
			`#reply = tool.reply(maxn=maxn, raw=True, attempts=attempts)`
			`#print('{:6.1f}\t{}'.format(reply[1], reply[0]))`
			`print(tool.reply(maxn=maxn, attempts=attempts))`


			`if __name__ == '__main__':`
			`import sys`
			`import os`
			`pname = len(sys.argv) > 0 and sys.argv[0] or ''`
			`args = len(sys.argv) > 1 and sys.argv[1:] or []`
			`sys.exit(run(pname, args, os.environ))`