From db3171ac295ff2b0047381de58154be8a5ecd2d1 Mon Sep 17 00:00:00 2001 From: Connor Date: Tue, 24 May 2016 20:10:24 -0700 Subject: [PATCH 1/5] --- .dummy | 1 + 1 file changed, 1 insertion(+) create mode 100644 .dummy diff --git a/.dummy b/.dummy new file mode 100644 index 0000000..945c9b4 --- /dev/null +++ b/.dummy @@ -0,0 +1 @@ +. \ No newline at end of file From 28edd2907288d76af6b73bed41a884fce44e8743 Mon Sep 17 00:00:00 2001 From: Connor Olding Date: Tue, 24 May 2016 20:15:26 -0700 Subject: [PATCH 2/5] . --- .dummy | 1 - atttt.py | 231 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ basic.py | 196 ++++++++++++++++++++++++++++++++++++++++++++++ misc.py | 18 +++++ 4 files changed, 445 insertions(+), 1 deletion(-) delete mode 100644 .dummy create mode 100755 atttt.py create mode 100755 basic.py create mode 100755 misc.py diff --git a/.dummy b/.dummy deleted file mode 100644 index 945c9b4..0000000 --- a/.dummy +++ /dev/null @@ -1 +0,0 @@ -. \ No newline at end of file diff --git a/atttt.py b/atttt.py new file mode 100755 index 0000000..4e65b20 --- /dev/null +++ b/atttt.py @@ -0,0 +1,231 @@ +#!/usr/bin/env python3 + +import sys +import numpy as np + +from misc import * +from basic import Brain + + +def uniq_rows(a, return_index=False, return_inverse=False, return_counts=False): + # black magic wrapper around np.unique + # via np.dtype((np.void, a.dtype.itemsize * a.shape[1])) + return_any = return_index or return_inverse or return_counts + if not return_any: + np.unique(a.view(np.dtype((np.void, a.dtype.itemsize * a.shape[1])))).view(a.dtype).reshape(-1, a.shape[1]) + else: + void_dtype = np.dtype((np.void, a.dtype.itemsize * a.shape[1])) + ret = np.unique(a.view(void_dtype), return_index, return_inverse, return_counts) + return (ret[0].view(a.dtype).reshape(-1, a.shape[1]),) + ret[1:] + + +class ATTTT(): + + def __init__(self, brain): + self.brain = brain + self.score = self._score + + + def _score(self, reply, maxn): + if len(reply) > maxn: + return -999999999 + + #return len(reply) + return 1 + + + def reply(self, item=None, maxn=1000, raw=False, attempts=None): + if attempts == None: + attempts = int(2**12 / self.brain.order) + lament('attempts:', attempts) + + replies = [] + for i in range(attempts): + reply = "".join(self.brain.reply(item=item, maxn=maxn+1)) + replies += [(reply, self.score(reply, maxn))] + + result = sorted(replies, key=lambda t: t[1], reverse=True)[0] + + if raw: + return result + else: + return result[0] + + +class PatternBrain(Brain): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.tokens = [] + + + def helper(self, v): + return (v,) + + + def learn_all(self, items, merges=1): + min_count = 2 + if merges < 0: + min_count = -merges + merges = 65536 + + # use numpy so this isn't nearly as disgustingly slow + + int32_min = -2**(np.dtype(np.int32).itemsize * 8 - 1) + empty = int32_min + neg_lookup = {-1: ''} # default with padding + + alignment = 2 + align = lambda x: (x + alignment // 2) // alignment * alignment + + new_items = [] + for item in items: + item = item.strip('\n') + # assert at least 1 padding character at the end + next_biggest = align(len(item) + 1) + # fill with padding (-1) + new_item = -np.ones(next_biggest, dtype=np.int32) + for i, c in enumerate(item): + new_item[i] = ord(c) + new_items.append(new_item) + + # add an extra padding item to the head and tail + # for easier conversion from sequence back to all_items later on + pad = -np.ones(1, dtype=np.int32) + new_items.insert(0, pad) + 
new_items.append(pad) + + all_items = np.concatenate(new_items) + + if merges > 0: + # set up a 2d array to step through at half the row length, + # this means double redundancy, to acquire all the sequences. + # we don't have to .roll it later to get the other half, + # though that would require less memory. + sequences = all_items.repeat(2)[1:-1].reshape(-1, 2).copy() + + for i in range(merges): + # learn + most_common = (None, 1) + # TODO: eventually check for empty here too + invalid = np.any(sequences == -1, axis=1) + valid_sequences = np.delete(sequences, np.where(invalid), axis=0) + unique, counts = uniq_rows(valid_sequences, return_counts=True) + count = counts.max() + + if count > most_common[1]: + seq = unique[counts == count][0] + most_common = (seq, count) + + if most_common[0] is None or most_common[1] <= 1 or most_common[1] < min_count: + lament('no more valid sequences') + break + + new_id = -1 - len(neg_lookup) + neg_lookup[new_id] = "".join([o < 0 and neg_lookup[o] or chr(o) for o in most_common[0]]) + + if len("".join(neg_lookup.values())) > len(all_items): + lament('preventing dict from growing larger than source') + break + + # replace our most common sequence in the sequences + found = np.all(sequences == most_common[0], axis=1) + before = np.roll(found, -1) + after = np.roll(found, 1) + # don't wrap around truth values + before[-1] = False + after[0] = False + # or remove padding + #before[0] = False + #after[-1] = False + # remove the "found" sequences + befores = sequences[before].T.copy() + befores[1] = new_id + sequences[before] = befores.T + afters = sequences[after].T.copy() + afters[0] = new_id + sequences[after] = afters.T + #sequences[found] = [empty, empty] + here = np.where(found) + sequences = np.delete(sequences, here, axis=0) + + print("({:8}) new token: {:5} \"{}\"".format(len(here[0]), new_id, neg_lookup[new_id])) + + if merges > 0: + # reconstruct all_items out of the sequences + all_items = sequences.reshape(-1)[::2][1:].copy() + + self.padding = '~' + self.reset() + np_item = [] + for i in all_items: + #for np_item in np.split(all_items, np.where(all_items == -1)): + if i == -1: + if len(np_item) == 0: + continue + item = tuple() + for i in np_item: + if i < 0: + assert(i != -1) + item += self.helper(neg_lookup[i]) + else: + item += self.helper(chr(i)) + #die(np_item, item) + self.learn(item) + np_item = [] + elif i != empty: + np_item.append(i) + self.update() + + +def run(pname, args, env): + if not 1 <= len(args) <= 2: + lament("usage: {} {{input file}} [state_fn file]".format(sys.argv[0])) + sys.exit(1) + + args = dict(enumerate(args)) # for .get() + + fn = args[0] + state_fn = args.get(1, None) + + count = int(env.get('COUNT', '8')) + order = int(env.get('ORDER', '3')) + temperature = float(env.get('TEMPERATURE', '0')) + maxn = int(env.get('MAXN', '1000')) + attempts = int(env.get('ATTEMPTS', '-1')) + merges = int(env.get('MERGES', '0')) + + if attempts <= 0: + attempts = None + + brain = PatternBrain(order=order, temperature=temperature) + tool = ATTTT(brain) + + lament('# loading') + if state_fn: + try: + brain.load(state_fn, raw=False) + except FileNotFoundError: + pass + + if brain and brain.new: + lament('# learning') + lines = open(fn).readlines() + brain.learn_all(lines, merges) + + if brain and brain.new and state_fn: + brain.save(state_fn, raw=False) + + lament('# replying') + for i in range(count): + #reply = tool.reply(maxn=maxn, raw=True, attempts=attempts) + #print('{:6.1f}\t{}'.format(reply[1], reply[0])) + 
print(tool.reply(maxn=maxn, attempts=attempts)) + + +if __name__ == '__main__': + import sys + import os + pname = len(sys.argv) > 0 and sys.argv[0] or '' + args = len(sys.argv) > 1 and sys.argv[1:] or [] + sys.exit(run(pname, args, os.environ)) diff --git a/basic.py b/basic.py new file mode 100755 index 0000000..a19e174 --- /dev/null +++ b/basic.py @@ -0,0 +1,196 @@ +import math +import numpy as np + +from misc import * + + +def normalize(counter): + v = counter.values() + s = float(sum(v)) + m = float(max(v)) + del v + d = {} + for c, cnt in counter.items(): + d[c] = (cnt/s, cnt/m) + return d +# return [(c, cnt/s, cnt/m) for c, cnt in counter.items()] + + +def normalize_sorted(counter): + # mostly just for debugging i guess? + return sorted(normalize(counter), key=lambda t: t[1], reverse=True) + + +# http://nbviewer.jupyter.org/gist/yoavg/d76121dfde2618422139 +class Brain: + + # TODO: don't default padding here, but make sure it's set before running + # the reason is it's the only place that's specific to a string anymore + def __init__(self, order=1, temperature=0.5, padding="~"): + self.order = order + self.padding = padding + self.temperature = temperature + + self.reset() + + + def reset(self): + import collections as cool + # unnormalized + self._machine = cool.defaultdict(cool.Counter) + # normalized + self.machine = None + + self.type = None + self.dirty = False + self.new = True + + + @property + def temperature(self): + return self._temperature + + + @temperature.setter + def temperature(self, value): + self._temperature = value + + if value == 1: + # TODO: proper distribution stuff + self.random = lambda count: np.random.random(count)**2 + elif value == 0: + self.random = np.random.random + else: + # +0.25 = -0.0 + # +0.50 = +0.5 + # +0.75 = +1.0 + point75 = 1 + const = (point75 * 2 - 1) / math.atanh(0.75 * 2 - 1) + unbound = (math.atanh((1 - value) * 2 - 1) * const + 1) / 2 + self.random = easytruncnorm(0, 1, unbound, 0.25).rvs + + + def learn_all(self, items): + for item in items: + self.learn(item) + self.update() + + + def learn(self, item): + if self.type is None and item is not None: + self.type = type(item) + if type(item) is not self.type: + raise Exception("that's no good") + + if self.type == type("string"): + item = item.strip() + + if len(item) == 0: + return + + pad = self.helper(self.padding) * self.order + item = pad + item + pad + + stop = len(item) - self.order + if stop > 0: + for i in range(stop): + history, newitem = item[i:i+self.order], item[i+self.order] + self._machine[history][newitem] += 1 + + self.dirty = True + + + def update(self): + if self.dirty and self._machine: + self.machine = {hist:normalize(items) + for hist, items in self._machine.items()} + self.dirty = False + + + def next(self, history): + history = history[-self.order:] + + dist = self.machine.get(history, None) + if dist == None: + lament('warning: no value: {}'.format(history)) + return None + + x = self.random(1) + for c, v in dist.items(): + # if x <= v: # this is a bad idea + x = x - v[0] + if x <= 0: + return c + + + def helper(self, v): + return v + + + def reply(self, item=None, maxn=1000): + self.update() + + history = self.helper(self.padding) * self.order + + out = [] + for i in range(maxn): + c = self.next(history) + if c.find(self.padding) != -1: + out.append(c.replace(self.padding, '')) + break + history = history[-self.order:] + self.helper(c) + out.append(c) + + return out + + + def load(self, fn, raw=True): + import pickle + if type(fn) == type(''): + f = open(fn, 'rb') 
+ else: + f = fn + + d = pickle.load(f) + + if d['order'] != self.order: + lament('warning: order mismatch. cancelling load.') + return + self.order = d['order'] + + if raw: + if not d.get('_machine'): + lament('warning: no _machine. cancelling load.') + return + self._machine = d['_machine'] + + self.dirty = True + self.update() + else: + if not d.get('machine'): + lament('warning: no machine. cancelling load.') + return + self.machine = d['machine'] + + self.new = False + if f != fn: + f.close() + + + def save(self, fn, raw=True): + import pickle + if type(fn) == type(''): + f = open(fn, 'wb') + else: + f = fn + + d = {} + d['order'] = self.order + if raw: + d['_machine'] = self._machine + else: + d['machine'] = self.machine + pickle.dump(d, f) + + if f != fn: + f.close() diff --git a/misc.py b/misc.py new file mode 100755 index 0000000..e2cd2d7 --- /dev/null +++ b/misc.py @@ -0,0 +1,18 @@ +import sys +lament = lambda *args, **kwargs: print(*args, file=sys.stderr, **kwargs) + + +def die(*args, **kwargs): + lament(*args, **kwargs) + sys.exit(1) + + +def easytruncnorm(lower=0, upper=1, loc=0.5, scale=0.25): + import scipy.stats as stats + a = (lower - loc) / scale + b = (upper - loc) / scale + return stats.truncnorm(a=a, b=b, loc=loc, scale=scale) + + +# only make some things visible to "from misc import *" +__all__ = [o for o in locals() if type(o) != 'module' and not o.startswith('_')] From eeb5d2941e591ab488c495eb21cb5706dac423bd Mon Sep 17 00:00:00 2001 From: Connor Olding Date: Wed, 25 May 2016 07:31:48 -0700 Subject: [PATCH 3/5] . --- atttt.py | 156 ++++++++++++++++++++++++++++++++----------------------- basic.py | 11 ++-- misc.py | 1 + 3 files changed, 100 insertions(+), 68 deletions(-) diff --git a/atttt.py b/atttt.py index 4e65b20..0763bcc 100755 --- a/atttt.py +++ b/atttt.py @@ -8,8 +8,8 @@ from basic import Brain def uniq_rows(a, return_index=False, return_inverse=False, return_counts=False): + # via http://stackoverflow.com/a/16973510 # black magic wrapper around np.unique - # via np.dtype((np.void, a.dtype.itemsize * a.shape[1])) return_any = return_index or return_inverse or return_counts if not return_any: np.unique(a.view(np.dtype((np.void, a.dtype.itemsize * a.shape[1])))).view(a.dtype).reshape(-1, a.shape[1]) @@ -34,8 +34,9 @@ class ATTTT(): return 1 - def reply(self, item=None, maxn=1000, raw=False, attempts=None): + def reply(self, item=None, maxn=1000, include_scores=False, attempts=None): if attempts == None: + # just guess some value that'll take roughly the same amount of time attempts = int(2**12 / self.brain.order) lament('attempts:', attempts) @@ -46,7 +47,7 @@ class ATTTT(): result = sorted(replies, key=lambda t: t[1], reverse=True)[0] - if raw: + if include_scores: return result else: return result[0] @@ -63,56 +64,31 @@ class PatternBrain(Brain): return (v,) - def learn_all(self, items, merges=1): - min_count = 2 - if merges < 0: - min_count = -merges - merges = 65536 + def resolve_tokens(self, tokens): + # positive values are just unicode characters + return [o < 0 and self.tokens[o] or chr(o) for o in tokens] - # use numpy so this isn't nearly as disgustingly slow - int32_min = -2**(np.dtype(np.int32).itemsize * 8 - 1) - empty = int32_min - neg_lookup = {-1: ''} # default with padding + def new_token(self, value): + new_id = -1 - len(self.tokens) + self.tokens[new_id] = value + return new_id - alignment = 2 - align = lambda x: (x + alignment // 2) // alignment * alignment - new_items = [] - for item in items: - item = item.strip('\n') - # assert at least 1 
padding character at the end - next_biggest = align(len(item) + 1) - # fill with padding (-1) - new_item = -np.ones(next_biggest, dtype=np.int32) - for i, c in enumerate(item): - new_item[i] = ord(c) - new_items.append(new_item) - - # add an extra padding item to the head and tail - # for easier conversion from sequence back to all_items later on - pad = -np.ones(1, dtype=np.int32) - new_items.insert(0, pad) - new_items.append(pad) - - all_items = np.concatenate(new_items) - - if merges > 0: - # set up a 2d array to step through at half the row length, - # this means double redundancy, to acquire all the sequences. - # we don't have to .roll it later to get the other half, - # though that would require less memory. - sequences = all_items.repeat(2)[1:-1].reshape(-1, 2).copy() + def merge_all(self, all_items, merges, min_count=2): + # set up a 2d array to step through at half the row length; + # this means double redundancy; to acquire all the sequences. + # we could instead .roll it later to get the other half. + # that would require less memory, but memory isn't really a concern. + sequences = all_items.repeat(2)[1:-1].reshape(-1, 2).copy() for i in range(merges): - # learn - most_common = (None, 1) - # TODO: eventually check for empty here too invalid = np.any(sequences == -1, axis=1) valid_sequences = np.delete(sequences, np.where(invalid), axis=0) unique, counts = uniq_rows(valid_sequences, return_counts=True) count = counts.max() + most_common = (None, 1) if count > most_common[1]: seq = unique[counts == count][0] most_common = (seq, count) @@ -121,45 +97,85 @@ class PatternBrain(Brain): lament('no more valid sequences') break - new_id = -1 - len(neg_lookup) - neg_lookup[new_id] = "".join([o < 0 and neg_lookup[o] or chr(o) for o in most_common[0]]) + token_value = "".join(self.resolve_tokens(most_common[0])) + new_id = self.new_token(token_value) - if len("".join(neg_lookup.values())) > len(all_items): - lament('preventing dict from growing larger than source') + if len("".join(self.tokens.values())) > len(all_items): + # this might not ever occur + lament('preventing token dictionary from growing larger than source') break - # replace our most common sequence in the sequences + # replace the most common two-token sequence + # with one token to represent both found = np.all(sequences == most_common[0], axis=1) before = np.roll(found, -1) after = np.roll(found, 1) # don't wrap around truth values before[-1] = False after[0] = False - # or remove padding - #before[0] = False - #after[-1] = False # remove the "found" sequences + # and update the previous/next, + # not unlike a doubly-linked list. 
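+            # e.g. merging the pair (a, b) into a new token t:
+            # the preceding row (x, a) becomes (x, t), the following row
+            # (b, y) becomes (t, y), and the (a, b) rows themselves are
+            # deleted afterwards.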
befores = sequences[before].T.copy() befores[1] = new_id sequences[before] = befores.T afters = sequences[after].T.copy() afters[0] = new_id sequences[after] = afters.T - #sequences[found] = [empty, empty] here = np.where(found) sequences = np.delete(sequences, here, axis=0) - print("({:8}) new token: {:5} \"{}\"".format(len(here[0]), new_id, neg_lookup[new_id])) + print("new token id {:5} occurs {:8} times: \"{}\"".format(new_id, len(here[0]), self.tokens[new_id])) + + # TODO: find unused tokens + + # reconstruct all_items out of the sequences + all_items = sequences.reshape(-1)[::2][1:].copy() + return all_items + + + def learn_all(self, items, merges=0): + min_count = 2 # minimum number of occurences to stop creating tokens at + if merges < 0: + min_count = -merges + merges = 65536 # arbitrary sanity value + + # we'll use numpy matrices so this isn't nearly as disgustingly slow + + self.tokens = {-1: ''} # default with an empty padding token + + # we need to assert that the number of sequences is a multiple of this + # otherwise we can't .reshape() it to be two-dimensional later on + alignment = 2 + align = lambda x: (x + alignment // 2) // alignment * alignment + + new_items = [] + for item in items: + item = item.strip('\n') + # assert at least 1 padding character at the end + next_biggest = align(len(item) + 1) + # initialize with padding (-1) + new_item = -np.ones(next_biggest, dtype=np.int32) + for i, c in enumerate(item): + new_item[i] = ord(c) + new_items.append(new_item) + + # add an extra padding item to the head and tail + # to make it easier to convert from sequences back to items later on + pad = -np.ones(1, dtype=np.int32) + new_items.insert(0, pad) + new_items.append(pad) + + all_items = np.concatenate(new_items) if merges > 0: - # reconstruct all_items out of the sequences - all_items = sequences.reshape(-1)[::2][1:].copy() + all_items = self.merge_all(all_items, merges, min_count) + # begin the actual learning self.padding = '~' self.reset() np_item = [] for i in all_items: - #for np_item in np.split(all_items, np.where(all_items == -1)): if i == -1: if len(np_item) == 0: continue @@ -167,32 +183,40 @@ class PatternBrain(Brain): for i in np_item: if i < 0: assert(i != -1) - item += self.helper(neg_lookup[i]) + item += self.helper(self.tokens[i]) else: item += self.helper(chr(i)) #die(np_item, item) self.learn(item) np_item = [] - elif i != empty: + else: np_item.append(i) self.update() def run(pname, args, env): if not 1 <= len(args) <= 2: - lament("usage: {} {{input file}} [state_fn file]".format(sys.argv[0])) - sys.exit(1) + lament("usage: {} {{input file}} [savestate file]".format(pname)) + return 1 - args = dict(enumerate(args)) # for .get() + args = dict(enumerate(args)) # just for the .get() method fn = args[0] state_fn = args.get(1, None) + # the number of lines to output. count = int(env.get('COUNT', '8')) - order = int(env.get('ORDER', '3')) - temperature = float(env.get('TEMPERATURE', '0')) - maxn = int(env.get('MAXN', '1000')) + # learn and sample using this number of sequential tokens. + order = int(env.get('ORDER', '2')) + # how experimental to be with sampling. + # probably doesn't work properly. + temperature = float(env.get('TEMPERATURE', '0.5')) + # the max character length of output. (not guaranteed) + maxn = int(env.get('MAXN', '240')) + # attempts to maximize scoring attempts = int(env.get('ATTEMPTS', '-1')) + # if positive, maximum number of tokens to merge. + # if negative, minimum number of occurences to stop at. 
merges = int(env.get('MERGES', '0')) if attempts <= 0: @@ -201,11 +225,12 @@ def run(pname, args, env): brain = PatternBrain(order=order, temperature=temperature) tool = ATTTT(brain) - lament('# loading') if state_fn: + lament('# loading') try: brain.load(state_fn, raw=False) except FileNotFoundError: + lament('# no file to load. skipping') pass if brain and brain.new: @@ -214,6 +239,7 @@ def run(pname, args, env): brain.learn_all(lines, merges) if brain and brain.new and state_fn: + lament('# saving') brain.save(state_fn, raw=False) lament('# replying') @@ -222,6 +248,8 @@ def run(pname, args, env): #print('{:6.1f}\t{}'.format(reply[1], reply[0])) print(tool.reply(maxn=maxn, attempts=attempts)) + return 0 + if __name__ == '__main__': import sys diff --git a/basic.py b/basic.py index a19e174..417a5ab 100755 --- a/basic.py +++ b/basic.py @@ -24,12 +24,10 @@ def normalize_sorted(counter): # http://nbviewer.jupyter.org/gist/yoavg/d76121dfde2618422139 class Brain: - # TODO: don't default padding here, but make sure it's set before running - # the reason is it's the only place that's specific to a string anymore - def __init__(self, order=1, temperature=0.5, padding="~"): + def __init__(self, order=1, temperature=0.5): self.order = order - self.padding = padding self.temperature = temperature + self.padding = None self.reset() @@ -77,6 +75,8 @@ class Brain: def learn(self, item): + assert(self.padding) + if self.type is None and item is not None: self.type = type(item) if type(item) is not self.type: @@ -123,11 +123,14 @@ class Brain: return c + # for overriding in subclasses + # in case the input tokens aren't strings (e.g. tuples) def helper(self, v): return v def reply(self, item=None, maxn=1000): + assert(self.padding) self.update() history = self.helper(self.padding) * self.order diff --git a/misc.py b/misc.py index e2cd2d7..5fcff9d 100755 --- a/misc.py +++ b/misc.py @@ -3,6 +3,7 @@ lament = lambda *args, **kwargs: print(*args, file=sys.stderr, **kwargs) def die(*args, **kwargs): + # just for ad-hoc debugging really lament(*args, **kwargs) sys.exit(1) From e61a32c615e78457532766a5d4a2116e04568876 Mon Sep 17 00:00:00 2001 From: Connor Olding Date: Wed, 25 May 2016 11:30:07 -0700 Subject: [PATCH 4/5] . 
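Refactor atttt.py and trim misc.py:

* hoist the alignment arithmetic out of learn_all() into a module-level
  align(x, alignment)
* extract prepare_items() from learn_all()
* let resolve_tokens() accept a single (plain or numpy) integer id as
  well as a sequence of ids
* add stat_tokens() to report how often each token occurs, most common
  first, and call it from learn_all() after merging
* report merge progress through lament() so replies remain the only
  stdout output
* drop easytruncnorm() from misc.py (its only caller, the temperature
  setter in basic.py, is rewritten in the next patch)

The ranking inside stat_tokens() reduces to a common numpy idiom.
A standalone sketch with illustrative values (not code from this patch):

    import numpy as np

    ids = np.array([5, -2, 5, 7, -2, 5])  # token ids; negatives are merged tokens
    unique, counts = np.unique(ids, return_counts=True)
    order = np.argsort(counts)[::-1]      # indices of the most frequent first
    for token_id, count in zip(unique[order], counts[order]):
        print(token_id, count)            # 5 3, then -2 2, then 7 1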
--- atttt.py | 83 ++++++++++++++++++++++++++++++++++++++------------------ misc.py | 8 ------ 2 files changed, 56 insertions(+), 35 deletions(-) diff --git a/atttt.py b/atttt.py index 0763bcc..f9f0af7 100755 --- a/atttt.py +++ b/atttt.py @@ -7,6 +7,10 @@ from misc import * from basic import Brain +def align(x, alignment): + return (x + alignment // 2) // alignment * alignment + + def uniq_rows(a, return_index=False, return_inverse=False, return_counts=False): # via http://stackoverflow.com/a/16973510 # black magic wrapper around np.unique @@ -66,7 +70,10 @@ class PatternBrain(Brain): def resolve_tokens(self, tokens): # positive values are just unicode characters - return [o < 0 and self.tokens[o] or chr(o) for o in tokens] + if isinstance(tokens, int) or isinstance(tokens, np.int32): + return tokens < 0 and self.tokens[tokens] or chr(tokens) + else: + return [o < 0 and self.tokens[o] or chr(o) for o in tokens] def new_token(self, value): @@ -75,6 +82,46 @@ class PatternBrain(Brain): return new_id + @staticmethod + def prepare_items(items, pad=True): + new_items = [] + for item in items: + item = item.strip('\n') + # assert that the number of sequences is a multiple of 2 + # otherwise we can't .reshape() it to be two-dimensional later on + next_biggest = align(len(item) + 1, 2) + # initialize with padding (-1) + new_item = -np.ones(next_biggest, dtype=np.int32) + for i, c in enumerate(item): + new_item[i] = ord(c) + new_items.append(new_item) + + # add an extra padding item to the head and tail + # to make it easier to convert from sequences back to items later on + if pad: + pad = -np.ones(1, dtype=np.int32) + new_items.insert(0, pad) + new_items.append(pad) + + return np.concatenate(new_items) + + + def stat_tokens(self, all_items, skip_normal=False): + unique, counts = np.unique(all_items, return_counts=True) + count_order = np.argsort(counts)[::-1] + counts_descending = counts[count_order] + unique_descending = unique[count_order] + for i, token_id in enumerate(unique_descending): + if token_id == -1: + continue + if skip_normal and token_id >= 0: + continue + token = self.resolve_tokens(token_id) + lament("token id {:5} occurs {:8} times: \"{}\"".format( + token_id, counts_descending[i], token)) + lament("total tokens: {:5}".format(i + 1)) + + def merge_all(self, all_items, merges, min_count=2): # set up a 2d array to step through at half the row length; # this means double redundancy; to acquire all the sequences. @@ -125,16 +172,17 @@ class PatternBrain(Brain): here = np.where(found) sequences = np.delete(sequences, here, axis=0) - print("new token id {:5} occurs {:8} times: \"{}\"".format(new_id, len(here[0]), self.tokens[new_id])) + lament("new token id {:5} occurs {:8} times: \"{}\"".format( + new_id, len(here[0]), self.tokens[new_id])) - # TODO: find unused tokens + # TODO: find unused tokens? 
# reconstruct all_items out of the sequences all_items = sequences.reshape(-1)[::2][1:].copy() return all_items - def learn_all(self, items, merges=0): + def learn_all(self, items, merges=0, stat=True): min_count = 2 # minimum number of occurences to stop creating tokens at if merges < 0: min_count = -merges @@ -144,29 +192,7 @@ class PatternBrain(Brain): self.tokens = {-1: ''} # default with an empty padding token - # we need to assert that the number of sequences is a multiple of this - # otherwise we can't .reshape() it to be two-dimensional later on - alignment = 2 - align = lambda x: (x + alignment // 2) // alignment * alignment - - new_items = [] - for item in items: - item = item.strip('\n') - # assert at least 1 padding character at the end - next_biggest = align(len(item) + 1) - # initialize with padding (-1) - new_item = -np.ones(next_biggest, dtype=np.int32) - for i, c in enumerate(item): - new_item[i] = ord(c) - new_items.append(new_item) - - # add an extra padding item to the head and tail - # to make it easier to convert from sequences back to items later on - pad = -np.ones(1, dtype=np.int32) - new_items.insert(0, pad) - new_items.append(pad) - - all_items = np.concatenate(new_items) + all_items = self.prepare_items(items) if merges > 0: all_items = self.merge_all(all_items, merges, min_count) @@ -193,6 +219,9 @@ class PatternBrain(Brain): np_item.append(i) self.update() + if merges != 0 and stat: + self.stat_tokens(all_items) + def run(pname, args, env): if not 1 <= len(args) <= 2: diff --git a/misc.py b/misc.py index 5fcff9d..36da0d4 100755 --- a/misc.py +++ b/misc.py @@ -8,12 +8,4 @@ def die(*args, **kwargs): sys.exit(1) -def easytruncnorm(lower=0, upper=1, loc=0.5, scale=0.25): - import scipy.stats as stats - a = (lower - loc) / scale - b = (upper - loc) / scale - return stats.truncnorm(a=a, b=b, loc=loc, scale=scale) - - -# only make some things visible to "from misc import *" __all__ = [o for o in locals() if type(o) != 'module' and not o.startswith('_')] From b028ee53d9ee494579bbb1ba9b1cc8e29b911721 Mon Sep 17 00:00:00 2001 From: Connor Olding Date: Wed, 25 May 2016 11:37:34 -0700 Subject: [PATCH 5/5] . --- atttt.py | 10 +--------- basic.py | 40 +++++++++++++++------------------------- 2 files changed, 16 insertions(+), 34 deletions(-) diff --git a/atttt.py b/atttt.py index f9f0af7..02f662c 100755 --- a/atttt.py +++ b/atttt.py @@ -60,7 +60,7 @@ class ATTTT(): class PatternBrain(Brain): def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + super().__init__(*args, padding='~', **kwargs) self.tokens = [] @@ -147,11 +147,6 @@ class PatternBrain(Brain): token_value = "".join(self.resolve_tokens(most_common[0])) new_id = self.new_token(token_value) - if len("".join(self.tokens.values())) > len(all_items): - # this might not ever occur - lament('preventing token dictionary from growing larger than source') - break - # replace the most common two-token sequence # with one token to represent both found = np.all(sequences == most_common[0], axis=1) @@ -175,8 +170,6 @@ class PatternBrain(Brain): lament("new token id {:5} occurs {:8} times: \"{}\"".format( new_id, len(here[0]), self.tokens[new_id])) - # TODO: find unused tokens? 
- # reconstruct all_items out of the sequences all_items = sequences.reshape(-1)[::2][1:].copy() return all_items @@ -198,7 +191,6 @@ class PatternBrain(Brain): all_items = self.merge_all(all_items, merges, min_count) # begin the actual learning - self.padding = '~' self.reset() np_item = [] for i in all_items: diff --git a/basic.py b/basic.py index 417a5ab..0dc1416 100755 --- a/basic.py +++ b/basic.py @@ -9,25 +9,24 @@ def normalize(counter): s = float(sum(v)) m = float(max(v)) del v - d = {} - for c, cnt in counter.items(): - d[c] = (cnt/s, cnt/m) - return d -# return [(c, cnt/s, cnt/m) for c, cnt in counter.items()] + return [(c, cnt/s, cnt/m) for c, cnt in counter.items()] def normalize_sorted(counter): - # mostly just for debugging i guess? + # if the elements were unsorted, + # we couldn't use our lazy method (subtraction) of selecting tokens + # and temperature would correspond to arbitrary tokens + # instead of more/less common tokens. return sorted(normalize(counter), key=lambda t: t[1], reverse=True) # http://nbviewer.jupyter.org/gist/yoavg/d76121dfde2618422139 class Brain: - def __init__(self, order=1, temperature=0.5): + def __init__(self, padding, order=1, temperature=0.5): self.order = order self.temperature = temperature - self.padding = None + self.padding = padding self.reset() @@ -51,21 +50,13 @@ class Brain: @temperature.setter def temperature(self, value): + assert(0 < value < 1) self._temperature = value - if value == 1: - # TODO: proper distribution stuff - self.random = lambda count: np.random.random(count)**2 - elif value == 0: - self.random = np.random.random - else: - # +0.25 = -0.0 - # +0.50 = +0.5 - # +0.75 = +1.0 - point75 = 1 - const = (point75 * 2 - 1) / math.atanh(0.75 * 2 - 1) - unbound = (math.atanh((1 - value) * 2 - 1) * const + 1) / 2 - self.random = easytruncnorm(0, 1, unbound, 0.25).rvs + a = 1 - value * 2 + # http://www.mathopenref.com/graphfunctions.html?fx=(a*x-x)/(2*a*x-a-1)&sg=f&sh=f&xh=1&xl=0&yh=1&yl=0&ah=1&al=-1&a=0.5 + tweak = lambda x: (a * x - x) / (2 * a * x - a - 1) + self.random = lambda n: 1 - tweak(np.random.random(n)) def learn_all(self, items): @@ -102,7 +93,7 @@ class Brain: def update(self): if self.dirty and self._machine: - self.machine = {hist:normalize(items) + self.machine = {hist: normalize_sorted(items) for hist, items in self._machine.items()} self.dirty = False @@ -116,9 +107,8 @@ class Brain: return None x = self.random(1) - for c, v in dist.items(): - # if x <= v: # this is a bad idea - x = x - v[0] + for c, cs, cm in dist: + x = x - cs if x <= 0: return c
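
Notes on the series:

* The temperature curve introduced in PATCH 5/5 maps a uniform sample
  through tweak(x) = (a*x - x) / (2*a*x - a - 1) with a = 1 - 2*value.
  This is algebraically Schlick's bias function,
  bias_t(x) = x / ((1/t - 2)*(1 - x) + 1), with t = value: value = 0.5
  leaves the sample untouched, values toward 1 bias picks toward each
  history's most common tokens, and values toward 0 bias them toward
  rarer ones. A standalone sketch checking the equivalence (not code
  from the patches):

      import numpy as np

      def tweak(x, value):
          # the curve from the temperature setter in basic.py
          a = 1 - value * 2
          return (a * x - x) / (2 * a * x - a - 1)

      def schlick_bias(x, t):
          # Schlick, "Fast Alternatives to Perlin's Bias and Gain
          # Functions", Graphics Gems IV (1994)
          return x / ((1 / t - 2) * (1 - x) + 1)

      x = np.linspace(0, 1, 101)
      for value in (0.1, 0.25, 0.5, 0.75, 0.9):
          assert np.allclose(tweak(x, value), schlick_bias(x, value))
      print("tweak matches schlick_bias for all tested temperatures")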
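* uniq_rows() (PATCH 2/5) works around np.unique flattening
  multidimensional input by viewing each row as one opaque np.void
  scalar (the stackoverflow answer cited in PATCH 3/5); numpy 1.13+
  can do np.unique(a, axis=0) directly. Note also that the early
  branch (no return_* flags set) computes its result but never returns
  it; the series always calls uniq_rows(..., return_counts=True), so
  that bug stays dormant. A minimal demonstration of the view trick:

      import numpy as np

      a = np.array([[1, 2], [3, 4], [1, 2]], dtype=np.int32)
      # one opaque scalar spanning each full row
      void = np.dtype((np.void, a.dtype.itemsize * a.shape[1]))
      rows = np.ascontiguousarray(a).view(void)
      unique, counts = np.unique(rows, return_counts=True)
      unique = unique.view(a.dtype).reshape(-1, a.shape[1])
      print(unique)  # [[1 2]
                     #  [3 4]]
      print(counts)  # [2 1]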