From db3171ac295ff2b0047381de58154be8a5ecd2d1 Mon Sep 17 00:00:00 2001 From: Connor Date: Tue, 24 May 2016 20:10:24 -0700 Subject: [PATCH 1/5] --- .dummy | 1 + 1 file changed, 1 insertion(+) create mode 100644 .dummy diff --git a/.dummy b/.dummy new file mode 100644 index 0000000..945c9b4 --- /dev/null +++ b/.dummy @@ -0,0 +1 @@ +. \ No newline at end of file From 28edd2907288d76af6b73bed41a884fce44e8743 Mon Sep 17 00:00:00 2001 From: Connor Olding Date: Tue, 24 May 2016 20:15:26 -0700 Subject: [PATCH 2/5] . --- .dummy | 1 - atttt.py | 231 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ basic.py | 196 ++++++++++++++++++++++++++++++++++++++++++++++ misc.py | 18 +++++ 4 files changed, 445 insertions(+), 1 deletion(-) delete mode 100644 .dummy create mode 100755 atttt.py create mode 100755 basic.py create mode 100755 misc.py diff --git a/.dummy b/.dummy deleted file mode 100644 index 945c9b4..0000000 --- a/.dummy +++ /dev/null @@ -1 +0,0 @@ -. \ No newline at end of file diff --git a/atttt.py b/atttt.py new file mode 100755 index 0000000..4e65b20 --- /dev/null +++ b/atttt.py @@ -0,0 +1,231 @@ +#!/usr/bin/env python3 + +import sys +import numpy as np + +from misc import * +from basic import Brain + + +def uniq_rows(a, return_index=False, return_inverse=False, return_counts=False): + # black magic wrapper around np.unique + # via np.dtype((np.void, a.dtype.itemsize * a.shape[1])) + return_any = return_index or return_inverse or return_counts + if not return_any: + np.unique(a.view(np.dtype((np.void, a.dtype.itemsize * a.shape[1])))).view(a.dtype).reshape(-1, a.shape[1]) + else: + void_dtype = np.dtype((np.void, a.dtype.itemsize * a.shape[1])) + ret = np.unique(a.view(void_dtype), return_index, return_inverse, return_counts) + return (ret[0].view(a.dtype).reshape(-1, a.shape[1]),) + ret[1:] + + +class ATTTT(): + + def __init__(self, brain): + self.brain = brain + self.score = self._score + + + def _score(self, reply, maxn): + if len(reply) > maxn: + return -999999999 + + #return len(reply) + return 1 + + + def reply(self, item=None, maxn=1000, raw=False, attempts=None): + if attempts == None: + attempts = int(2**12 / self.brain.order) + lament('attempts:', attempts) + + replies = [] + for i in range(attempts): + reply = "".join(self.brain.reply(item=item, maxn=maxn+1)) + replies += [(reply, self.score(reply, maxn))] + + result = sorted(replies, key=lambda t: t[1], reverse=True)[0] + + if raw: + return result + else: + return result[0] + + +class PatternBrain(Brain): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.tokens = [] + + + def helper(self, v): + return (v,) + + + def learn_all(self, items, merges=1): + min_count = 2 + if merges < 0: + min_count = -merges + merges = 65536 + + # use numpy so this isn't nearly as disgustingly slow + + int32_min = -2**(np.dtype(np.int32).itemsize * 8 - 1) + empty = int32_min + neg_lookup = {-1: ''} # default with padding + + alignment = 2 + align = lambda x: (x + alignment // 2) // alignment * alignment + + new_items = [] + for item in items: + item = item.strip('\n') + # assert at least 1 padding character at the end + next_biggest = align(len(item) + 1) + # fill with padding (-1) + new_item = -np.ones(next_biggest, dtype=np.int32) + for i, c in enumerate(item): + new_item[i] = ord(c) + new_items.append(new_item) + + # add an extra padding item to the head and tail + # for easier conversion from sequence back to all_items later on + pad = -np.ones(1, dtype=np.int32) + new_items.insert(0, pad) + 
new_items.append(pad) + + all_items = np.concatenate(new_items) + + if merges > 0: + # set up a 2d array to step through at half the row length, + # this means double redundancy, to acquire all the sequences. + # we don't have to .roll it later to get the other half, + # though that would require less memory. + sequences = all_items.repeat(2)[1:-1].reshape(-1, 2).copy() + + for i in range(merges): + # learn + most_common = (None, 1) + # TODO: eventually check for empty here too + invalid = np.any(sequences == -1, axis=1) + valid_sequences = np.delete(sequences, np.where(invalid), axis=0) + unique, counts = uniq_rows(valid_sequences, return_counts=True) + count = counts.max() + + if count > most_common[1]: + seq = unique[counts == count][0] + most_common = (seq, count) + + if most_common[0] is None or most_common[1] <= 1 or most_common[1] < min_count: + lament('no more valid sequences') + break + + new_id = -1 - len(neg_lookup) + neg_lookup[new_id] = "".join([o < 0 and neg_lookup[o] or chr(o) for o in most_common[0]]) + + if len("".join(neg_lookup.values())) > len(all_items): + lament('preventing dict from growing larger than source') + break + + # replace our most common sequence in the sequences + found = np.all(sequences == most_common[0], axis=1) + before = np.roll(found, -1) + after = np.roll(found, 1) + # don't wrap around truth values + before[-1] = False + after[0] = False + # or remove padding + #before[0] = False + #after[-1] = False + # remove the "found" sequences + befores = sequences[before].T.copy() + befores[1] = new_id + sequences[before] = befores.T + afters = sequences[after].T.copy() + afters[0] = new_id + sequences[after] = afters.T + #sequences[found] = [empty, empty] + here = np.where(found) + sequences = np.delete(sequences, here, axis=0) + + print("({:8}) new token: {:5} \"{}\"".format(len(here[0]), new_id, neg_lookup[new_id])) + + if merges > 0: + # reconstruct all_items out of the sequences + all_items = sequences.reshape(-1)[::2][1:].copy() + + self.padding = '~' + self.reset() + np_item = [] + for i in all_items: + #for np_item in np.split(all_items, np.where(all_items == -1)): + if i == -1: + if len(np_item) == 0: + continue + item = tuple() + for i in np_item: + if i < 0: + assert(i != -1) + item += self.helper(neg_lookup[i]) + else: + item += self.helper(chr(i)) + #die(np_item, item) + self.learn(item) + np_item = [] + elif i != empty: + np_item.append(i) + self.update() + + +def run(pname, args, env): + if not 1 <= len(args) <= 2: + lament("usage: {} {{input file}} [state_fn file]".format(sys.argv[0])) + sys.exit(1) + + args = dict(enumerate(args)) # for .get() + + fn = args[0] + state_fn = args.get(1, None) + + count = int(env.get('COUNT', '8')) + order = int(env.get('ORDER', '3')) + temperature = float(env.get('TEMPERATURE', '0')) + maxn = int(env.get('MAXN', '1000')) + attempts = int(env.get('ATTEMPTS', '-1')) + merges = int(env.get('MERGES', '0')) + + if attempts <= 0: + attempts = None + + brain = PatternBrain(order=order, temperature=temperature) + tool = ATTTT(brain) + + lament('# loading') + if state_fn: + try: + brain.load(state_fn, raw=False) + except FileNotFoundError: + pass + + if brain and brain.new: + lament('# learning') + lines = open(fn).readlines() + brain.learn_all(lines, merges) + + if brain and brain.new and state_fn: + brain.save(state_fn, raw=False) + + lament('# replying') + for i in range(count): + #reply = tool.reply(maxn=maxn, raw=True, attempts=attempts) + #print('{:6.1f}\t{}'.format(reply[1], reply[0])) + 
print(tool.reply(maxn=maxn, attempts=attempts)) + + +if __name__ == '__main__': + import sys + import os + pname = len(sys.argv) > 0 and sys.argv[0] or '' + args = len(sys.argv) > 1 and sys.argv[1:] or [] + sys.exit(run(pname, args, os.environ)) diff --git a/basic.py b/basic.py new file mode 100755 index 0000000..a19e174 --- /dev/null +++ b/basic.py @@ -0,0 +1,196 @@ +import math +import numpy as np + +from misc import * + + +def normalize(counter): + v = counter.values() + s = float(sum(v)) + m = float(max(v)) + del v + d = {} + for c, cnt in counter.items(): + d[c] = (cnt/s, cnt/m) + return d +# return [(c, cnt/s, cnt/m) for c, cnt in counter.items()] + + +def normalize_sorted(counter): + # mostly just for debugging i guess? + return sorted(normalize(counter), key=lambda t: t[1], reverse=True) + + +# http://nbviewer.jupyter.org/gist/yoavg/d76121dfde2618422139 +class Brain: + + # TODO: don't default padding here, but make sure it's set before running + # the reason is it's the only place that's specific to a string anymore + def __init__(self, order=1, temperature=0.5, padding="~"): + self.order = order + self.padding = padding + self.temperature = temperature + + self.reset() + + + def reset(self): + import collections as cool + # unnormalized + self._machine = cool.defaultdict(cool.Counter) + # normalized + self.machine = None + + self.type = None + self.dirty = False + self.new = True + + + @property + def temperature(self): + return self._temperature + + + @temperature.setter + def temperature(self, value): + self._temperature = value + + if value == 1: + # TODO: proper distribution stuff + self.random = lambda count: np.random.random(count)**2 + elif value == 0: + self.random = np.random.random + else: + # +0.25 = -0.0 + # +0.50 = +0.5 + # +0.75 = +1.0 + point75 = 1 + const = (point75 * 2 - 1) / math.atanh(0.75 * 2 - 1) + unbound = (math.atanh((1 - value) * 2 - 1) * const + 1) / 2 + self.random = easytruncnorm(0, 1, unbound, 0.25).rvs + + + def learn_all(self, items): + for item in items: + self.learn(item) + self.update() + + + def learn(self, item): + if self.type is None and item is not None: + self.type = type(item) + if type(item) is not self.type: + raise Exception("that's no good") + + if self.type == type("string"): + item = item.strip() + + if len(item) == 0: + return + + pad = self.helper(self.padding) * self.order + item = pad + item + pad + + stop = len(item) - self.order + if stop > 0: + for i in range(stop): + history, newitem = item[i:i+self.order], item[i+self.order] + self._machine[history][newitem] += 1 + + self.dirty = True + + + def update(self): + if self.dirty and self._machine: + self.machine = {hist:normalize(items) + for hist, items in self._machine.items()} + self.dirty = False + + + def next(self, history): + history = history[-self.order:] + + dist = self.machine.get(history, None) + if dist == None: + lament('warning: no value: {}'.format(history)) + return None + + x = self.random(1) + for c, v in dist.items(): + # if x <= v: # this is a bad idea + x = x - v[0] + if x <= 0: + return c + + + def helper(self, v): + return v + + + def reply(self, item=None, maxn=1000): + self.update() + + history = self.helper(self.padding) * self.order + + out = [] + for i in range(maxn): + c = self.next(history) + if c.find(self.padding) != -1: + out.append(c.replace(self.padding, '')) + break + history = history[-self.order:] + self.helper(c) + out.append(c) + + return out + + + def load(self, fn, raw=True): + import pickle + if type(fn) == type(''): + f = open(fn, 'rb') 
+ else: + f = fn + + d = pickle.load(f) + + if d['order'] != self.order: + lament('warning: order mismatch. cancelling load.') + return + self.order = d['order'] + + if raw: + if not d.get('_machine'): + lament('warning: no _machine. cancelling load.') + return + self._machine = d['_machine'] + + self.dirty = True + self.update() + else: + if not d.get('machine'): + lament('warning: no machine. cancelling load.') + return + self.machine = d['machine'] + + self.new = False + if f != fn: + f.close() + + + def save(self, fn, raw=True): + import pickle + if type(fn) == type(''): + f = open(fn, 'wb') + else: + f = fn + + d = {} + d['order'] = self.order + if raw: + d['_machine'] = self._machine + else: + d['machine'] = self.machine + pickle.dump(d, f) + + if f != fn: + f.close() diff --git a/misc.py b/misc.py new file mode 100755 index 0000000..e2cd2d7 --- /dev/null +++ b/misc.py @@ -0,0 +1,18 @@ +import sys +lament = lambda *args, **kwargs: print(*args, file=sys.stderr, **kwargs) + + +def die(*args, **kwargs): + lament(*args, **kwargs) + sys.exit(1) + + +def easytruncnorm(lower=0, upper=1, loc=0.5, scale=0.25): + import scipy.stats as stats + a = (lower - loc) / scale + b = (upper - loc) / scale + return stats.truncnorm(a=a, b=b, loc=loc, scale=scale) + + +# only make some things visible to "from misc import *" +__all__ = [o for o in locals() if type(o) != 'module' and not o.startswith('_')] From eeb5d2941e591ab488c495eb21cb5706dac423bd Mon Sep 17 00:00:00 2001 From: Connor Olding Date: Wed, 25 May 2016 07:31:48 -0700 Subject: [PATCH 3/5] . --- atttt.py | 156 ++++++++++++++++++++++++++++++++----------------------- basic.py | 11 ++-- misc.py | 1 + 3 files changed, 100 insertions(+), 68 deletions(-) diff --git a/atttt.py b/atttt.py index 4e65b20..0763bcc 100755 --- a/atttt.py +++ b/atttt.py @@ -8,8 +8,8 @@ from basic import Brain def uniq_rows(a, return_index=False, return_inverse=False, return_counts=False): + # via http://stackoverflow.com/a/16973510 # black magic wrapper around np.unique - # via np.dtype((np.void, a.dtype.itemsize * a.shape[1])) return_any = return_index or return_inverse or return_counts if not return_any: np.unique(a.view(np.dtype((np.void, a.dtype.itemsize * a.shape[1])))).view(a.dtype).reshape(-1, a.shape[1]) @@ -34,8 +34,9 @@ class ATTTT(): return 1 - def reply(self, item=None, maxn=1000, raw=False, attempts=None): + def reply(self, item=None, maxn=1000, include_scores=False, attempts=None): if attempts == None: + # just guess some value that'll take roughly the same amount of time attempts = int(2**12 / self.brain.order) lament('attempts:', attempts) @@ -46,7 +47,7 @@ class ATTTT(): result = sorted(replies, key=lambda t: t[1], reverse=True)[0] - if raw: + if include_scores: return result else: return result[0] @@ -63,56 +64,31 @@ class PatternBrain(Brain): return (v,) - def learn_all(self, items, merges=1): - min_count = 2 - if merges < 0: - min_count = -merges - merges = 65536 + def resolve_tokens(self, tokens): + # positive values are just unicode characters + return [o < 0 and self.tokens[o] or chr(o) for o in tokens] - # use numpy so this isn't nearly as disgustingly slow - int32_min = -2**(np.dtype(np.int32).itemsize * 8 - 1) - empty = int32_min - neg_lookup = {-1: ''} # default with padding + def new_token(self, value): + new_id = -1 - len(self.tokens) + self.tokens[new_id] = value + return new_id - alignment = 2 - align = lambda x: (x + alignment // 2) // alignment * alignment - new_items = [] - for item in items: - item = item.strip('\n') - # assert at least 1 
padding character at the end - next_biggest = align(len(item) + 1) - # fill with padding (-1) - new_item = -np.ones(next_biggest, dtype=np.int32) - for i, c in enumerate(item): - new_item[i] = ord(c) - new_items.append(new_item) - - # add an extra padding item to the head and tail - # for easier conversion from sequence back to all_items later on - pad = -np.ones(1, dtype=np.int32) - new_items.insert(0, pad) - new_items.append(pad) - - all_items = np.concatenate(new_items) - - if merges > 0: - # set up a 2d array to step through at half the row length, - # this means double redundancy, to acquire all the sequences. - # we don't have to .roll it later to get the other half, - # though that would require less memory. - sequences = all_items.repeat(2)[1:-1].reshape(-1, 2).copy() + def merge_all(self, all_items, merges, min_count=2): + # set up a 2d array to step through at half the row length; + # this means double redundancy; to acquire all the sequences. + # we could instead .roll it later to get the other half. + # that would require less memory, but memory isn't really a concern. + sequences = all_items.repeat(2)[1:-1].reshape(-1, 2).copy() for i in range(merges): - # learn - most_common = (None, 1) - # TODO: eventually check for empty here too invalid = np.any(sequences == -1, axis=1) valid_sequences = np.delete(sequences, np.where(invalid), axis=0) unique, counts = uniq_rows(valid_sequences, return_counts=True) count = counts.max() + most_common = (None, 1) if count > most_common[1]: seq = unique[counts == count][0] most_common = (seq, count) @@ -121,45 +97,85 @@ class PatternBrain(Brain): lament('no more valid sequences') break - new_id = -1 - len(neg_lookup) - neg_lookup[new_id] = "".join([o < 0 and neg_lookup[o] or chr(o) for o in most_common[0]]) + token_value = "".join(self.resolve_tokens(most_common[0])) + new_id = self.new_token(token_value) - if len("".join(neg_lookup.values())) > len(all_items): - lament('preventing dict from growing larger than source') + if len("".join(self.tokens.values())) > len(all_items): + # this might not ever occur + lament('preventing token dictionary from growing larger than source') break - # replace our most common sequence in the sequences + # replace the most common two-token sequence + # with one token to represent both found = np.all(sequences == most_common[0], axis=1) before = np.roll(found, -1) after = np.roll(found, 1) # don't wrap around truth values before[-1] = False after[0] = False - # or remove padding - #before[0] = False - #after[-1] = False # remove the "found" sequences + # and update the previous/next, + # not unlike a doubly-linked list. 
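+            # e.g. merging the pair (a, b) into a new token t:
+            # the preceding row (x, a) becomes (x, t), the following row
+            # (b, y) becomes (t, y), and the (a, b) rows themselves are
+            # deleted afterwards.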
befores = sequences[before].T.copy() befores[1] = new_id sequences[before] = befores.T afters = sequences[after].T.copy() afters[0] = new_id sequences[after] = afters.T - #sequences[found] = [empty, empty] here = np.where(found) sequences = np.delete(sequences, here, axis=0) - print("({:8}) new token: {:5} \"{}\"".format(len(here[0]), new_id, neg_lookup[new_id])) + print("new token id {:5} occurs {:8} times: \"{}\"".format(new_id, len(here[0]), self.tokens[new_id])) + + # TODO: find unused tokens + + # reconstruct all_items out of the sequences + all_items = sequences.reshape(-1)[::2][1:].copy() + return all_items + + + def learn_all(self, items, merges=0): + min_count = 2 # minimum number of occurences to stop creating tokens at + if merges < 0: + min_count = -merges + merges = 65536 # arbitrary sanity value + + # we'll use numpy matrices so this isn't nearly as disgustingly slow + + self.tokens = {-1: ''} # default with an empty padding token + + # we need to assert that the number of sequences is a multiple of this + # otherwise we can't .reshape() it to be two-dimensional later on + alignment = 2 + align = lambda x: (x + alignment // 2) // alignment * alignment + + new_items = [] + for item in items: + item = item.strip('\n') + # assert at least 1 padding character at the end + next_biggest = align(len(item) + 1) + # initialize with padding (-1) + new_item = -np.ones(next_biggest, dtype=np.int32) + for i, c in enumerate(item): + new_item[i] = ord(c) + new_items.append(new_item) + + # add an extra padding item to the head and tail + # to make it easier to convert from sequences back to items later on + pad = -np.ones(1, dtype=np.int32) + new_items.insert(0, pad) + new_items.append(pad) + + all_items = np.concatenate(new_items) if merges > 0: - # reconstruct all_items out of the sequences - all_items = sequences.reshape(-1)[::2][1:].copy() + all_items = self.merge_all(all_items, merges, min_count) + # begin the actual learning self.padding = '~' self.reset() np_item = [] for i in all_items: - #for np_item in np.split(all_items, np.where(all_items == -1)): if i == -1: if len(np_item) == 0: continue @@ -167,32 +183,40 @@ class PatternBrain(Brain): for i in np_item: if i < 0: assert(i != -1) - item += self.helper(neg_lookup[i]) + item += self.helper(self.tokens[i]) else: item += self.helper(chr(i)) #die(np_item, item) self.learn(item) np_item = [] - elif i != empty: + else: np_item.append(i) self.update() def run(pname, args, env): if not 1 <= len(args) <= 2: - lament("usage: {} {{input file}} [state_fn file]".format(sys.argv[0])) - sys.exit(1) + lament("usage: {} {{input file}} [savestate file]".format(pname)) + return 1 - args = dict(enumerate(args)) # for .get() + args = dict(enumerate(args)) # just for the .get() method fn = args[0] state_fn = args.get(1, None) + # the number of lines to output. count = int(env.get('COUNT', '8')) - order = int(env.get('ORDER', '3')) - temperature = float(env.get('TEMPERATURE', '0')) - maxn = int(env.get('MAXN', '1000')) + # learn and sample using this number of sequential tokens. + order = int(env.get('ORDER', '2')) + # how experimental to be with sampling. + # probably doesn't work properly. + temperature = float(env.get('TEMPERATURE', '0.5')) + # the max character length of output. (not guaranteed) + maxn = int(env.get('MAXN', '240')) + # attempts to maximize scoring attempts = int(env.get('ATTEMPTS', '-1')) + # if positive, maximum number of tokens to merge. + # if negative, minimum number of occurences to stop at. 
merges = int(env.get('MERGES', '0')) if attempts <= 0: @@ -201,11 +225,12 @@ def run(pname, args, env): brain = PatternBrain(order=order, temperature=temperature) tool = ATTTT(brain) - lament('# loading') if state_fn: + lament('# loading') try: brain.load(state_fn, raw=False) except FileNotFoundError: + lament('# no file to load. skipping') pass if brain and brain.new: @@ -214,6 +239,7 @@ def run(pname, args, env): brain.learn_all(lines, merges) if brain and brain.new and state_fn: + lament('# saving') brain.save(state_fn, raw=False) lament('# replying') @@ -222,6 +248,8 @@ def run(pname, args, env): #print('{:6.1f}\t{}'.format(reply[1], reply[0])) print(tool.reply(maxn=maxn, attempts=attempts)) + return 0 + if __name__ == '__main__': import sys diff --git a/basic.py b/basic.py index a19e174..417a5ab 100755 --- a/basic.py +++ b/basic.py @@ -24,12 +24,10 @@ def normalize_sorted(counter): # http://nbviewer.jupyter.org/gist/yoavg/d76121dfde2618422139 class Brain: - # TODO: don't default padding here, but make sure it's set before running - # the reason is it's the only place that's specific to a string anymore - def __init__(self, order=1, temperature=0.5, padding="~"): + def __init__(self, order=1, temperature=0.5): self.order = order - self.padding = padding self.temperature = temperature + self.padding = None self.reset() @@ -77,6 +75,8 @@ class Brain: def learn(self, item): + assert(self.padding) + if self.type is None and item is not None: self.type = type(item) if type(item) is not self.type: @@ -123,11 +123,14 @@ class Brain: return c + # for overriding in subclasses + # in case the input tokens aren't strings (e.g. tuples) def helper(self, v): return v def reply(self, item=None, maxn=1000): + assert(self.padding) self.update() history = self.helper(self.padding) * self.order diff --git a/misc.py b/misc.py index e2cd2d7..5fcff9d 100755 --- a/misc.py +++ b/misc.py @@ -3,6 +3,7 @@ lament = lambda *args, **kwargs: print(*args, file=sys.stderr, **kwargs) def die(*args, **kwargs): + # just for ad-hoc debugging really lament(*args, **kwargs) sys.exit(1) From e61a32c615e78457532766a5d4a2116e04568876 Mon Sep 17 00:00:00 2001 From: Connor Olding Date: Wed, 25 May 2016 11:30:07 -0700 Subject: [PATCH 4/5] . 
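Refactor atttt.py and trim misc.py:

* hoist the alignment arithmetic out of learn_all() into a module-level
  align(x, alignment)
* extract prepare_items() from learn_all()
* let resolve_tokens() accept a single (plain or numpy) integer id as
  well as a sequence of ids
* add stat_tokens() to report how often each token occurs, most common
  first, and call it from learn_all() after merging
* report merge progress through lament() so replies remain the only
  stdout output
* drop easytruncnorm() from misc.py (its only caller, the temperature
  setter in basic.py, is rewritten in the next patch)

The ranking inside stat_tokens() reduces to a common numpy idiom.
A standalone sketch with illustrative values (not code from this patch):

    import numpy as np

    ids = np.array([5, -2, 5, 7, -2, 5])  # token ids; negatives are merged tokens
    unique, counts = np.unique(ids, return_counts=True)
    order = np.argsort(counts)[::-1]      # indices of the most frequent first
    for token_id, count in zip(unique[order], counts[order]):
        print(token_id, count)            # 5 3, then -2 2, then 7 1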
--- atttt.py | 83 ++++++++++++++++++++++++++++++++++++++------------------ misc.py | 8 ------ 2 files changed, 56 insertions(+), 35 deletions(-) diff --git a/atttt.py b/atttt.py index 0763bcc..f9f0af7 100755 --- a/atttt.py +++ b/atttt.py @@ -7,6 +7,10 @@ from misc import * from basic import Brain +def align(x, alignment): + return (x + alignment // 2) // alignment * alignment + + def uniq_rows(a, return_index=False, return_inverse=False, return_counts=False): # via http://stackoverflow.com/a/16973510 # black magic wrapper around np.unique @@ -66,7 +70,10 @@ class PatternBrain(Brain): def resolve_tokens(self, tokens): # positive values are just unicode characters - return [o < 0 and self.tokens[o] or chr(o) for o in tokens] + if isinstance(tokens, int) or isinstance(tokens, np.int32): + return tokens < 0 and self.tokens[tokens] or chr(tokens) + else: + return [o < 0 and self.tokens[o] or chr(o) for o in tokens] def new_token(self, value): @@ -75,6 +82,46 @@ class PatternBrain(Brain): return new_id + @staticmethod + def prepare_items(items, pad=True): + new_items = [] + for item in items: + item = item.strip('\n') + # assert that the number of sequences is a multiple of 2 + # otherwise we can't .reshape() it to be two-dimensional later on + next_biggest = align(len(item) + 1, 2) + # initialize with padding (-1) + new_item = -np.ones(next_biggest, dtype=np.int32) + for i, c in enumerate(item): + new_item[i] = ord(c) + new_items.append(new_item) + + # add an extra padding item to the head and tail + # to make it easier to convert from sequences back to items later on + if pad: + pad = -np.ones(1, dtype=np.int32) + new_items.insert(0, pad) + new_items.append(pad) + + return np.concatenate(new_items) + + + def stat_tokens(self, all_items, skip_normal=False): + unique, counts = np.unique(all_items, return_counts=True) + count_order = np.argsort(counts)[::-1] + counts_descending = counts[count_order] + unique_descending = unique[count_order] + for i, token_id in enumerate(unique_descending): + if token_id == -1: + continue + if skip_normal and token_id >= 0: + continue + token = self.resolve_tokens(token_id) + lament("token id {:5} occurs {:8} times: \"{}\"".format( + token_id, counts_descending[i], token)) + lament("total tokens: {:5}".format(i + 1)) + + def merge_all(self, all_items, merges, min_count=2): # set up a 2d array to step through at half the row length; # this means double redundancy; to acquire all the sequences. @@ -125,16 +172,17 @@ class PatternBrain(Brain): here = np.where(found) sequences = np.delete(sequences, here, axis=0) - print("new token id {:5} occurs {:8} times: \"{}\"".format(new_id, len(here[0]), self.tokens[new_id])) + lament("new token id {:5} occurs {:8} times: \"{}\"".format( + new_id, len(here[0]), self.tokens[new_id])) - # TODO: find unused tokens + # TODO: find unused tokens? 
# reconstruct all_items out of the sequences all_items = sequences.reshape(-1)[::2][1:].copy() return all_items - def learn_all(self, items, merges=0): + def learn_all(self, items, merges=0, stat=True): min_count = 2 # minimum number of occurences to stop creating tokens at if merges < 0: min_count = -merges @@ -144,29 +192,7 @@ class PatternBrain(Brain): self.tokens = {-1: ''} # default with an empty padding token - # we need to assert that the number of sequences is a multiple of this - # otherwise we can't .reshape() it to be two-dimensional later on - alignment = 2 - align = lambda x: (x + alignment // 2) // alignment * alignment - - new_items = [] - for item in items: - item = item.strip('\n') - # assert at least 1 padding character at the end - next_biggest = align(len(item) + 1) - # initialize with padding (-1) - new_item = -np.ones(next_biggest, dtype=np.int32) - for i, c in enumerate(item): - new_item[i] = ord(c) - new_items.append(new_item) - - # add an extra padding item to the head and tail - # to make it easier to convert from sequences back to items later on - pad = -np.ones(1, dtype=np.int32) - new_items.insert(0, pad) - new_items.append(pad) - - all_items = np.concatenate(new_items) + all_items = self.prepare_items(items) if merges > 0: all_items = self.merge_all(all_items, merges, min_count) @@ -193,6 +219,9 @@ class PatternBrain(Brain): np_item.append(i) self.update() + if merges != 0 and stat: + self.stat_tokens(all_items) + def run(pname, args, env): if not 1 <= len(args) <= 2: diff --git a/misc.py b/misc.py index 5fcff9d..36da0d4 100755 --- a/misc.py +++ b/misc.py @@ -8,12 +8,4 @@ def die(*args, **kwargs): sys.exit(1) -def easytruncnorm(lower=0, upper=1, loc=0.5, scale=0.25): - import scipy.stats as stats - a = (lower - loc) / scale - b = (upper - loc) / scale - return stats.truncnorm(a=a, b=b, loc=loc, scale=scale) - - -# only make some things visible to "from misc import *" __all__ = [o for o in locals() if type(o) != 'module' and not o.startswith('_')] From b028ee53d9ee494579bbb1ba9b1cc8e29b911721 Mon Sep 17 00:00:00 2001 From: Connor Olding Date: Wed, 25 May 2016 11:37:34 -0700 Subject: [PATCH 5/5] . --- atttt.py | 10 +--------- basic.py | 40 +++++++++++++++------------------------- 2 files changed, 16 insertions(+), 34 deletions(-) diff --git a/atttt.py b/atttt.py index f9f0af7..02f662c 100755 --- a/atttt.py +++ b/atttt.py @@ -60,7 +60,7 @@ class ATTTT(): class PatternBrain(Brain): def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + super().__init__(*args, padding='~', **kwargs) self.tokens = [] @@ -147,11 +147,6 @@ class PatternBrain(Brain): token_value = "".join(self.resolve_tokens(most_common[0])) new_id = self.new_token(token_value) - if len("".join(self.tokens.values())) > len(all_items): - # this might not ever occur - lament('preventing token dictionary from growing larger than source') - break - # replace the most common two-token sequence # with one token to represent both found = np.all(sequences == most_common[0], axis=1) @@ -175,8 +170,6 @@ class PatternBrain(Brain): lament("new token id {:5} occurs {:8} times: \"{}\"".format( new_id, len(here[0]), self.tokens[new_id])) - # TODO: find unused tokens? 
- # reconstruct all_items out of the sequences all_items = sequences.reshape(-1)[::2][1:].copy() return all_items @@ -198,7 +191,6 @@ class PatternBrain(Brain): all_items = self.merge_all(all_items, merges, min_count) # begin the actual learning - self.padding = '~' self.reset() np_item = [] for i in all_items: diff --git a/basic.py b/basic.py index 417a5ab..0dc1416 100755 --- a/basic.py +++ b/basic.py @@ -9,25 +9,24 @@ def normalize(counter): s = float(sum(v)) m = float(max(v)) del v - d = {} - for c, cnt in counter.items(): - d[c] = (cnt/s, cnt/m) - return d -# return [(c, cnt/s, cnt/m) for c, cnt in counter.items()] + return [(c, cnt/s, cnt/m) for c, cnt in counter.items()] def normalize_sorted(counter): - # mostly just for debugging i guess? + # if the elements were unsorted, + # we couldn't use our lazy method (subtraction) of selecting tokens + # and temperature would correspond to arbitrary tokens + # instead of more/less common tokens. return sorted(normalize(counter), key=lambda t: t[1], reverse=True) # http://nbviewer.jupyter.org/gist/yoavg/d76121dfde2618422139 class Brain: - def __init__(self, order=1, temperature=0.5): + def __init__(self, padding, order=1, temperature=0.5): self.order = order self.temperature = temperature - self.padding = None + self.padding = padding self.reset() @@ -51,21 +50,13 @@ class Brain: @temperature.setter def temperature(self, value): + assert(0 < value < 1) self._temperature = value - if value == 1: - # TODO: proper distribution stuff - self.random = lambda count: np.random.random(count)**2 - elif value == 0: - self.random = np.random.random - else: - # +0.25 = -0.0 - # +0.50 = +0.5 - # +0.75 = +1.0 - point75 = 1 - const = (point75 * 2 - 1) / math.atanh(0.75 * 2 - 1) - unbound = (math.atanh((1 - value) * 2 - 1) * const + 1) / 2 - self.random = easytruncnorm(0, 1, unbound, 0.25).rvs + a = 1 - value * 2 + # http://www.mathopenref.com/graphfunctions.html?fx=(a*x-x)/(2*a*x-a-1)&sg=f&sh=f&xh=1&xl=0&yh=1&yl=0&ah=1&al=-1&a=0.5 + tweak = lambda x: (a * x - x) / (2 * a * x - a - 1) + self.random = lambda n: 1 - tweak(np.random.random(n)) def learn_all(self, items): @@ -102,7 +93,7 @@ class Brain: def update(self): if self.dirty and self._machine: - self.machine = {hist:normalize(items) + self.machine = {hist: normalize_sorted(items) for hist, items in self._machine.items()} self.dirty = False @@ -116,9 +107,8 @@ class Brain: return None x = self.random(1) - for c, v in dist.items(): - # if x <= v: # this is a bad idea - x = x - v[0] + for c, cs, cm in dist: + x = x - cs if x <= 0: return c
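
Notes on the series:

* The temperature curve introduced in PATCH 5/5 maps a uniform sample
  through tweak(x) = (a*x - x) / (2*a*x - a - 1) with a = 1 - 2*value.
  This is algebraically Schlick's bias function,
  bias_t(x) = x / ((1/t - 2)*(1 - x) + 1), with t = value: value = 0.5
  leaves the sample untouched, values toward 1 bias picks toward each
  history's most common tokens, and values toward 0 bias them toward
  rarer ones. A standalone sketch checking the equivalence (not code
  from the patches):

      import numpy as np

      def tweak(x, value):
          # the curve from the temperature setter in basic.py
          a = 1 - value * 2
          return (a * x - x) / (2 * a * x - a - 1)

      def schlick_bias(x, t):
          # Schlick, "Fast Alternatives to Perlin's Bias and Gain
          # Functions", Graphics Gems IV (1994)
          return x / ((1 / t - 2) * (1 - x) + 1)

      x = np.linspace(0, 1, 101)
      for value in (0.1, 0.25, 0.5, 0.75, 0.9):
          assert np.allclose(tweak(x, value), schlick_bias(x, value))
      print("tweak matches schlick_bias for all tested temperatures")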
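* uniq_rows() (PATCH 2/5) works around np.unique flattening
  multidimensional input by viewing each row as one opaque np.void
  scalar (the stackoverflow answer cited in PATCH 3/5); numpy 1.13+
  can do np.unique(a, axis=0) directly. Note also that the early
  branch (no return_* flags set) computes its result but never returns
  it; the series always calls uniq_rows(..., return_counts=True), so
  that bug stays dormant. A minimal demonstration of the view trick:

      import numpy as np

      a = np.array([[1, 2], [3, 4], [1, 2]], dtype=np.int32)
      # one opaque scalar spanning each full row
      void = np.dtype((np.void, a.dtype.itemsize * a.shape[1]))
      rows = np.ascontiguousarray(a).view(void)
      unique, counts = np.unique(rows, return_counts=True)
      unique = unique.view(a.dtype).reshape(-1, a.shape[1])
      print(unique)  # [[1 2]
                     #  [3 4]]
      print(counts)  # [2 1]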