import sys
import types


def lament(*args, **kwargs):
    print(*args, file=sys.stderr, **kwargs)


def lower_priority():
    """Set the priority of the process to below-normal."""
    # via https://stackoverflow.com/a/1023269
    if sys.platform == 'win32':
        try:
            import win32api, win32process, win32con
            pid = win32api.GetCurrentProcessId()
            handle = win32api.OpenProcess(win32con.PROCESS_ALL_ACCESS,
                                          True, pid)
            win32process.SetPriorityClass(
                handle, win32process.BELOW_NORMAL_PRIORITY_CLASS)
        except ImportError:
            lament("you do not have pywin32 installed.")
            lament("the process priority could not be lowered.")
            lament("consider: python -m pip install pypiwin32")
            lament("consider: conda install pywin32")
    else:
        import os
        os.nice(1)


import numpy as np
_f = np.float32

# just for speed, not strictly essential:
from scipy.special import expit as sigmoid

# used for numbering layers like Keras, and keeping initialization consistent:
from collections import defaultdict, OrderedDict
_layer_counters = defaultdict(lambda: 0)


def _check(a):
    assert isinstance(a, np.ndarray) or type(a) == _f, type(a)
    assert a.dtype == _f, a.dtype
    return a


_0 = _f(0)
_1 = _f(1)
_2 = _f(2)
_inv2 = _f(1/2)
_sqrt2 = _f(np.sqrt(2))
_invsqrt2 = _f(1/np.sqrt(2))
_pi = _f(np.pi)


class LayerIncompatibility(Exception):
    pass


# Node Traversal {{{1

class DummyNode:
    name = "Dummy"

    def __init__(self, children=None, parents=None):
        self.children = children if children is not None else []
        self.parents = parents if parents is not None else []


def traverse(node_in, node_out, nodes=None, dummy_mode=False):
    # i have no idea if this is any algorithm in particular.

    nodes = nodes if nodes is not None else []

    # first pass: mark every node that leads up to node_out.
    seen_up = {}
    q = [node_out]
    while len(q) > 0:
        node = q.pop(0)
        seen_up[node] = True
        for parent in node.parents:
            q.append(parent)

    if dummy_mode:
        seen_up[node_in] = True

    # second pass: walk down from node_in, appending nodes in dependency
    # order; a node is only added once all of its parents have been added.
    nodes = []
    q = [node_in]
    while len(q) > 0:
        node = q.pop(0)
        if not seen_up.get(node, False):  # skip nodes that don't lead to node_out.
            continue
        parents_added = (parent in nodes for parent in node.parents)
        if node not in nodes and all(parents_added):
            nodes.append(node)
        for child in node.children:
            q.append(child)

    if dummy_mode:
        nodes.remove(node_in)

    return nodes


def traverse_all(nodes_in, nodes_out, nodes=None):
    all_in = DummyNode(children=nodes_in)
    all_out = DummyNode(parents=nodes_out)
    return traverse(all_in, all_out, nodes, dummy_mode=True)


# Initializations {{{1

# note: these are currently only implemented for 2D shapes.

def init_zeros(size, ins=None, outs=None):
    return np.zeros(size)


def init_ones(size, ins=None, outs=None):
    return np.ones(size)


def init_he_normal(size, ins, outs):
    s = np.sqrt(2 / ins)
    return np.random.normal(0, s, size=size)


def init_he_uniform(size, ins, outs):
    s = np.sqrt(6 / ins)
    return np.random.uniform(-s, s, size=size)


def init_glorot_normal(size, ins, outs):
    s = np.sqrt(2 / (ins + outs))
    return np.random.normal(0, s, size=size)


def init_glorot_uniform(size, ins, outs):
    s = np.sqrt(6 / (ins + outs))
    return np.random.uniform(-s, s, size=size)
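

# a quick sanity-check sketch of the initializer contract (illustrative, not
# part of the library): each init_* takes (size, ins, outs) and returns a
# flat array of `size` values; the caller reshapes it as needed. e.g.:
#   w = init_glorot_uniform(3 * 4, 3, 4).reshape(3, 4)
#   assert np.all(np.abs(w) <= np.sqrt(6 / (3 + 4)))
# Weights.allocate below passes (ins, outs) through to init as *args.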


# Weight container {{{1

class Weights:
    # we may or may not contain weights -- or any information, for that matter.

    def __init__(self, **kwargs):
        self.f = None  # forward weights
        self.g = None  # backward weights (gradients)
        self.shape = None
        self.init = None
        self.allocator = None
        self.regularizer = None
        self._allocated = False

        self.configure(**kwargs)

    def configure(self, **kwargs):
        for k, v in kwargs.items():
            getattr(self, k)  # ensures the key already exists
            setattr(self, k, v)

    @property
    def size(self):
        assert self.shape is not None
        return np.prod(self.shape)

    def allocate(self, *args, **kwargs):
        if self._allocated:
            raise Exception("attempted to allocate existing weights")
        self.configure(**kwargs)

        # intentionally not using isinstance
        assert type(self.shape) == tuple, self.shape

        f, g = self.allocator(self.size)
        assert len(f) == self.size, "{} != {}".format(f.shape, self.size)
        assert len(g) == self.size, "{} != {}".format(g.shape, self.size)

        f[:] = self.init(self.size, *args)
        g[:] = self.init(self.size, *args)

        self.f = f.reshape(self.shape)
        self.g = g.reshape(self.shape)

        self._allocated = True

    def forward(self):
        if self.regularizer is None:
            return 0.0
        return self.regularizer.forward(self.f)

    def backward(self):
        if self.regularizer is None:
            return 0.0
        return self.regularizer.backward(self.f)

    def update(self):
        if self.regularizer is None:
            return
        self.g += self.regularizer.backward(self.f)


# Loss functions {{{1

class Loss:
    pass


class NLL(Loss):  # Negative Log Likelihood
    def forward(self, p, y):
        correct = p * y
        return np.mean(-correct)

    def backward(self, p, y):
        return -y / len(p)


class CategoricalCrossentropy(Loss):
    # lifted from theano

    def __init__(self, eps=1e-6):
        self.eps = _f(eps)

    def forward(self, p, y):
        p = np.clip(p, self.eps, 1 - self.eps)
        f = np.sum(-y * np.log(p) - (1 - y) * np.log(1 - p), axis=-1)
        return np.mean(f)

    def backward(self, p, y):
        p = np.clip(p, self.eps, 1 - self.eps)
        df = (p - y) / (p * (1 - p))
        return df / len(y)
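

# gradient sanity-check sketch (illustrative, not part of the library):
# every Loss pairs forward() with a backward() that is its derivative.
# a finite-difference spot check, assuming one-hot y and p away from 0 and 1:
#   p = (np.random.rand(4, 3) * 0.8 + 0.1).astype(_f)
#   y = np.eye(3, dtype=_f)[np.random.randint(3, size=4)]
#   loss, eps, i = CategoricalCrossentropy(), _f(1e-3), (0, 0)
#   p2 = p.copy(); p2[i] += eps
#   numeric = (loss.forward(p2, y) - loss.forward(p, y)) / eps
#   assert np.isclose(loss.backward(p, y)[i], numeric, rtol=1e-2, atol=1e-2)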


class Accuracy(Loss):
    # returns the fraction of categories correctly predicted.
    # utilizes argmax(), so it cannot be used for gradient descent.
    # use CategoricalCrossentropy or NLL for that instead.

    def forward(self, p, y):
        correct = np.argmax(p, axis=-1) == np.argmax(y, axis=-1)
        return np.mean(correct)

    def backward(self, p, y):
        raise NotImplementedError("cannot take the gradient of Accuracy")


class ResidualLoss(Loss):
    def forward(self, p, y):
        return np.mean(self.f(p - y))

    def backward(self, p, y):
        ret = self.df(p - y) / len(y)
        return ret


class SquaredHalved(ResidualLoss):
    def f(self, r):
        return np.square(r) / 2

    def df(self, r):
        return r


class Squared(ResidualLoss):
    def f(self, r):
        return np.square(r)

    def df(self, r):
        return 2 * r


class Absolute(ResidualLoss):
    def f(self, r):
        return np.abs(r)

    def df(self, r):
        return np.sign(r)


class Huber(ResidualLoss):
    def __init__(self, delta=1.0):
        self.delta = _f(delta)

    def f(self, r):
        # note: the comparison must be against |r|,
        # or the loss loses its symmetry about zero.
        return np.where(np.abs(r) <= self.delta,
                        np.square(r) / 2,
                        self.delta * (np.abs(r) - self.delta / 2))

    def df(self, r):
        return np.where(np.abs(r) <= self.delta,
                        r,
                        self.delta * np.sign(r))


# Regularizers {{{1

class Regularizer:
    pass


class L1L2(Regularizer):
    def __init__(self, l1=0.0, l2=0.0):
        self.l1 = _f(l1)
        self.l2 = _f(l2)

    def forward(self, X):
        f = _0
        if self.l1:
            f += np.sum(self.l1 * np.abs(X))
        if self.l2:
            f += np.sum(self.l2 * np.square(X))
        return f

    def backward(self, X):
        df = np.zeros_like(X)
        if self.l1:
            df += self.l1 * np.sign(X)
        if self.l2:
            df += self.l2 * 2 * X
        return df


# Optimizers {{{1

class Optimizer:
    def __init__(self, lr=0.1):
        self.lr = _f(lr)  # learning rate
        self.reset()

    def reset(self):
        pass

    def compute(self, dW, W):
        return -self.lr * dW

    def update(self, dW, W):
        W += self.compute(dW, W)


# some of the following optimizers are blatantly lifted from tiny-dnn:
# https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/optimizers/optimizer.h

class Momentum(Optimizer):
    def __init__(self, lr=0.01, mu=0.9, nesterov=False):
        self.mu = _f(mu)  # momentum
        self.nesterov = bool(nesterov)

        super().__init__(lr)

    def reset(self):
        self.Vprev = None

    def compute(self, dW, W):
        if self.Vprev is None:
            self.Vprev = np.copy(dW)

        V = self.mu * self.Vprev - self.lr * dW
        self.Vprev[:] = V
        if self.nesterov:
            return self.mu * V - self.lr * dW

        return V


class Adagrad(Optimizer):
    def __init__(self, lr=0.01, eps=1e-8):
        self.eps = _f(eps)

        super().__init__(lr)

    def reset(self):
        self.g = None

    def compute(self, dW, W):
        if self.g is None:
            self.g = np.zeros_like(dW)

        self.g += np.square(dW)
        return -self.lr * dW / (np.sqrt(self.g) + self.eps)


class RMSprop(Optimizer):
    # RMSprop generalizes* Adagrad, etc.

    # * RMSprop == Adagrad when
    #   RMSprop.mu == 1

    def __init__(self, lr=1e-4, mu=0.99, eps=1e-8):
        self.mu = _f(mu)  # decay term
        self.eps = _f(eps)

        # one might consider the following equation when specifying mu:
        # mu = e**(-1/t)
        # default: t = -1/ln(0.99) = ~99.5
        # therefore the default of mu=0.99 means
        # an input decays to 1/e of its original amplitude over 99.5 batches.
        # (this is from DSP, so how relevant it is in SGD is debatable)

        super().__init__(lr)

    def reset(self):
        self.g = None

    def compute(self, dW, W):
        if self.g is None:
            self.g = np.zeros_like(dW)

        # basically apply a first-order low-pass filter to delta squared,
        self.g += (1 - self.mu) * (np.square(dW) - self.g)
        # finally sqrt it to complete the running root-mean-square approximation.
        return -self.lr * dW / (np.sqrt(self.g) + self.eps)
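

# optimizer contract sketch (illustrative, not part of the library):
# compute() maps a gradient to a weight delta; update() applies it in-place.
# with the plain-SGD base class:
#   opt = Optimizer(lr=0.1)
#   W = np.ones(3, dtype=_f)
#   dW = np.full(3, 0.5, dtype=_f)
#   opt.update(dW, W)  # W is now 0.95 everywhere: W += -lr * dW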


class RMSpropCentered(Optimizer):
    # referenced TensorFlow/PyTorch.
    # paper: https://arxiv.org/pdf/1308.0850v5.pdf

    def __init__(self, lr=1e-4, aleph=0.95, momentum=0.9, eps=1e-8):
        self.aleph = _f(aleph)
        self.momentum = _f(momentum)
        self.eps = _f(eps)

        super().__init__(lr)

    def reset(self):
        self.g = None
        self.mt = None
        self.vt = None
        self.delta = None

    def compute(self, dW, W):
        if self.g is None:
            self.g = np.zeros_like(dW)
        if self.mt is None:
            self.mt = np.zeros_like(dW)
        if self.vt is None:
            self.vt = np.zeros_like(dW)
        if self.delta is None:
            self.delta = np.zeros_like(dW)

        self.mt += (1 - self.aleph) * (dW - self.mt)
        self.vt += (1 - self.aleph) * (np.square(dW) - self.vt)

        # PyTorch has the epsilon outside of the sqrt,
        # TensorFlow and the paper have it within.
        # in onn, we generally do it outside, as this seems to work better.
        temp = dW / (np.sqrt(self.vt - np.square(self.mt)) + self.eps)

        # TensorFlow does it this way.
        self.delta[:] = self.momentum * self.delta + self.lr * temp
        return -self.delta

        # PyTorch does it this way.
        #self.delta[:] = self.momentum * self.delta + temp
        #return -self.lr * self.delta

        # they are equivalent only when LR is constant, which it might not be.


class Adam(Optimizer):
    # paper: https://arxiv.org/abs/1412.6980
    # Adam generalizes* RMSprop, and
    # adds a decay term to the regular (non-squared) delta, and performs
    # debiasing to compensate for the filtered deltas starting from zero.

    # * Adam == RMSprop when
    #   Adam.b1 == 0
    #   Adam.b2 == RMSprop.mu

    def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8):
        self.b1 = _f(b1)  # decay term
        self.b2 = _f(b2)  # decay term
        self.b1_t_default = _f(b1)  # decay term power t
        self.b2_t_default = _f(b2)  # decay term power t
        self.eps = _f(eps)

        super().__init__(lr)

    def reset(self):
        self.mt = None
        self.vt = None
        self.b1_t = self.b1_t_default
        self.b2_t = self.b2_t_default

    def compute(self, dW, W):
        if self.mt is None:
            self.mt = np.zeros_like(dW)
        if self.vt is None:
            self.vt = np.zeros_like(dW)

        # decay gain
        self.b1_t *= self.b1
        self.b2_t *= self.b2

        # filter
        self.mt += (1 - self.b1) * (dW - self.mt)
        self.vt += (1 - self.b2) * (np.square(dW) - self.vt)

        return -self.lr * (self.mt / (1 - self.b1_t)) \
            / (np.sqrt(self.vt / (1 - self.b2_t)) + self.eps)
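

# debiasing note (illustrative): mt and vt start at zero, so early estimates
# of the running averages are biased toward zero; after one step,
# mt == (1 - b1) * dW. dividing by (1 - b1_t), where b1_t decays
# geometrically toward zero, rescales the estimates back up, and the
# correction fades away as training proceeds.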


class Nadam(Optimizer):
    # paper: https://arxiv.org/abs/1412.6980
    # paper: http://cs229.stanford.edu/proj2015/054_report.pdf
    # TODO: double-check this implementation. also read the damn paper.
    # lifted from https://github.com/fchollet/keras/blob/5d38b04/keras/optimizers.py#L530
    # lifted from https://github.com/jpilaul/IFT6266_project/blob/master/Models/Algo_Momentum.py

    def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8):
        self.b1 = _f(b1)  # decay term
        self.b2 = _f(b2)  # decay term
        self.eps = _f(eps)

        super().__init__(lr)

    def reset(self):
        self.mt = None
        self.vt = None
        self.t = 0
        self.sched = 1

    def compute(self, dW, W):
        self.t += 1

        if self.mt is None:
            self.mt = np.zeros_like(dW)
        if self.vt is None:
            self.vt = np.zeros_like(dW)

        # momentum schedule: the effective momentum at steps t and t + 1.
        ut0 = self.b1 * (1 - 0.5 * 0.96**(self.t + 0))
        ut1 = self.b1 * (1 - 0.5 * 0.96**(self.t + 1))

        sched0 = self.sched * ut0
        sched1 = self.sched * ut0 * ut1
        self.sched = sched0

        gp = dW / (1 - sched0)

        self.mt += (1 - self.b1) * (dW - self.mt)
        self.vt += (1 - self.b2) * (np.square(dW) - self.vt)

        mtp = self.mt / (1 - sched1)
        vtp = self.vt / (1 - self.b2**self.t)

        mt_bar = (1 - ut0) * gp + ut1 * mtp

        return -self.lr * mt_bar / (np.sqrt(vtp) + self.eps)
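

# note (illustrative): the stateful optimizers above allocate their running
# buffers lazily, shaped like the first gradient they see, e.g.
#   opt = Adam()
#   opt.compute(np.zeros(10, dtype=_f), None)  # allocates mt and vt
# so one instance cannot be shared between models with different parameter
# counts without calling reset() in between.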


# Abstract Layers {{{1

class Layer:
    def __init__(self):
        self.parents = []
        self.children = []
        self.weights = OrderedDict()
        self.loss = None  # for activity regularizers
        self.input_shape = None
        self.output_shape = None
        kind = self.__class__.__name__
        global _layer_counters
        _layer_counters[kind] += 1
        self.name = "{}_{}".format(kind, _layer_counters[kind])
        self.unsafe = False  # disables assertions for better performance
        self.shared = False  # as in weight sharing

    def __str__(self):
        return self.name

    # methods we might want to override:

    def forward(self, X):
        raise NotImplementedError("unimplemented", self)

    def forward_deterministic(self, X):
        return self.forward(X)

    def backward(self, dY):
        raise NotImplementedError("unimplemented", self)

    def make_shape(self, parent):
        if self.input_shape is None:
            self.input_shape = parent.output_shape
        if self.output_shape is None:
            self.output_shape = self.input_shape

    def do_feed(self, child):
        self.children.append(child)

    def be_fed(self, parent):
        self.parents.append(parent)

    # TODO: better names for these (still)

    def _propagate(self, edges, deterministic):
        if not self.unsafe:
            assert len(edges) == 1, self
        if deterministic:
            return self.forward_deterministic(edges[0])
        else:
            return self.forward(edges[0])

    def _backpropagate(self, edges):
        if len(edges) == 1:
            return self.backward(edges[0])
        return sum((self.backward(dY) for dY in edges))

    # general utility methods:

    def is_compatible(self, parent):
        return np.all(self.input_shape == parent.output_shape)

    def feed(self, child):
        assert self.output_shape is not None, self
        child.make_shape(self)
        if not child.is_compatible(self):
            fmt = "{} is incompatible with {}: shape mismatch: {} vs. {}"
            raise LayerIncompatibility(fmt.format(
                self, child, self.output_shape, child.input_shape))
        self.do_feed(child)
        child.be_fed(self)
        return child

    def validate_input(self, X):
        assert X.shape[1:] == self.input_shape, \
            (str(self), X.shape[1:], self.input_shape)

    def validate_output(self, Y):
        assert Y.shape[1:] == self.output_shape, \
            (str(self), Y.shape[1:], self.output_shape)

    def _new_weights(self, name, **kwargs):
        w = Weights(**kwargs)
        assert name not in self.weights, name
        self.weights[name] = w
        return w

    def share(self, node):
        self.weights = node.weights  # TODO: this should be all it takes.
        for k, v in self.weights.items():
            vs = getattr(node, k)  # hack: key isn't necessarily attribute name!
            setattr(self, k, vs)
        self.shared = True

    def clear_grad(self):
        for name, w in self.weights.items():
            w.g[:] = 0

    @property
    def size(self):
        return sum((w.size for w in self.weights.values()))

    def init(self, allocator):
        ins, outs = self.input_shape[0], self.output_shape[0]
        for k, w in self.weights.items():
            w.allocate(ins, outs, allocator=allocator)

    def propagate(self, values, deterministic):
        if not self.unsafe:
            assert self.parents, self
        edges = []
        for parent in self.parents:
            if parent in values:
                X = values[parent]
                if not self.unsafe:
                    self.validate_input(X)
                edges.append(X)
        Y = self._propagate(edges, deterministic)
        if not self.unsafe:
            self.validate_output(Y)
        return Y

    def backpropagate(self, values):
        if not self.unsafe:
            assert self.children, self
        edges = []
        for child in self.children:
            if child in values:
                dY = values[child]
                if not self.unsafe:
                    self.validate_output(dY)
                edges.append(dY)
        dX = self._backpropagate(edges)
        if not self.unsafe:
            self.validate_input(dX)
        return dX


# Nonparametric Layers {{{1

class Input(Layer):
    def __init__(self, shape):
        assert shape is not None
        super().__init__()
        self.shape = tuple(shape)
        self.input_shape = self.shape
        self.output_shape = self.shape

    def forward(self, X):
        return X

    def backward(self, dY):
        #self.dY = dY
        return np.zeros_like(dY)


class Reshape(Layer):
    def __init__(self, new_shape):
        super().__init__()
        self.shape = tuple(new_shape)
        self.output_shape = self.shape

    def forward(self, X):
        self.batch_size = X.shape[0]
        return X.reshape(self.batch_size, *self.output_shape)

    def backward(self, dY):
        assert dY.shape[0] == self.batch_size
        return dY.reshape(self.batch_size, *self.input_shape)


class Flatten(Layer):
    def make_shape(self, parent):
        shape = parent.output_shape
        self.input_shape = shape
        self.output_shape = (np.prod(shape),)

    def forward(self, X):
        self.batch_size = X.shape[0]
        return X.reshape(self.batch_size, *self.output_shape)

    def backward(self, dY):
        assert dY.shape[0] == self.batch_size
        return dY.reshape(self.batch_size, *self.input_shape)


class ConstAffine(Layer):
    def __init__(self, a=1, b=0):
        super().__init__()
        self.a = _f(a)
        self.b = _f(b)

    def forward(self, X):
        return self.a * X + self.b

    def backward(self, dY):
        return dY * self.a


class Sum(Layer):
    def _propagate(self, edges, deterministic):
        return np.sum(edges, axis=0)

    def _backpropagate(self, edges):
        #assert len(edges) == 1, "unimplemented"
        return edges[0]  # TODO: does this always work?


class ActivityRegularizer(Layer):
    def __init__(self, reg):
        super().__init__()
        assert isinstance(reg, Regularizer), reg
        self.reg = reg

    def forward(self, X):
        self.X = X
        self.loss = np.sum(self.reg.forward(X))
        return X

    def backward(self, dY):
        return dY + self.reg.backward(self.X)


class Dropout(Layer):
    def __init__(self, dropout=0.0):
        super().__init__()
        self.p = _f(1 - dropout)
        assert 0 <= self.p <= 1

    def forward(self, X):
        # inverted dropout: scale by 1/p now so nothing changes at test time.
        self.mask = (np.random.rand(*X.shape) < self.p) / self.p
        return X * self.mask

    def forward_deterministic(self, X):
        #self.mask = _1
        return X

    def backward(self, dY):
        return dY * self.mask


# Activation Layers {{{2

class Identity(Layer):
    def forward(self, X):
        return X

    def backward(self, dY):
        return dY


class Sigmoid(Layer):  # aka Logistic, Expit (inverse of Logit)
    def forward(self, X):
        self.sig = sigmoid(X)
        return self.sig

    def backward(self, dY):
        return dY * self.sig * (1 - self.sig)
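

# derivative identity note (illustrative): d/dx sigmoid(x) equals
# sigmoid(x) * (1 - sigmoid(x)), so Sigmoid caches its forward output in
# self.sig and reuses it in backward() instead of recomputing expit. e.g.:
#   s = Sigmoid(); x = np.zeros((1, 1), dtype=_f)
#   s.forward(x)                  # sigmoid(0) == 0.5
#   s.backward(np.ones_like(x))   # 0.5 * (1 - 0.5) == 0.25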


class Softplus(Layer):  # integral of Sigmoid.
    def forward(self, X):
        self.X = X
        # note: np.exp can overflow to inf for large X; kept simple on purpose.
        return np.log(1 + np.exp(X))

    def backward(self, dY):
        return dY * sigmoid(self.X)


class Tanh(Layer):
    def forward(self, X):
        self.sig = np.tanh(X)
        return self.sig

    def backward(self, dY):
        return dY * (1 - self.sig * self.sig)


class LeCunTanh(Layer):
    # paper: http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf
    # paper: http://yann.lecun.com/exdb/publis/pdf/lecun-89.pdf
    # scaled such that f([-1, 1]) = [-1, 1].
    # helps preserve an input variance of 1.
    # second derivative peaks around an input of ±1.

    def forward(self, X):
        self.sig = np.tanh(2 / 3 * X)
        return 1.7159 * self.sig

    def backward(self, dY):
        return dY * (2 / 3 * 1.7159) * (1 - self.sig * self.sig)


class Relu(Layer):
    def forward(self, X):
        self.cond = X >= 0
        return np.where(self.cond, X, 0)

    def backward(self, dY):
        return np.where(self.cond, dY, 0)


class Elu(Layer):
    # paper: https://arxiv.org/abs/1511.07289

    def __init__(self, alpha=1):
        super().__init__()
        self.alpha = _f(alpha)

    def forward(self, X):
        self.cond = X >= 0
        self.neg = self.alpha * (np.exp(X) - 1)
        return np.where(self.cond, X, self.neg)

    def backward(self, dY):
        # the derivative of alpha * (exp(x) - 1) is alpha * exp(x),
        # i.e. neg + alpha.
        return dY * np.where(self.cond, 1, self.neg + self.alpha)


class GeluApprox(Layer):
    # paper: https://arxiv.org/abs/1606.08415
    #  plot: https://www.desmos.com/calculator/ydzgtccsld

    def forward(self, X):
        self.a = 1.704 * X
        self.sig = sigmoid(self.a)
        return X * self.sig

    def backward(self, dY):
        return dY * self.sig * (1 + self.a * (1 - self.sig))


class Softmax(Layer):
    def forward(self, X):
        # subtract the per-row max for numerical stability; it cancels out.
        alpha = np.max(X, axis=-1, keepdims=True)
        num = np.exp(X - alpha)
        den = np.sum(num, axis=-1, keepdims=True)
        self.sm = num / den
        return self.sm

    def backward(self, dY):
        return (dY - np.sum(dY * self.sm, axis=-1, keepdims=True)) * self.sm


class LogSoftmax(Softmax):
    def __init__(self, eps=1e-6):
        super().__init__()
        self.eps = _f(eps)

    def forward(self, X):
        return np.log(super().forward(X) + self.eps)

    def backward(self, dY):
        return dY - np.sum(dY, axis=-1, keepdims=True) * self.sm


class Cos(Layer):
    # performs well on MNIST for some strange reason.

    def forward(self, X):
        self.X = X
        return np.cos(X)

    def backward(self, dY):
        return dY * -np.sin(self.X)
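

# activation sanity-check sketch (illustrative, not part of the library):
# each activation's backward() should match a finite difference of forward():
#   layer = GeluApprox()
#   X = np.random.randn(2, 5).astype(_f); eps = _f(1e-3)
#   Y0 = layer.forward(X)
#   g = layer.backward(np.ones_like(X))  # uses state from the forward pass
#   Y1 = layer.forward(X + eps)
#   assert np.allclose(g, (Y1 - Y0) / eps, atol=1e-2)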


# Parametric Layers {{{1

class Bias(Layer):
    # TODO: support axes other than -1 and shapes other than 1D.

    serialized = {
        'b': 'biases',
    }

    def __init__(self, init=init_zeros, reg_b=None):
        super().__init__()
        self.biases = self._new_weights('biases', init=init,
                                        regularizer=reg_b)

    def make_shape(self, parent):
        shape = parent.output_shape
        self.input_shape = shape
        self.output_shape = shape
        self.biases.shape = (shape[-1],)

    def forward(self, X):
        return X + self.biases.f

    def backward(self, dY):
        self.biases.g += dY.sum(0)
        return dY


class Dense(Layer):
    serialized = {
        'W': 'coeffs',
        'b': 'biases',
    }

    def __init__(self, dim, init=init_he_uniform, reg_w=None, reg_b=None):
        super().__init__()
        self.dim = int(dim)
        self.output_shape = (dim,)
        self.coeffs = self._new_weights('coeffs', init=init,
                                        regularizer=reg_w)
        self.biases = self._new_weights('biases', init=init_zeros,
                                        regularizer=reg_b)

    def make_shape(self, parent):
        shape = parent.output_shape
        self.input_shape = shape
        assert len(shape) == 1, shape
        self.coeffs.shape = (shape[0], self.dim)
        self.biases.shape = (1, self.dim)

    def forward(self, X):
        self.X = X
        return X @ self.coeffs.f + self.biases.f

    def backward(self, dY):
        self.coeffs.g += self.X.T @ dY
        self.biases.g += dY.sum(0, keepdims=True)
        return dY @ self.coeffs.f.T
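

# shape bookkeeping note (illustrative): for X of shape (batch, ins) and
# coeffs W of shape (ins, outs), Dense computes
#   forward:   Y = X @ W + b    -> (batch, outs)
#   backward:  dW = X.T @ dY    -> (ins, outs)
#              db = dY.sum(0)   -> (1, outs)
#              dX = dY @ W.T    -> (batch, ins)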


# Models {{{1

class Model:
    def __init__(self, nodes_in, nodes_out, loss=None, mloss=None,
                 unsafe=False):
        self.loss = loss if loss is not None else SquaredHalved()
        self.mloss = mloss if mloss is not None else loss

        nodes_in = [nodes_in] if isinstance(nodes_in, Layer) else nodes_in
        nodes_out = [nodes_out] if isinstance(nodes_out, Layer) else nodes_out
        assert type(nodes_in) == list, type(nodes_in)
        assert type(nodes_out) == list, type(nodes_out)

        self.nodes_in = nodes_in
        self.nodes_out = nodes_out

        self.nodes = traverse_all(self.nodes_in, self.nodes_out)
        self.make_weights()
        for node in self.nodes:
            node.unsafe = unsafe
        # TODO: handle the same layer being in more than one node.

    @property
    def ordered_nodes(self):
        # deprecated? we don't guarantee an order like we did before.
        return self.nodes

    def make_weights(self):
        self.param_count = sum((node.size for node in self.nodes
                                if not node.shared))
        self.W = np.zeros(self.param_count, dtype=_f)
        self.dW = np.zeros(self.param_count, dtype=_f)

        offset = 0
        for node in self.nodes:
            if node.size > 0 and not node.shared:
                inner_offset = 0

                def allocate(size):
                    nonlocal inner_offset
                    o = offset + inner_offset
                    ret = self.W[o:o+size], self.dW[o:o+size]
                    inner_offset += size
                    assert len(ret[0]) == len(ret[1])
                    assert size == len(ret[0]), (size, len(ret[0]))
                    return ret

                node.init(allocate)
                assert inner_offset <= node.size, \
                    "Layer {} allocated more weights than it said it would".format(node)
                # i don't care if "less" is grammatically incorrect.
                # you're mom is grammatically incorrect.
                assert inner_offset >= node.size, \
                    "Layer {} allocated less weights than it said it would".format(node)
                offset += node.size

    def evaluate(self, input_, deterministic=True):
        assert len(self.nodes_in) == 1, \
            "ambiguous input in multi-input network; use evaluate_multi() instead"
        assert len(self.nodes_out) == 1, \
            "ambiguous output in multi-output network; use evaluate_multi() instead"
        node_in = self.nodes_in[0]
        node_out = self.nodes_out[0]
        outputs = self.evaluate_multi({node_in: input_}, deterministic)
        return outputs[node_out]

    def apply(self, error):  # TODO: better name?
        assert len(self.nodes_in) == 1, \
            "ambiguous input in multi-input network; use apply_multi() instead"
        assert len(self.nodes_out) == 1, \
            "ambiguous output in multi-output network; use apply_multi() instead"
        node_in = self.nodes_in[0]
        node_out = self.nodes_out[0]
        inputs = self.apply_multi({node_out: error})
        return inputs[node_in]

    def evaluate_multi(self, inputs, deterministic=True):
        values = dict()
        outputs = dict()
        for node in self.nodes:
            if node in self.nodes_in:
                assert node in inputs, \
                    "missing input for node {}".format(node.name)
                X = inputs[node]
                values[node] = node._propagate(np.expand_dims(X, 0),
                                               deterministic)
            else:
                values[node] = node.propagate(values, deterministic)
            if node in self.nodes_out:
                outputs[node] = values[node]
        return outputs

    def apply_multi(self, outputs):
        values = dict()
        inputs = dict()
        for node in reversed(self.nodes):
            if node in self.nodes_out:
                assert node in outputs, \
                    "missing output for node {}".format(node.name)
                X = outputs[node]
                values[node] = node._backpropagate(np.expand_dims(X, 0))
            else:
                values[node] = node.backpropagate(values)
            if node in self.nodes_in:
                inputs[node] = values[node]
        return inputs

    def forward(self, inputs, outputs, measure=False, deterministic=False):
        predicted = self.evaluate(inputs, deterministic=deterministic)
        if measure:
            error = self.mloss.forward(predicted, outputs)
        else:
            error = self.loss.forward(predicted, outputs)
        return error, predicted

    def backward(self, predicted, outputs, measure=False):
        if measure:
            error = self.mloss.backward(predicted, outputs)
        else:
            error = self.loss.backward(predicted, outputs)
        # input_delta is rarely useful; it's just to match the forward pass.
        input_delta = self.apply(error)
        return self.dW, input_delta

    def clear_grad(self):
        for node in self.nodes:
            node.clear_grad()

    def regulate_forward(self):
        loss = _0
        for node in self.nodes:
            if node.loss is not None:
                loss += node.loss
            for k, w in node.weights.items():
                loss += w.forward()
        return loss

    def regulate(self):
        for node in self.nodes:
            for k, w in node.weights.items():
                w.update()

    def load_weights(self, fn):
        # seemingly compatible with keras' Dense layers.
        import h5py
        open(fn)  # just ensure the file exists (python's error is better)
        f = h5py.File(fn, 'r')
        weights = {}

        def visitor(name, obj):
            if isinstance(obj, h5py.Dataset):
                weights[name.split('/')[-1]] = np.array(obj[:], dtype=_f)

        f.visititems(visitor)
        f.close()

        used = {}
        for k in weights.keys():
            used[k] = False

        nodes = [node for node in self.nodes if node.size > 0]
        # TODO: support shared weights.
        for node in nodes:
            full_name = str(node).lower()
            for s_name, o_name in node.serialized.items():
                key = full_name + '_' + s_name
                data = weights[key]
                target = getattr(node, o_name)
                target.f[:] = data
                used[key] = True

        for k, v in used.items():
            if not v:
                lament("WARNING: unused weight", k)
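
    # serialization layout note (illustrative): a Dense layer named "Dense_1"
    # maps its keys through Dense.serialized, so its coefficients are stored
    # in a dataset named "dense_1_W" (under group "dense_1") and its biases
    # in "dense_1_b"; load_weights above looks the datasets up the same way.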

    def save_weights(self, fn, overwrite=False):
        import h5py
        f = h5py.File(fn, 'w')

        counts = defaultdict(lambda: 0)

        nodes = [node for node in self.nodes if node.size > 0]
        # TODO: support shared weights.
        for node in nodes:
            full_name = str(node).lower()
            grp = f.create_group(full_name)
            for s_name, o_name in node.serialized.items():
                key = full_name + '_' + s_name
                target = getattr(node, o_name)
                data = grp.create_dataset(key, target.shape, dtype=_f)
                data[:] = target.f
                counts[key] += 1
                if counts[key] > 1:
                    lament("WARNING: rewrote weight", key)

        f.close()

    def print_graph(self, file=sys.stdout):
        print('digraph G {', file=file)
        for node in self.nodes:
            children = [str(n) for n in node.children]
            if children:
                sep = '->'
                print('\t' + str(node) + sep +
                      (';\n\t' + str(node) + sep).join(children) + ';',
                      file=file)
        print('}', file=file)
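

# end-to-end sketch (illustrative; the shapes, data, and hyperparameters are
# made up, and `inputs`/`outputs` are assumed to be float32 arrays whose
# length is divisible by the batch size). Ritual and Learner are defined below:
#   x = Input((2,))
#   y = x.feed(Dense(8)).feed(Relu()).feed(Dense(1))
#   model = Model(x, y, loss=Squared(), mloss=Absolute())
#   ritual = Ritual(learner=Learner(Adam(lr=0.01), epochs=10))
#   ritual.prepare(model)
#   while ritual.learner.next():
#       avg_mloss = ritual.train_batched(inputs, outputs, batch_size=16)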


# Rituals {{{1

class Ritual:  # i'm just making up names at this point.
    def __init__(self, learner=None):
        self.learner = learner if learner is not None else Learner(Optimizer())
        self.model = None

    def reset(self):
        self.learner.reset(optim=True)
        self.en = 0
        self.bn = 0

    def learn(self, inputs, outputs):
        error, predicted = self.model.forward(inputs, outputs)
        self.model.backward(predicted, outputs)
        self.model.regulate()
        return error, predicted

    def update(self):
        optim = self.learner.optim
        optim.model = self.model
        optim.update(self.model.dW, self.model.W)

    def prepare(self, model):
        self.en = 0
        self.bn = 0
        self.model = model

    def _train_batch(self, batch_inputs, batch_outputs, b, batch_count,
                     test_only=False, loss_logging=False, mloss_logging=True):
        if not test_only and self.learner.per_batch:
            self.learner.batch(b / batch_count)

        if test_only:
            predicted = self.model.evaluate(batch_inputs, deterministic=True)
        else:
            error, predicted = self.learn(batch_inputs, batch_outputs)
            self.model.regulate_forward()
            self.update()

        if loss_logging:
            batch_loss = self.model.loss.forward(predicted, batch_outputs)
            if np.isnan(batch_loss):
                raise Exception("nan")
            self.losses.append(batch_loss)
            self.cumsum_loss += batch_loss

        if mloss_logging:
            # NOTE: this can use the non-deterministic predictions. fixme?
            batch_mloss = self.model.mloss.forward(predicted, batch_outputs)
            if np.isnan(batch_mloss):
                raise Exception("nan")
            self.mlosses.append(batch_mloss)
            self.cumsum_mloss += batch_mloss

    def train_batched(self, inputs_or_generator, outputs_or_batch_count,
                      batch_size=None, return_losses=False, test_only=False,
                      shuffle=True, clear_grad=True):
        assert isinstance(return_losses, bool) or return_losses == 'both'
        assert self.model is not None

        gen = isinstance(inputs_or_generator, types.GeneratorType)
        if gen:
            generator = inputs_or_generator
            batch_count = outputs_or_batch_count
            assert isinstance(batch_count, int), type(batch_count)
        else:
            inputs = inputs_or_generator
            outputs = outputs_or_batch_count

        if not test_only:
            self.en += 1

        if shuffle:
            if gen:
                raise Exception("shuffling is incompatible with using a generator.")
            indices = np.arange(inputs.shape[0])
            np.random.shuffle(indices)
            inputs = inputs[indices]
            outputs = outputs[indices]

        self.cumsum_loss, self.cumsum_mloss = _0, _0
        self.losses, self.mlosses = [], []

        if not gen:
            batch_count = inputs.shape[0] // batch_size
            # TODO: lift this restriction
            assert inputs.shape[0] % batch_size == 0, \
                "inputs is not evenly divisible by batch_size"

        prev_batch_size = None
        for b in range(batch_count):
            if not test_only:
                self.bn += 1

            if gen:
                batch_inputs, batch_outputs = next(generator)
                batch_size = batch_inputs.shape[0]
                # TODO: lift this restriction
                assert batch_size == prev_batch_size or prev_batch_size is None, \
                    "non-constant batch size (got {}, expected {})".format(
                        batch_size, prev_batch_size)
            else:
                bi = b * batch_size
                batch_inputs = inputs[bi:bi+batch_size]
                batch_outputs = outputs[bi:bi+batch_size]

            if clear_grad:
                self.model.clear_grad()
            self._train_batch(batch_inputs, batch_outputs, b, batch_count,
                              test_only, return_losses == 'both',
                              return_losses)

            prev_batch_size = batch_size

        avg_mloss = self.cumsum_mloss / _f(batch_count)
        if return_losses == 'both':
            avg_loss = self.cumsum_loss / _f(batch_count)
            return avg_loss, avg_mloss, self.losses, self.mlosses
        elif return_losses:
            return avg_mloss, self.mlosses
        return avg_mloss

    def test_batched(self, inputs, outputs, *args, **kwargs):
        return self.train_batched(inputs, outputs, *args,
                                  test_only=True, **kwargs)

    def train_batched_gen(self, generator, batch_count, *args, **kwargs):
        return self.train_batched(generator, batch_count, *args,
                                  shuffle=False, **kwargs)
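

# generator training sketch (illustrative; `ritual`, `inputs`, and `outputs`
# are assumed to exist as in the sketch above). batches must stay a constant
# size, and shuffling is left to the generator itself:
#   def batches():
#       while True:
#           idx = np.random.randint(0, len(inputs), size=32)
#           yield inputs[idx], outputs[idx]
#   ritual.train_batched_gen(batches(), len(inputs) // 32)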


# Learners {{{1

class Learner:
    per_batch = False

    def __init__(self, optim, epochs=100, rate=None):
        assert isinstance(optim, Optimizer)
        self.optim = optim
        self.start_rate = rate  # None is okay; it'll use optim.lr instead.
        self.epochs = int(epochs)
        self.reset()

    def reset(self, optim=False):
        self.started = False
        self.epoch = 0
        if optim:
            self.optim.reset()

    @property
    def epoch(self):
        return self._epoch

    @epoch.setter
    def epoch(self, new_epoch):
        self._epoch = int(new_epoch)
        if 0 <= self.epoch <= self.epochs:
            self.rate = self.rate_at(self._epoch)

    @property
    def rate(self):
        return self.optim.lr

    @rate.setter
    def rate(self, new_rate):
        self.optim.lr = new_rate

    def rate_at(self, epoch):
        if self.start_rate is None:
            return self.optim.lr
        return self.start_rate

    def next(self):
        # prepares the next epoch. returns whether or not to continue training.
        if not self.started:
            self.started = True
        self.epoch += 1
        if self.epoch > self.epochs:
            return False
        return True

    def batch(self, progress):  # TODO: rename
        # interpolates rates between epochs.
        # unlike epochs, we do not store batch number as a state.
        # i.e. calling next() will not respect progress.
        assert 0 <= progress <= 1
        self.rate = self.rate_at(self._epoch + progress)

    @property
    def final_rate(self):
        return self.rate_at(self.epochs - 1e-8)


class AnnealingLearner(Learner):
    def __init__(self, optim, epochs=100, rate=None, halve_every=10):
        self.halve_every = _f(halve_every)
        self.anneal = _f(0.5**(1/self.halve_every))
        super().__init__(optim, epochs, rate)

    def rate_at(self, epoch):
        return super().rate_at(epoch) * self.anneal**epoch


def cosmod(x):
    # plot: https://www.desmos.com/calculator/hlgqmyswy2
    return (_1 + np.cos((x % _1) * _pi)) * _inv2


class SGDR(Learner):
    # Stochastic Gradient Descent with Restarts
    # paper: https://arxiv.org/abs/1608.03983
    # NOTE: this is missing a couple of the proposed features.

    per_batch = True

    def __init__(self, optim, epochs=100, rate=None,
                 restarts=0, restart_decay=0.5, callback=None,
                 expando=0):
        self.restart_epochs = int(epochs)
        self.decay = _f(restart_decay)
        self.restarts = int(restarts)
        self.restart_callback = callback
        # TODO: rename expando to something not insane
        self.expando = expando if expando is not None else lambda i: i
        if type(self.expando) == int:
            inc = self.expando
            self.expando = lambda i: i * inc

        self.splits = []
        epochs = 0
        for i in range(0, self.restarts + 1):
            split = epochs + self.restart_epochs + int(self.expando(i))
            self.splits.append(split)
            epochs = split
        super().__init__(optim, epochs, rate)

    def split_num(self, epoch):
        previous = [0] + self.splits
        for i, split in enumerate(self.splits):
            if epoch - 1 < split:
                sub_epoch = epoch - previous[i]
                next_restart = split - previous[i]
                return i, sub_epoch, next_restart
        raise Exception('this should never happen.')

    def rate_at(self, epoch):
        base_rate = self.start_rate if self.start_rate is not None \
            else self.optim.lr
        restart, sub_epoch, next_restart = self.split_num(max(1, epoch))
        x = _f(sub_epoch - 1) / _f(next_restart)
        return base_rate * self.decay**_f(restart) * cosmod(x)

    def next(self):
        if not super().next():
            return False
        restart, sub_epoch, next_restart = self.split_num(self.epoch)
        if restart > 0 and sub_epoch == 1:
            if self.restart_callback is not None:
                self.restart_callback(restart)
        return True


class TriangularCLR(Learner):
    per_batch = True

    def __init__(self, optim, epochs=400, upper_rate=None, lower_rate=0,
                 frequency=100, callback=None):
        # NOTE: start_rate is treated as upper_rate
        self.frequency = int(frequency)
        assert self.frequency > 0
        self.callback = callback
        self.lower_rate = _f(lower_rate)
        super().__init__(optim, epochs, upper_rate)

    def _t(self, epoch):
        # NOTE: this could probably be simplified
        offset = self.frequency / 2
        return np.abs(((epoch - 1 + offset) % self.frequency) - offset) \
            / offset

    def rate_at(self, epoch):
        upper_rate = self.start_rate if self.start_rate is not None \
            else self.optim.lr
        return self._t(epoch) * (upper_rate - self.lower_rate) + self.lower_rate

    def next(self):
        if not super().next():
            return False
        e = self.epoch - 1
        if e > 0 and e % self.frequency == 0:
            if self.callback is not None:
                self.callback(self.epoch // self.frequency)
        return True


class SineCLR(TriangularCLR):
    def _t(self, epoch):
        return np.sin(_pi * _inv2 * super()._t(epoch))


class WaveCLR(TriangularCLR):
    def _t(self, epoch):
        return _inv2 * (_1 - np.cos(_pi * super()._t(epoch)))
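

# schedule sketch (illustrative): sampling the rate curve of a warm-restart
# learner without training anything. with epochs=20, restarts=2, and the
# default expando=0, the schedule spans 60 epochs split at [20, 40, 60]:
#   learner = SGDR(Adam(lr=0.01), epochs=20, restarts=2, restart_decay=0.5)
#   rates = [learner.rate_at(e) for e in range(1, learner.epochs + 1)]
# each segment follows cosmod() down from its peak, and each restart halves
# the peak via restart_decay.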