From bbdb91fcb1d48e74c2989e200be7e154fec02399 Mon Sep 17 00:00:00 2001 From: Connor Olding Date: Sun, 21 Jan 2018 22:04:25 +0000 Subject: [PATCH] merge and split modules into a package --- .gitignore | 7 + README.md | 2 +- onn_mnist.py => mnist_example.py | 2 +- onn.py | 1429 ------------------------------ onn/__init__.py | 28 + onn/activation.py | 185 ++++ onn/floats.py | 18 + onn/initialization.py | 31 + onn/layer.py | 200 +++++ onn/layer_base.py | 143 +++ onn/learner.py | 180 ++++ onn/loss.py | 129 +++ onn/math.py | 14 + onn/model.py | 197 ++++ onn/nodes.py | 43 + onn/optimizer.py | 531 +++++++++++ onn/optimizer_base.py | 19 + onn/parametric.py | 254 ++++++ onn/regularizer.py | 45 + onn/ritual.py | 90 ++ onn/ritual_base.py | 132 +++ onn/util.py | 37 + onn/weights.py | 58 ++ onn_core.py | 1398 ----------------------------- 24 files changed, 2343 insertions(+), 2829 deletions(-) create mode 100644 .gitignore rename onn_mnist.py => mnist_example.py (99%) mode change 100755 => 100644 delete mode 100755 onn.py create mode 100644 onn/__init__.py create mode 100644 onn/activation.py create mode 100644 onn/floats.py create mode 100644 onn/initialization.py create mode 100644 onn/layer.py create mode 100644 onn/layer_base.py create mode 100644 onn/learner.py create mode 100644 onn/loss.py create mode 100644 onn/math.py create mode 100644 onn/model.py create mode 100644 onn/nodes.py create mode 100644 onn/optimizer.py create mode 100644 onn/optimizer_base.py create mode 100644 onn/parametric.py create mode 100644 onn/regularizer.py create mode 100644 onn/ritual.py create mode 100644 onn/ritual_base.py create mode 100644 onn/util.py create mode 100644 onn/weights.py delete mode 100644 onn_core.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..667fa90 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +__pycache__ +*.pyc + +# work in progress: +onn/component.py +onn/extra.py +onn/run.py diff --git a/README.md b/README.md index 08bf202..c12b612 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ numpy scipy h5py sklearn dotmap ```python #!/usr/bin/env python3 -from onn_core import * +from onn import * bs = 500 lr = 0.01 reg = L1L2(3.2e-5, 3.2e-4) diff --git a/onn_mnist.py b/mnist_example.py old mode 100755 new mode 100644 similarity index 99% rename from onn_mnist.py rename to mnist_example.py index c321bfe..d1a98ff --- a/onn_mnist.py +++ b/mnist_example.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 from onn import * -from onn_core import _f +from onn.floats import * from dotmap import DotMap lower_priority() diff --git a/onn.py b/onn.py deleted file mode 100755 index 126e2f3..0000000 --- a/onn.py +++ /dev/null @@ -1,1429 +0,0 @@ -#!/usr/bin/env python3 - -# external packages required for full functionality: -# numpy scipy h5py sklearn dotmap - -# BIG TODO: ensure numpy isn't upcasting to float64 *anywhere*. -# this is gonna take some work. 
- -from onn_core import * -from onn_core import _check, _f, _0, _1 - -import sys - -_log_was_update = False -def log(left, right, update=False): - s = "\x1B[1m {:>20}:\x1B[0m {}".format(left, right) - global _log_was_update - if update and _log_was_update: - lament('\x1B[F' + s) - else: - lament(s) - _log_was_update = update - -class Dummy: - pass - -# Math Utilities {{{1 - -def rolling(a, window): - # http://stackoverflow.com/a/4924433 - shape = (a.size - window + 1, window) - strides = (a.itemsize, a.itemsize) - return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) - -def rolling_batch(a, window): - # same as rolling, but acts on each batch (axis 0). - shape = (a.shape[0], a.shape[-1] - window + 1, window) - strides = (np.prod(a.shape[1:]) * a.itemsize, a.itemsize, a.itemsize) - return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) - -# Initializations {{{1 - -def init_gaussian_unit(size, ins, outs): - s = np.sqrt(1 / ins) - return np.random.normal(0, s, size=size) - -# Loss functions {{{1 - -class SomethingElse(ResidualLoss): - # generalizes Absolute and SquaredHalved. - # plot: https://www.desmos.com/calculator/fagjg9vuz7 - def __init__(self, a=4/3): - assert 1 <= a <= 2, "parameter out of range" - self.a = _f(a / 2) - self.b = _f(2 / a) - self.c = _f(2 / a - 1) - - def f(self, r): - return self.a * np.abs(r)**self.b - - def df(self, r): - return np.sign(r) * np.abs(r)**self.c - -class Confidence(Loss): - # this isn't "confidence" in any meaningful way; (e.g. Bayesian) - # it's just a metric of how large the value is of the predicted class. - # when using it for loss, it acts like a crappy regularizer. - # it really just measures how much of a hot-shot the network thinks it is. - - def forward(self, p, y=None): - categories = p.shape[-1] - confidence = (np.max(p, axis=-1) - 1/categories) / (1 - 1/categories) - # the exponent in softmax puts a maximum on confidence, - # but we don't compensate for that. if necessary, - # it'd be better to use an activation that doesn't have this limit. - return np.mean(confidence) - - def backward(self, p, y=None): - # in order to agree with the forward pass, - # using this backwards pass as-is will minimize confidence. - categories = p.shape[-1] - detc = p / categories / (1 - 1/categories) - dmax = p == np.max(p, axis=-1, keepdims=True) - return detc * dmax - -# Regularizers {{{1 - -class SaturateRelu(Regularizer): - # paper: https://arxiv.org/abs/1703.09202 - # TODO: test this (and ActivityRegularizer) more thoroughly. - # i've looked at the histogram of the resulting weights. - # it seems like only the layers after this are affected - # the way they should be. 
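-    #       in effect this is a one-sided L1 penalty, lamb * max(X, 0),
-    #       meant for activations (e.g. wrapped in an ActivityRegularizer).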
- - def __init__(self, lamb=0.0): - self.lamb = _f(lamb) - - def forward(self, X): - return self.lamb * np.where(X >= 0, X, 0) - - def backward(self, X): - return self.lamb * np.where(X >= 0, 1, 0) - -# Optimizers {{{1 - -class FTML(Optimizer): - # paper: http://www.cse.ust.hk/~szhengac/papers/icml17.pdf - # author's implementation: https://github.com/szhengac/optim/commit/923555e - - def __init__(self, lr=0.0025, b1=0.6, b2=0.999, eps=1e-8): - self.iterations = _0 - self.b1 = _f(b1) # decay term - self.b2 = _f(b2) # decay term - self.eps = _f(eps) - - super().__init__(lr) - - def reset(self): - self.dt1 = None - self.dt = None - self.vt = None - self.zt = None - self.b1_t = _1 - self.b2_t = _1 - - def compute(self, dW, W): - if self.dt1 is None: self.dt1 = np.zeros_like(dW) - if self.dt is None: self.dt = np.zeros_like(dW) - if self.vt is None: self.vt = np.zeros_like(dW) - if self.zt is None: self.zt = np.zeros_like(dW) - - # NOTE: we could probably rewrite these equations to avoid this copy. - self.dt1[:] = self.dt[:] - - self.b1_t *= self.b1 - self.b2_t *= self.b2 - - # hardly an elegant solution. - lr = max(self.lr, self.eps) - - # same as Adam's vt. - self.vt[:] = self.b2 * self.vt + (1 - self.b2) * dW * dW - - # you can factor "inner" out of Adam as well. - inner = np.sqrt(self.vt / (1 - self.b2_t)) + self.eps - self.dt[:] = (1 - self.b1_t) / lr * inner - - sigma_t = self.dt - self.b1 * self.dt1 - - # Adam's mt minus the sigma term. - self.zt[:] = self.b1 * self.zt + (1 - self.b1) * dW - sigma_t * W - - # subtract by weights to avoid having to override self.update. - return -self.zt / self.dt - W - -class MomentumClip(Optimizer): - def __init__(self, lr=0.01, mu=0.9, nesterov=False, clip=1.0, debug=False): - self.mu = _f(mu) - self.clip = _f(clip) - self.nesterov = bool(nesterov) - self.debug = bool(debug) - - super().__init__(lr) - - def reset(self): - self.accum = None - - def compute(self, dW, W): - if self.accum is None: - self.accum = np.zeros_like(dW) - - total_norm = np.linalg.norm(dW) - clip_scale = self.clip / (total_norm + 1e-6) - if clip_scale < 1: - if self.debug: - lament("clipping gradients; norm: {:10.5f}".format(total_norm)) - dW *= clip_scale - - self.accum[:] = self.accum * self.mu + dW - if self.nesterov: - return -self.lr * (self.accum * self.mu + dW) - else: - return -self.lr * self.accum - -class YellowFin(Optimizer): - # paper: https://arxiv.org/abs/1706.03471 - # knowyourmeme: http://cs.stanford.edu/~zjian/project/YellowFin/ - # author's implementation: https://github.com/JianGoForIt/YellowFin/blob/master/tuner_utils/yellowfin.py - # code lifted: https://gist.github.com/botev/f8b32c00eafee222e47393f7f0747666 - - def __init__(self, lr=0.1, mu=0.0, beta=0.999, window_size=20, - debias=True, clip=1.0): - self.lr_default = _f(lr) - self.mu_default = _f(mu) - self.beta = _f(beta) - self.window_size = int(window_size) # curv_win_width - self.debias_enabled = bool(debias) - self.clip = _f(clip) - - self.mu = _f(mu) # momentum - super().__init__(lr) - - def reset(self): - self.accum = None - - self.lr = self.lr_default - self.mu = self.mu_default - - self.step = 0 - self.beta_t = self.beta - - self.curv_win = np.zeros([self.window_size,], dtype=np.float32) - - self.h_min = None - self.h_max = None - - self.g_lpf = 0 - #self.g_squared_lpf = 0 - self.g_norm_squared_lpf = 0 - self.g_norm_lpf = 0 - self.h_min_lpf = 0 - self.h_max_lpf = 0 - self.dist_lpf = 0 - self.lr_lpf = 0 - self.mu_lpf = 0 - - def get_lr_mu(self): - p = (np.square(self.dist_avg) * 
np.square(self.h_min)) / (2 * self.g_var) - w3 = p * (np.sqrt(0.25 + p / 27.0) - 0.5) - w = np.power(w3, 1/3) - y = w - p / (3 * w) - sqrt_mu1 = y + 1 - - sqrt_h_min = np.sqrt(self.h_min) - sqrt_h_max = np.sqrt(self.h_max) - sqrt_mu2 = (sqrt_h_max - sqrt_h_min) / (sqrt_h_max + sqrt_h_min) - - sqrt_mu = max(sqrt_mu1, sqrt_mu2) - if sqrt_mu2 > sqrt_mu1: - print('note: taking dr calculation. something may have exploded.') - - lr = np.square(1 - sqrt_mu) / self.h_min - mu = np.square(sqrt_mu) - return lr, mu - - def compute(self, dW, W): - if self.accum is None: - self.accum = np.zeros_like(dW) - - # TODO: prevent allocations everywhere by using [:]. - # assuming that really works. i haven't actually checked. - - total_norm = np.linalg.norm(dW) - clip_scale = self.clip / (total_norm + 1e-6) - if clip_scale < 1: - #print("clipping gradients; norm: {:10.5f}".format(total_norm)) - dW *= clip_scale - - #fmt = 'W std: {:10.7f}e-3, dWstd: {:10.7f}e-3, V std: {:10.7f}e-3' - #print(fmt.format(np.std(W), np.std(dW) * 100, np.std(V) * 100)) - - b = self.beta - m1b = 1 - self.beta - debias = 1 / (1 - self.beta_t) if self.debias_enabled else 1 - - g = dW - g_squared = np.square(g) - g_norm_squared = np.sum(g_squared) - g_norm = np.sqrt(g_norm_squared) - - self.curv_win[self.step % self.window_size] = g_norm_squared - valid_window = self.curv_win[:min(self.window_size, self.step + 1)] - h_min_t = np.min(valid_window) - h_max_t = np.max(valid_window) - - self.g_lpf = b * self.g_lpf + m1b * g - #self.g_squared_lpf = b * self.g_squared_lpf + m1b * g_squared - self.g_norm_squared_lpf = b * self.g_norm_squared_lpf + m1b * g_norm_squared - self.g_norm_lpf = b * self.g_norm_lpf + m1b * g_norm - self.h_min_lpf = b * self.h_min_lpf + m1b * h_min_t - self.h_max_lpf = b * self.h_max_lpf + m1b * h_max_t - - g_avg = debias * self.g_lpf - #g_squared_avg = debias * self.g_squared_lpf - g_norm_squared_avg = debias * self.g_norm_squared_lpf - g_norm_avg = debias * self.g_norm_lpf - self.h_min = debias * self.h_min_lpf - self.h_max = debias * self.h_max_lpf - assert self.h_max >= self.h_min - - dist = g_norm_avg / g_norm_squared_avg - - self.dist_lpf = b * self.dist_lpf + m1b * dist - - self.dist_avg = debias * self.dist_lpf - - self.g_var = g_norm_squared_avg - np.sum(np.square(g_avg)) - # equivalently: - #self.g_var = np.sum(np.abs(g_squared_avg - np.square(g_avg))) - - if self.step > 0: - lr_for_real, mu_for_real = self.get_lr_mu() - self.mu_lpf = b * self.mu_lpf + m1b * mu_for_real - self.lr_lpf = b * self.lr_lpf + m1b * lr_for_real - self.mu = debias * self.mu_lpf - self.lr = debias * self.lr_lpf - - self.accum[:] = self.accum * self.mu - self.lr * dW - V = self.accum - - self.step += 1 - self.beta_t *= self.beta - return V - -class AddSign(Optimizer): - # paper: https://arxiv.org/abs/1709.07417 - - def __init__(self, lr=0.01, mu=0.9, alpha=1): - self.mu = _f(mu) - self.alpha = _f(alpha) - - super().__init__(lr) - - def reset(self): - self.accum = None - - def compute(self, dW, W): - if self.accum is None: - self.accum = np.zeros_like(dW) - - self.accum[:] = self.accum * self.mu + dW - - signed = np.sign(dW) * np.sign(self.accum) - #signed *= decay - - return -self.lr * dW * (self.alpha + signed) - -class PowerSign(Optimizer): - # paper: https://arxiv.org/abs/1709.07417 - - def __init__(self, lr=0.01, mu=0.9, alpha=np.e): - self.mu = _f(mu) - self.alpha = _f(alpha) - self.use_exp = np.isclose(self.alpha, _f(np.e)) - - super().__init__(lr) - - def reset(self): - self.accum = None - - def compute(self, dW, W): - if 
self.accum is None: - self.accum = np.zeros_like(dW) - - self.accum[:] = self.accum * self.mu + dW - - signed = np.sign(dW) * np.sign(self.accum) - #signed *= decay - - if self.use_exp: - return -self.lr * dW * np.exp(signed) - else: - return -self.lr * dW * np.power(self.alpha, signed) - -class Neumann(Optimizer): - # paper: https://arxiv.org/abs/1712.03298 - # NOTE: this implementation is missing resetting as described in the paper. - # resetting is totally disabled for now. - # NOTE: this implementation does not use vanilla SGD for its first epochs. - # you should do this yourself if you need it. - # it seems like using a Learner like SineCLR makes this unnecessary. - - def __init__(self, lr=0.01): - self.alpha = _f(1e-7) # cubic. - self.beta = _f(1e-5) # repulsive. NOTE: multiplied by len(dW) later. - self.gamma = _f(0.99) # EMA, or 1-pole low-pass parameter. same thing. - # momentum is ∝ (in the shape of) 1 - 1/(1 + t) - self.mu_min = _f(0.5) - self.mu_max = _f(0.9) - self.reset_period = 0 # TODO - - super().__init__(lr) - - def reset(self): - # NOTE: mt and vt are different than the pair in Adam-like optimizers. - self.mt = None # momentum accumulator. - self.vt = None # weight accumulator. - self.t = 0 - - def compute(self, dW, W): - raise Exception("compute() is not available for this Optimizer.") - - def update(self, dW, W): - self.t += 1 - - if self.mt is None: - self.mt = np.zeros_like(dW) - if self.vt is None: - self.vt = np.zeros_like(dW) - - if self.reset_period > 0 and (self.t - 1) % self.reset_period == 0: - self.mt = -self.lr * dW - return - - mu = _1 - _1/_f(self.t) # the + 1 is implicit. - mu = (mu + self.mu_min) * (self.mu_max - self.mu_min) - - delta = W - self.vt - delta_norm_squared = np.square(delta).sum() - delta_norm = np.sqrt(delta_norm_squared) - - alpha = self.alpha - beta = self.beta * dW.size - - cubic_reg = alpha * delta_norm_squared - repulsive_reg = beta / delta_norm_squared - dt = dW + (cubic_reg - repulsive_reg) * (delta / delta_norm) - - self.mt = mu * self.mt - self.lr * dt - - W += mu * self.mt - self.lr * dt - self.vt = W + self.gamma * (self.vt - W) - -# Nonparametric Layers {{{1 - -class AlphaDropout(Layer): - # to be used alongside Selu activations. - # paper: https://arxiv.org/abs/1706.02515 - - def __init__(self, dropout=0.0, alpha=1.67326324, lamb=1.05070099): - super().__init__() - self.alpha = _f(alpha) - self.lamb = _f(lamb) - self.saturated = -self.lamb * self.alpha - self.dropout = _f(dropout) - - @property - def dropout(self): - return self._dropout - - @dropout.setter - def dropout(self, x): - self._dropout = _f(x) - self.q = 1 - self._dropout - assert 0 <= self.q <= 1 - - sat = self.saturated - - self.a = 1 / np.sqrt(self.q + sat * sat * self.q * self._dropout) - self.b = -self.a * (self._dropout * sat) - - def forward(self, X): - self.mask = np.random.rand(*X.shape) < self.q - return self.a * np.where(self.mask, X, self.saturated) + self.b - - def forward_deterministic(self, X): - return X - - def backward(self, dY): - return dY * self.a * self.mask - -class Decimate(Layer): - # simple decimaton layer that drops every other sample from the last axis. - - def __init__(self, phase='even'): - super().__init__() - # phase is the set of samples we keep in the forward pass. 
- assert phase in ('even', 'odd'), phase - self.phase = phase - - def make_shape(self, parent): - shape = parent.output_shape - self.input_shape = shape - divy = (shape[-1] + 1) // 2 if self.phase == 'even' else shape[-1] // 2 - self.output_shape = tuple(list(shape[:-1]) + [divy]) - self.dX = np.zeros(self.input_shape, dtype=_f) - - def forward(self, X): - self.batch_size = X.shape[0] - if self.phase == 'even': - return X.ravel()[0::2].reshape(self.batch_size, *self.output_shape) - elif self.phase == 'odd': - return X.ravel()[1::2].reshape(self.batch_size, *self.output_shape) - - def backward(self, dY): - assert dY.shape[0] == self.batch_size - dX = np.zeros((self.batch_size, *self.input_shape), dtype=_f) - if self.phase == 'even': - dX.ravel()[0::2] = dY.ravel() - elif self.phase == 'odd': - dX.ravel()[1::2] = dY.ravel() - return dX - -class Undecimate(Layer): - # inverse operation of Decimate. not quite interpolation. - - def __init__(self, phase='even'): - super().__init__() - # phase is the set of samples we keep in the backward pass. - assert phase in ('even', 'odd'), phase - self.phase = phase - - def make_shape(self, parent): - shape = parent.output_shape - self.input_shape = shape - mult = shape[-1] * 2 - self.output_shape = tuple(list(shape[:-1]) + [mult]) - - def forward(self, X): - self.batch_size = X.shape[0] - Y = np.zeros((self.batch_size, *self.output_shape), dtype=_f) - if self.phase == 'even': - Y.ravel()[0::2] = X.ravel() - elif self.phase == 'odd': - Y.ravel()[1::2] = X.ravel() - return Y - - def backward(self, dY): - assert dY.shape[0] == self.batch_size - if self.phase == 'even': - return dY.ravel()[0::2].reshape(self.batch_size, *self.input_shape) - elif self.phase == 'odd': - return dY.ravel()[1::2].reshape(self.batch_size, *self.input_shape) - -# Activations {{{2 - -class Selu(Layer): - # paper: https://arxiv.org/abs/1706.02515 - - def __init__(self, alpha=1.67326324, lamb=1.05070099): - super().__init__() - self.alpha = _f(alpha) - self.lamb = _f(lamb) - - def forward(self, X): - self.cond = X >= 0 - self.neg = self.alpha * np.exp(X) - return self.lamb * np.where(self.cond, X, self.neg - self.alpha) - - def backward(self, dY): - return dY * self.lamb * np.where(self.cond, 1, self.neg) - -class TanhTest(Layer): - def forward(self, X): - self.sig = np.tanh(1 / 2 * X) - return 2.4004 * self.sig - - def backward(self, dY): - return dY * (1 / 2 * 2.4004) * (1 - self.sig * self.sig) - -class ExpGB(Layer): - # an output layer for one-hot classification problems. - # use with MSE (SquaredHalved), not CategoricalCrossentropy! - # paper: https://arxiv.org/abs/1707.04199 - - def __init__(self, alpha=0.1, beta=0.0): - super().__init__() - self.alpha = _f(alpha) - self.beta = _f(beta) - - def forward(self, X): - return self.alpha * np.exp(X) + self.beta - - def backward(self, dY): - # this gradient is intentionally incorrect. - return dY - -class CubicGB(Layer): - # an output layer for one-hot classification problems. - # use with MSE (SquaredHalved), not CategoricalCrossentropy! - # paper: https://arxiv.org/abs/1707.04199 - # note: in the paper, it's called pow3GB, which is ugly. - - def __init__(self, alpha=0.1, beta=0.0): - # note: the paper suggests defaults of 0.001 and 0.0, - # but these didn't seem to work as well in my limited testing. - super().__init__() - self.alpha = _f(alpha) - self.beta = _f(beta) - - def forward(self, X): - return self.alpha * X**3 + self.beta - - def backward(self, dY): - # this gradient is intentionally incorrect. 
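-        # (dY is passed straight through, as if this layer were the identity.)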
- return dY - -# Parametric Layers {{{1 - -class Conv1Dper(Layer): - # periodic (circular) convolution. - # currently only supports one channel I/O. - # some notes: - # we could use FFTs for larger convolutions. - # i think storing the coefficients backwards would - # eliminate reversal in the critical code. - - serialize = { - 'W': 'coeffs', - } - - def __init__(self, kernel_size, pos=None, - init=init_glorot_uniform, reg_w=None): - super().__init__() - self.kernel_size = int(kernel_size) - self.coeffs = self._new_weights('coeffs', init=init, regularizer=reg_w) - if pos is None: - self.wrap0 = (self.kernel_size - 0) // 2 - self.wrap1 = (self.kernel_size - 1) // 2 - elif pos == 'alt': - self.wrap0 = (self.kernel_size - 1) // 2 - self.wrap1 = (self.kernel_size - 0) // 2 - elif pos == 'left': - self.wrap0 = 0 - self.wrap1 = self.kernel_size - 1 - elif pos == 'right': - self.wrap0 = self.kernel_size - 1 - self.wrap1 = 0 - else: - raise Exception("pos parameter not understood: {}".format(pos)) - - def make_shape(self, parent): - shape = parent.output_shape - self.input_shape = shape - assert len(shape) == 1, shape - self.output_shape = shape - self.coeffs.shape = (1, self.kernel_size) - - def forward(self, X): - if self.wrap0 == 0: - Xper = np.hstack((X,X[:,:self.wrap1])) - elif self.wrap1 == 0: - Xper = np.hstack((X[:,-self.wrap0:],X)) - else: - Xper = np.hstack((X[:,-self.wrap0:],X,X[:,:self.wrap1])) - self.cols = rolling_batch(Xper, self.kernel_size) - convolved = (self.cols * self.coeffs.f[:,::-1]).sum(2) - return convolved - - def backward(self, dY): - self.coeffs.g += (dY[:,:,None] * self.cols).sum(0)[:,::-1].sum(0, keepdims=True) - return (dY[:,:,None] * self.coeffs.f[:,::-1]).sum(2) - -class LayerNorm(Layer): - # paper: https://arxiv.org/abs/1607.06450 - # note: nonparametric when affine == False - - def __init__(self, eps=1e-5, affine=True): - super().__init__() - self.eps = _f(eps) - self.affine = bool(affine) - - if self.affine: - self.gamma = self._new_weights('gamma', init=init_ones) - self.beta = self._new_weights('beta', init=init_zeros) - self.serialized = { - 'gamma': 'gamma', - 'beta': 'beta', - } - - def make_shape(self, parent): - shape = parent.output_shape - self.input_shape = shape - self.output_shape = shape - assert len(shape) == 1, shape - if self.affine: - self.gamma.shape = (shape[0],) - self.beta.shape = (shape[0],) - - def forward(self, X): - self.mean = X.mean(0) - self.center = X - self.mean - self.var = self.center.var(0) + self.eps - self.std = np.sqrt(self.var) - - self.Xnorm = self.center / self.std - if self.affine: - return self.gamma.f * self.Xnorm + self.beta.f - return self.Xnorm - - def backward(self, dY): - length = dY.shape[0] - - if self.affine: - dXnorm = dY * self.gamma.f - self.gamma.g += (dY * self.Xnorm).sum(0) - self.beta.g += dY.sum(0) - else: - dXnorm = dY - - dstd = (dXnorm * self.center).sum(0) / -self.var - dcenter = dXnorm / self.std + dstd / self.std * self.center / length - dmean = -dcenter.sum(0) - dX = dcenter + dmean / length - - return dX - -class Denses(Layer): # TODO: rename? - # acts as a separate Dense for each row or column. only for 2D arrays. 
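-    # e.g. with axis=1 (what the default of -1 becomes on 2D input), each row
-    # gets its own dense mapping of (in_cols -> dim); with axis=0, each
-    # column gets its own dense mapping of (in_rows -> dim).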
- - serialized = { - 'W': 'coeffs', - 'b': 'biases', - } - - def __init__(self, dim, init=init_he_uniform, reg_w=None, reg_b=None, axis=-1): - super().__init__() - self.dim = int(dim) - self.weight_init = init - self.axis = int(axis) - self.coeffs = self._new_weights('coeffs', init=init, regularizer=reg_w) - self.biases = self._new_weights('biases', init=init_zeros, regularizer=reg_b) - - def make_shape(self, parent): - shape = parent.output_shape - self.input_shape = shape - assert len(shape) == 2, shape - - assert -len(shape) <= self.axis < len(shape) - self.axis = self.axis % len(shape) - - self.output_shape = list(shape) - self.output_shape[self.axis] = self.dim - self.output_shape = tuple(self.output_shape) - - in_rows = self.input_shape[0] - in_cols = self.input_shape[1] - out_rows = self.output_shape[0] - out_cols = self.output_shape[1] - - self.coeffs.shape = (in_rows, in_cols, self.dim) - self.biases.shape = (1, out_rows, out_cols) - - def forward(self, X): - self.X = X - if self.axis == 0: - return np.einsum('ixj,xjk->ikj', X, self.coeffs.f) + self.biases.f - elif self.axis == 1: - return np.einsum('ijx,jxk->ijk', X, self.coeffs.f) + self.biases.f - - def backward(self, dY): - self.biases.g += dY.sum(0, keepdims=True) - if self.axis == 0: - self.coeffs.g += np.einsum('ixj,ikj->xjk', self.X, dY) - return np.einsum('ikj,xjk->ixj', dY, self.coeffs.f) - elif self.axis == 1: - self.coeffs.g += np.einsum('ijx,ijk->jxk', self.X, dY) - return np.einsum('ijk,jxk->ijx', dY, self.coeffs.f) - -class CosineDense(Dense): - # paper: https://arxiv.org/abs/1702.05870 - # another implementation: https://github.com/farizrahman4u/keras-contrib/pull/36 - # the paper doesn't mention bias, - # so we treat bias as an additional weight with a constant input of 1. - # this is correct in Dense layers, so i hope it's correct here too. - - eps = 1e-4 - - def forward(self, X): - self.X = X - self.X_norm = np.sqrt(np.square(X).sum(-1, keepdims=True) \ - + 1 + self.eps) - self.W_norm = np.sqrt(np.square(self.coeffs.f).sum(0, keepdims=True) \ - + np.square(self.biases.f) + self.eps) - self.dot = X @ self.coeffs.f + self.biases.f - Y = self.dot / (self.X_norm * self.W_norm) - return Y - - def backward(self, dY): - ddot = dY / self.X_norm / self.W_norm - dX_norm = -(dY * self.dot / self.W_norm).sum(-1, keepdims=True) / self.X_norm**2 - dW_norm = -(dY * self.dot / self.X_norm).sum( 0, keepdims=True) / self.W_norm**2 - - self.coeffs.g += self.X.T @ ddot \ - + dW_norm / self.W_norm * self.coeffs.f - self.biases.g += ddot.sum(0, keepdims=True) \ - + dW_norm / self.W_norm * self.biases.f - dX = ddot @ self.coeffs.f.T + dX_norm / self.X_norm * self.X - - return dX - -# Rituals {{{1 - -def stochastic_multiply(W, gamma=0.5, allow_negation=False): - # paper: https://arxiv.org/abs/1606.01981 - - assert W.ndim == 1, W.ndim - assert 0 < gamma < 1, gamma - size = len(W) - alpha = np.max(np.abs(W)) - # NOTE: numpy gives [low, high) but the paper advocates [low, high] - mult = np.random.uniform(gamma, 1/gamma, size=size) - if allow_negation: - # NOTE: i have yet to see this do anything but cause divergence. - # i've referenced the paper several times yet still don't understand - # what i'm doing wrong, so i'm disabling it by default in my code. - # maybe i just need *a lot* more weights to compensate. 
- prob = (W / alpha + 1) / 2 - samples = np.random.random_sample(size=size) - mult *= np.where(samples < prob, 1, -1) - np.multiply(W, mult, out=W) - -class StochMRitual(Ritual): - # paper: https://arxiv.org/abs/1606.01981 - # this probably doesn't make sense for regression problems, - # let alone small models, but here it is anyway! - - def __init__(self, learner=None, gamma=0.5): - super().__init__(learner) - self.gamma = _f(gamma) - - def prepare(self, model): - self.W = np.copy(model.W) - super().prepare(model) - - def learn(self, inputs, outputs): - # an experiment: - #assert self.learner.rate < 10, self.learner.rate - #self.gamma = 1 - 1/2**(1 - np.log10(self.learner.rate)) - - self.W[:] = self.model.W - for layer in self.model.ordered_nodes: - if isinstance(layer, Dense): - stochastic_multiply(layer.coeffs.ravel(), gamma=self.gamma) - residual = super().learn(inputs, outputs) - self.model.W[:] = self.W - return residual - - def update(self): - super().update() - f = 0.5 - for layer in self.model.ordered_nodes: - if isinstance(layer, Dense): - np.clip(layer.W, -layer.std * f, layer.std * f, out=layer.W) - # np.clip(layer.W, -1, 1, out=layer.W) - -class NoisyRitual(Ritual): - def __init__(self, learner=None, - input_noise=0, output_noise=0, gradient_noise=0): - self.input_noise = _f(input_noise) - self.output_noise = _f(output_noise) - self.gradient_noise = _f(gradient_noise) - super().__init__(learner) - - def learn(self, inputs, outputs): - # this is pretty crude - if self.input_noise > 0: - s = self.input_noise - inputs = inputs + np.random.normal(0, s, size=inputs.shape) - if self.output_noise > 0: - s = self.output_noise - outputs = outputs + np.random.normal(0, s, size=outputs.shape) - return super().learn(inputs, outputs) - - def update(self): - # gradient noise paper: https://arxiv.org/abs/1511.06807 - if self.gradient_noise > 0: - size = len(self.model.dW) - gamma = 0.55 - #s = self.gradient_noise / (1 + self.bn) ** gamma - # experiments: - s = self.gradient_noise * np.sqrt(self.learner.rate) - #s = np.square(self.learner.rate) - #s = self.learner.rate / self.en - self.model.dW += np.random.normal(0, max(s, 1e-8), size=size) - super().update() - -# Learners {{{1 - -class PolyLearner(Learner): - per_batch = True - - def __init__(self, optim, epochs=400, coeffs=(1,)): - self.coeffs = tuple(coeffs) - super().__init__(optim, epochs, np.polyval(self.coeffs, 0)) - - def rate_at(self, epoch): - progress = (epoch - 1) / (self.epochs) - ret = np.polyval(self.coeffs, progress) - return np.abs(ret) - -# Components {{{1 - -def _mr_make_norm(norm, *args, **kwargs): - def _mr_norm(y, width, depth, block, multi, activation, style, FC, d): - skip = y - merger = Sum() - skip.feed(merger) - z_start = skip - z_start = z_start.feed(norm(*args, **kwargs)) - z_start = z_start.feed(activation()) - for _ in range(multi): - z = z_start - for j in range(block): - if j > 0: - z = z.feed(norm(*args, **kwargs)) - z = z.feed(activation()) - z = z.feed(FC()) - z.feed(merger) - y = merger - return y - return _mr_norm - -def _mr_batchless(y, width, depth, block, multi, activation, style, FC, d): - skip = y - merger = Sum() - skip.feed(merger) - z_start = skip.feed(activation()) - for _ in range(multi): - z = z_start - for j in range(block): - if j > 0: - z = z.feed(activation()) - z = z.feed(FC()) - z.feed(merger) - y = merger - return y - -def _mr_onelesssum(y, width, depth, block, multi, activation, style, FC, d): - # this is my own awful contraption. 
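-    # like the batchless style, but the final depth level omits the residual
-    # skip connection (and omits the Sum node entirely when multi == 1).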
- is_last = d + 1 == depth - needs_sum = not is_last or multi > 1 - skip = y - if needs_sum: - merger = Sum() - if not is_last: - skip.feed(merger) - z_start = skip.feed(activation()) - for _ in range(multi): - z = z_start - for j in range(block): - if j > 0: - z = z.feed(activation()) - z = z.feed(FC()) - if needs_sum: - z.feed(merger) - if needs_sum: - y = merger - else: - y = z - return y - -_mr_styles = dict( - lnorm=_mr_make_norm(LayerNorm), - batchless=_mr_batchless, - onelesssum=_mr_onelesssum, -) - -def multiresnet(x, width, depth, block=2, multi=1, - activation=Relu, style='batchless', - init=init_he_normal): - if style == 'cossim': - style = 'batchless' - DenseClass = CosineDense - else: - DenseClass = Dense - if style not in _mr_styles: - raise Exception('unknown resnet style', style) - - y = x - last_size = x.output_shape[0] - - for d in range(depth): - size = width - FC = lambda: DenseClass(size, init) - - if last_size != size: - y = y.feed(FC()) - - y = _mr_styles[style](y, width, depth, block, multi, activation, style, FC, d) - - last_size = size - - return y - -# Toy Data {{{1 - -inits = dict(he_normal=init_he_normal, he_uniform=init_he_uniform, - glorot_normal=init_glorot_normal, glorot_uniform=init_glorot_uniform, - gaussian_unit=init_gaussian_unit) -activations = dict(sigmoid=Sigmoid, tanh=Tanh, lecun=LeCunTanh, - relu=Relu, elu=Elu, gelu=GeluApprox, selu=Selu, - softplus=Softplus) - -def prettyize(data): - if isinstance(data, np.ndarray): - s = ', '.join(('{:8.2e}'.format(n) for n in data)) - s = '[' + s + ']' - else: - s = '{:8.2e}'.format(data) - return s - -def normalize_data(data, mean=None, std=None): - # in-place - if mean is None or std is None: - mean = np.mean(data, axis=0) - std = np.std(data, axis=0) - mean_str = prettyize(mean) - std_str = prettyize(std) - lament('nod(...,\n {},\n {})'.format(mean_str, std_str)) - sys.exit(1) - data -= _f(mean) - data /= _f(std) - -def toy_data(train_samples, valid_samples, problem=2): - total_samples = train_samples + valid_samples - - nod = normalize_data # shorthand to keep a sane indentation - - if problem == 0: - from ml.cie_mlp_data import inputs, outputs, valid_inputs, valid_outputs - inputs, outputs = _f(inputs), _f(outputs) - valid_inputs, valid_outputs = _f(valid_inputs), _f(valid_outputs) - - nod(inputs, 127.5, 73.9) - nod(outputs, 44.8, 21.7) - nod(valid_inputs, 127.5, 73.9) - nod(valid_outputs, 44.8, 21.7) - - elif problem == 1: - from sklearn.datasets import make_friedman1 - inputs, outputs = make_friedman1(total_samples) - inputs, outputs = _f(inputs), _f(outputs) - outputs = np.expand_dims(outputs, -1) - - nod(inputs, 0.5, 1/np.sqrt(12)) - nod(outputs, 14.4, 4.9) - - elif problem == 2: - from sklearn.datasets import make_friedman2 - inputs, outputs = make_friedman2(total_samples) - inputs, outputs = _f(inputs), _f(outputs) - outputs = np.expand_dims(outputs, -1) - - nod(inputs, - [5.00e+01, 9.45e+02, 5.01e-01, 5.98e+00], - [2.89e+01, 4.72e+02, 2.89e-01, 2.87e+00]) - - nod(outputs, [482], [380]) - - elif problem == 3: - from sklearn.datasets import make_friedman3 - inputs, outputs = make_friedman3(total_samples) - inputs, outputs = _f(inputs), _f(outputs) - outputs = np.expand_dims(outputs, -1) - - nod(inputs, - [4.98e+01, 9.45e+02, 4.99e-01, 6.02e+00], - [2.88e+01, 4.73e+02, 2.90e-01, 2.87e+00]) - - nod(outputs, [1.32327931], [0.31776295]) - - else: - raise Exception("unknown toy data set", problem) - - if problem != 0: - # split off a validation set - indices = np.arange(inputs.shape[0]) - 
np.random.shuffle(indices) - valid_inputs = inputs[indices][-valid_samples:] - valid_outputs = outputs[indices][-valid_samples:] - inputs = inputs[indices][:-valid_samples] - outputs = outputs[indices][:-valid_samples] - - return (inputs, outputs), (valid_inputs, valid_outputs) - -# Model Creation {{{1 - -def optim_from_config(config): - if config.optim == 'adam': - d1 = config.optim_decay1 if 'optim_decay1' in config else 9.5 - d2 = config.optim_decay2 if 'optim_decay2' in config else 999.5 - b1 = np.exp(-1/d1) - b2 = np.exp(-1/d2) - o = Nadam if config.nesterov else Adam - optim = o(b1=b1, b2=b2) - elif config.optim == 'ftml': - d1 = config.optim_decay1 if 'optim_decay1' in config else 2 - d2 = config.optim_decay2 if 'optim_decay2' in config else 999.5 - b1 = np.exp(-1/d1) - b2 = np.exp(-1/d2) - optim = FTML(b1=b1, b2=b2) - elif config.optim == 'yf': - d1 = config.optim_decay1 if 'optim_decay1' in config else 999.5 - d2 = config.optim_decay2 if 'optim_decay2' in config else 999.5 - if d1 != d2: - raise Exception("yellowfin only uses one decay term.") - beta = np.exp(-1/d1) - optim = YellowFin(beta=beta) - elif config.optim in ('ag', 'adagrad'): - optim = Adagrad() - elif config.optim in ('rms', 'rmsprop'): - d2 = config.optim_decay2 if 'optim_decay2' in config else 99.5 - mu = np.exp(-1/d2) - optim = RMSprop(mu=mu) - elif config.optim == 'rmsc': - d2 = config.optim_decay2 if 'optim_decay2' in config else 9.5 - mu = np.exp(-1/d2) - optim = RMSpropCentered(momentum=mu) - elif config.optim == 'sgd': - d1 = config.optim_decay1 if 'optim_decay1' in config else 0 - clip = config.gradient_clip if 'gradient_clip' in config else 0.0 - if d1 > 0 or clip > 0: - b1 = np.exp(-1/d1) if d1 > 0 else 0 - if clip > 0: - optim = MomentumClip(mu=b1, nesterov=config.nesterov, clip=clip) - else: - optim = Momentum(mu=b1, nesterov=config.nesterov) - else: - optim = Optimizer() - elif config.optim == 'ps': - d1 = config.optim_decay1 if 'optim_decay1' in config else 9.5 - b1 = np.exp(-1/d1) - optim = PowerSign(mu=b1) - elif config.optim == 'as': - d1 = config.optim_decay1 if 'optim_decay1' in config else 9.5 - b1 = np.exp(-1/d1) - optim = AddSign(mu=b1) - else: - raise Exception('unknown optimizer', config.optim) - - return optim - -def learner_from_config(config, optim, rscb): - if config.learner == 'sgdr': - expando = config.expando if 'expando' in config else None - learner = SGDR(optim, epochs=config.epochs, rate=config.learn, - restart_decay=config.restart_decay, restarts=config.restarts, - callback=rscb, expando=expando) - # final learning rate isn't of interest here; it's gonna be close to 0. - log('total epochs', learner.epochs) - elif config.learner in ('sin', 'sine'): - lower_rate = config.learn * 1e-5 # TODO: allow access to this. - epochs = config.epochs * (config.restarts + 1) - frequency = config.epochs - learner = SineCLR(optim, epochs=epochs, frequency=frequency, - upper_rate=config.learn, lower_rate=lower_rate, - callback=rscb) - elif config.learner == 'wave': - lower_rate = config.learn * 1e-5 # TODO: allow access to this. 
- epochs = config.epochs * (config.restarts + 1) - frequency = config.epochs - learner = WaveCLR(optim, epochs=epochs, frequency=frequency, - upper_rate=config.learn, lower_rate=lower_rate, - callback=rscb) - elif config.learner == 'anneal': - learner = AnnealingLearner(optim, epochs=config.epochs, rate=config.learn, - halve_every=config.learn_halve_every) - log("final learning rate", "{:10.8f}".format(learner.final_rate)) - elif config.learner == 'sgd': - learner = Learner(optim, epochs=config.epochs, rate=config.learn) - else: - raise Exception('unknown learner', config.learner) - - return learner - -def lookup_loss(maybe_name): - if isinstance(maybe_name, Loss): - return maybe_name - elif maybe_name == 'mse': - return Squared() - elif maybe_name == 'mshe': # mushy - return SquaredHalved() - elif maybe_name == 'mae': - return Absolute() - elif maybe_name == 'msee': - return SomethingElse() - elif maybe_name == 'huber': - return Huber(delta=0.1) - raise Exception('unknown objective', maybe_name) - -def ritual_from_config(config, learner): - if config.ritual == 'default': - ritual = Ritual(learner=learner) - elif config.ritual == 'stochm': - ritual = StochMRitual(learner=learner) - elif config.ritual == 'noisy': - ritual = NoisyRitual(learner=learner, - input_noise=1e-1, output_noise=1e-2, - gradient_noise=2e-7) - else: - raise Exception('unknown ritual', config.ritual) - - return ritual - -def model_from_config(config, input_features, output_features, callbacks=None): - init = inits[config.init] - activation = activations[config.activation] - - x = Input(shape=(input_features,)) - y = x - y = multiresnet(y, - config.res_width, config.res_depth, - config.res_block, config.res_multi, - activation=activation, init=init, - style=config.parallel_style) - if y.output_shape[0] != output_features: - y = y.feed(Dense(output_features, init)) - - loss = lookup_loss(config.loss) - mloss = lookup_loss(config.mloss) if config.mloss else loss - - model = Model(x, y, loss=loss, mloss=mloss, unsafe=config.unsafe) - - if config.fn_load is not None: - log('loading weights', config.fn_load) - model.load_weights(config.fn_load) - - optim = optim_from_config(config) - - def rscb(restart): - if callbacks: - callbacks.restart() - log("restarting", restart) - if config.restart_optim: - optim.reset() - - learner = learner_from_config(config, optim, rscb) - - ritual = ritual_from_config(config, learner) - - return model, learner, ritual - -# main program {{{1 - -def run(program, args=None): - args = args if args else [] - - lower_priority() - np.random.seed(42069) - - # Config {{{2 - - from dotmap import DotMap - config = DotMap( - fn_load = None, - fn_save = 'optim_nn.h5', - log_fn = 'losses.npz', - - # multi-residual network parameters - res_width = 28, - res_depth = 2, - res_block = 3, # normally 2 for plain resnet - res_multi = 2, # normally 1 for plain resnet - - # style of resnet (order of layers, which layers, etc.) - parallel_style = 'onelesssum', - activation = 'gelu', - - #optim = 'ftml', - #optim_decay1 = 2, - #optim_decay2 = 100, - optim = 'adam', # note: most features only implemented for Adam - optim_decay1 = 24, # first momentum given in batches (optional) - optim_decay2 = 100, # second momentum given in batches (optional) - nesterov = True, # not available for all optimizers. 
- batch_size = 64, - - # learning parameters - learner = 'sgdr', - learn = 0.00125, - epochs = 24, - learn_halve_every = 16, # only used with anneal - restarts = 4, - restart_decay = 0.25, # only used with SGDR - expando = lambda i: 24 * i, - - # misc - init = 'he_normal', - loss = 'mse', - mloss = 'mse', - ritual = 'default', - restart_optim = False, # restarts also reset internal state of optimizer - warmup = False, # train a couple epochs on gaussian noise and reset - - # logging/output - log10_loss = True, # personally, i'm sick of looking linear loss values! - #fancy_logs = True, # unimplemented (can't turn it off yet) - - problem = 2, - compare = ( - # best results for ~10,000 parameters - # training/validation pairs for each problem (starting from problem 0): - (10**-3.120, 10**-2.901), - # 1080 epochs on these... - (10**-6.747, 10**-6.555), - (10**-7.774, 10**-7.626), - (10**-6.278, 10**-5.234), # overfitting? bad valid set? - ), - - unsafe = True, # aka gotta go fast mode - ) - - for k in ['parallel_style', 'activation', 'optim', 'learner', - 'init', 'loss', 'mloss', 'ritual']: - config[k] = config[k].lower() - - config.learn *= np.sqrt(config.batch_size) - - config.pprint() - - # Toy Data {{{2 - - (inputs, outputs), (valid_inputs, valid_outputs) = \ - toy_data(2**14, 2**11, problem=config.problem) - input_features = inputs.shape[-1] - output_features = outputs.shape[-1] - - # Our Test Model - - callbacks = Dummy() - - model, learner, ritual = \ - model_from_config(config, input_features, output_features, callbacks) - - # Model Information {{{2 - - model.print_graph() - log('parameters', model.param_count) - - # Training {{{2 - - batch_losses = [] - train_losses = [] - valid_losses = [] - - def measure_error(): - def print_error(name, inputs, outputs, comparison=None): - predicted = model.evaluate(inputs) - err = model.mloss.forward(predicted, outputs) - if config.log10_loss: - print(name, "{:12.6e}".format(err)) - if comparison: - err10 = np.log10(err) - cmp10 = np.log10(comparison) - color = '\x1B[31m' if err10 > cmp10 else '\x1B[32m' - log(name + " log10-loss", "{:+6.3f} {}({:+6.3f})\x1B[0m".format(err10, color, err10 - cmp10)) - else: - log(name + " log10-loss", "{:+6.3f}".format(err, np.log10(err))) - else: - log(name + " loss", "{:12.6e}".format(err)) - if comparison: - fmt = "10**({:+7.4f}) times" - log("improvement", fmt.format(np.log10(comparison / err))) - return err - - train_err = print_error("train", - inputs, outputs, - config.compare[config.problem][0]) - valid_err = print_error("valid", - valid_inputs, valid_outputs, - config.compare[config.problem][1]) - train_losses.append(train_err) - valid_losses.append(valid_err) - - callbacks.restart = measure_error - - training = config.epochs > 0 and config.restarts >= 0 - - ritual.prepare(model) - - if training and config.warmup and not config.fn_load: - log("warming", "up") - - # use plain SGD in warmup to prevent (or possibly cause?) 
numeric issues - temp_optim = learner.optim - temp_loss = model.loss - learner.optim = MomentumClip(lr=0.01, mu=0) - ritual.loss = Absolute() # less likely to blow up; more general - - # NOTE: experiment: trying const batches and batch_size - bs = 256 - target = 1 * 1024 * 1024 - # 4 being sizeof(float) - batches = (target / 4 / np.prod(inputs.shape[1:])) // bs * bs - ins = [int(batches)] + list( inputs.shape[1:]) - outs = [int(batches)] + list(outputs.shape[1:]) - - for _ in range(4): - ritual.train_batched( - np.random.normal(size=ins), - np.random.normal(size=outs), - batch_size=bs) - ritual.reset() - - learner.optim = temp_optim - model.loss = temp_loss - - if training: - measure_error() - - while training and learner.next(): - avg_loss, losses = ritual.train_batched( - inputs, outputs, - config.batch_size, - return_losses=True) - batch_losses += losses - - if config.log10_loss: - fmt = "epoch {:4.0f}, rate {:10.8f}, log10-loss {:+6.3f}" - log("info", fmt.format(learner.epoch, learner.rate, np.log10(avg_loss)), - update=True) - else: - fmt = "epoch {:4.0f}, rate {:10.8f}, loss {:12.6e}" - log("info", fmt.format(learner.epoch, learner.rate, avg_loss), - update=True) - - measure_error() - - if training and config.fn_save is not None: - log('saving weights', config.fn_save) - model.save_weights(config.fn_save, overwrite=True) - - if training and config.log_fn is not None: - log('saving losses', config.log_fn) - np.savez_compressed(config.log_fn, - batch_losses=np.array(batch_losses, dtype=_f), - train_losses=np.array(train_losses, dtype=_f), - valid_losses=np.array(valid_losses, dtype=_f)) - - # Evaluation {{{2 - # TODO: write this portion again - - return 0 - -# run main program {{{1 - -if __name__ == '__main__': - sys.exit(run(sys.argv[0], sys.argv[1:])) diff --git a/onn/__init__.py b/onn/__init__.py new file mode 100644 index 0000000..19d6020 --- /dev/null +++ b/onn/__init__.py @@ -0,0 +1,28 @@ +# external packages required for full functionality: +# numpy scipy h5py sklearn dotmap + +# BIG TODO: ensure numpy isn't upcasting to float64 *anywhere*. +# this is gonna take some work. + +from .activation import * +from .floats import * +from .initialization import * +from .layer import * +from .learner import * +from .loss import * +from .math import * +from .model import * +from .nodes import * +from .optimizer import * +from .parametric import * +from .regularizer import * +from .ritual import * +from .util import * +from .weights import * + +# this is similar to default behaviour of having no __all__ variable at all, +# but ours ignores modules as well. this allows for `import sys` and such +# without clobbering `from our_module import *`. +__all__ = [ + o for o in locals() + if type(o) != 'module' and not o.startswith('_')] diff --git a/onn/activation.py b/onn/activation.py new file mode 100644 index 0000000..7bba491 --- /dev/null +++ b/onn/activation.py @@ -0,0 +1,185 @@ +import numpy as np + +# just for speed, not strictly essential: +from scipy.special import expit as sigmoid + +from .floats import * +from .layer_base import * + +class Identity(Layer): + def forward(self, X): + return X + + def backward(self, dY): + return dY + +class Sigmoid(Layer): # aka Logistic, Expit (inverse of Logit) + def forward(self, X): + self.sig = sigmoid(X) + return self.sig + + def backward(self, dY): + return dY * self.sig * (1 - self.sig) + +class Softplus(Layer): + # integral of Sigmoid. 
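+    # f(x) = log(1 + exp(x)); its derivative is sigmoid(x).
+    # note: np.exp overflows to inf for large X here; if that matters,
+    # np.logaddexp(0, X) is a numerically safer (if slower) equivalent.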
+ + def forward(self, X): + self.X = X + return np.log(1 + np.exp(X)) + + def backward(self, dY): + return dY * sigmoid(self.X) + +class Tanh(Layer): + def forward(self, X): + self.sig = np.tanh(X) + return self.sig + + def backward(self, dY): + return dY * (1 - self.sig * self.sig) + +class LeCunTanh(Layer): + # paper: http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf + # paper: http://yann.lecun.com/exdb/publis/pdf/lecun-89.pdf + # scaled such that f([-1, 1]) = [-1, 1]. + # helps preserve an input variance of 1. + # second derivative peaks around an input of ±1. + + def forward(self, X): + self.sig = np.tanh(2 / 3 * X) + return 1.7159 * self.sig + + def backward(self, dY): + return dY * (2 / 3 * 1.7159) * (1 - self.sig * self.sig) + +class Relu(Layer): + def forward(self, X): + self.cond = X >= 0 + return np.where(self.cond, X, 0) + + def backward(self, dY): + return np.where(self.cond, dY, 0) + +class Elu(Layer): + # paper: https://arxiv.org/abs/1511.07289 + + def __init__(self, alpha=1): + super().__init__() + self.alpha = _f(alpha) # FIXME: unused + + def forward(self, X): + self.cond = X >= 0 + self.neg = np.exp(X) - 1 + return np.where(self.cond, X, self.neg) + + def backward(self, dY): + return dY * np.where(self.cond, 1, self.neg + 1) + +class GeluApprox(Layer): + # paper: https://arxiv.org/abs/1606.08415 + # plot: https://www.desmos.com/calculator/ydzgtccsld + + def forward(self, X): + self.a = 1.704 * X + self.sig = sigmoid(self.a) + return X * self.sig + + def backward(self, dY): + return dY * self.sig * (1 + self.a * (1 - self.sig)) + +class Softmax(Layer): + def forward(self, X): + alpha = np.max(X, axis=-1, keepdims=True) + num = np.exp(X - alpha) + den = np.sum(num, axis=-1, keepdims=True) + self.sm = num / den + return self.sm + + def backward(self, dY): + return (dY - np.sum(dY * self.sm, axis=-1, keepdims=True)) * self.sm + +class LogSoftmax(Softmax): + def __init__(self, eps=1e-6): + super().__init__() + self.eps = _f(eps) + + def forward(self, X): + return np.log(super().forward(X) + self.eps) + + def backward(self, dY): + return dY - np.sum(dY, axis=-1, keepdims=True) * self.sm + +class Cos(Layer): + # performs well on MNIST for some strange reason. + + def forward(self, X): + self.X = X + return np.cos(X) + + def backward(self, dY): + return dY * -np.sin(self.X) + +class Selu(Layer): + # paper: https://arxiv.org/abs/1706.02515 + + def __init__(self, alpha=1.67326324, lamb=1.05070099): + super().__init__() + self.alpha = _f(alpha) + self.lamb = _f(lamb) + + def forward(self, X): + self.cond = X >= 0 + self.neg = self.alpha * np.exp(X) + return self.lamb * np.where(self.cond, X, self.neg - self.alpha) + + def backward(self, dY): + return dY * self.lamb * np.where(self.cond, 1, self.neg) + +# more + +class TanhTest(Layer): + def forward(self, X): + self.sig = np.tanh(1 / 2 * X) + return 2.4004 * self.sig + + def backward(self, dY): + return dY * (1 / 2 * 2.4004) * (1 - self.sig * self.sig) + +class ExpGB(Layer): + # an output layer for one-hot classification problems. + # use with MSE (SquaredHalved), not CategoricalCrossentropy! + # paper: https://arxiv.org/abs/1707.04199 + + def __init__(self, alpha=0.1, beta=0.0): + super().__init__() + self.alpha = _f(alpha) + self.beta = _f(beta) + + def forward(self, X): + return self.alpha * np.exp(X) + self.beta + + def backward(self, dY): + # this gradient is intentionally incorrect. + return dY + +class CubicGB(Layer): + # an output layer for one-hot classification problems. 
+ # use with MSE (SquaredHalved), not CategoricalCrossentropy! + # paper: https://arxiv.org/abs/1707.04199 + # note: in the paper, it's called pow3GB, which is ugly. + + def __init__(self, alpha=0.1, beta=0.0): + # note: the paper suggests defaults of 0.001 and 0.0, + # but these didn't seem to work as well in my limited testing. + super().__init__() + self.alpha = _f(alpha) + self.beta = _f(beta) + + def forward(self, X): + return self.alpha * X**3 + self.beta + + def backward(self, dY): + # this gradient is intentionally incorrect. + return dY + diff --git a/onn/floats.py b/onn/floats.py new file mode 100644 index 0000000..7d3b2ca --- /dev/null +++ b/onn/floats.py @@ -0,0 +1,18 @@ +import numpy as np + +_f = np.float32 + +def _check(a): + assert isinstance(a, np.ndarray) or type(a) == _f, type(a) + assert a.dtype == _f, a.dtype + return a + +_0 = _f(0) +_1 = _f(1) +_2 = _f(2) +_inv2 = _f(1/2) +_sqrt2 = _f(np.sqrt(2)) +_invsqrt2 = _f(1/np.sqrt(2)) +_pi = _f(np.pi) + +__all__ = [o for o in locals()] diff --git a/onn/initialization.py b/onn/initialization.py new file mode 100644 index 0000000..46916c5 --- /dev/null +++ b/onn/initialization.py @@ -0,0 +1,31 @@ +import numpy as np + +# note: these are currently only implemented for 2D shapes. + +def init_zeros(size, ins=None, outs=None): + return np.zeros(size) + +def init_ones(size, ins=None, outs=None): + return np.ones(size) + +def init_he_normal(size, ins, outs): + s = np.sqrt(2 / ins) + return np.random.normal(0, s, size=size) + +def init_he_uniform(size, ins, outs): + s = np.sqrt(6 / ins) + return np.random.uniform(-s, s, size=size) + +def init_glorot_normal(size, ins, outs): + s = np.sqrt(2 / (ins + outs)) + return np.random.normal(0, s, size=size) + +def init_glorot_uniform(size, ins, outs): + s = np.sqrt(6 / (ins + outs)) + return np.random.uniform(-s, s, size=size) + +# more + +def init_gaussian_unit(size, ins, outs): + s = np.sqrt(1 / ins) + return np.random.normal(0, s, size=size) diff --git a/onn/layer.py b/onn/layer.py new file mode 100644 index 0000000..e1c9342 --- /dev/null +++ b/onn/layer.py @@ -0,0 +1,200 @@ +from .layer_base import * +from .initialization import * +from .floats import * + +# Nonparametric Layers {{{1 + +class Input(Layer): + def __init__(self, shape): + assert shape is not None + super().__init__() + self.shape = tuple(shape) + self.input_shape = self.shape + self.output_shape = self.shape + + def forward(self, X): + return X + + def backward(self, dY): + #self.dY = dY + return np.zeros_like(dY) + +class Reshape(Layer): + def __init__(self, new_shape): + super().__init__() + self.shape = tuple(new_shape) + self.output_shape = self.shape + + def forward(self, X): + self.batch_size = X.shape[0] + return X.reshape(self.batch_size, *self.output_shape) + + def backward(self, dY): + assert dY.shape[0] == self.batch_size + return dY.reshape(self.batch_size, *self.input_shape) + +class Flatten(Layer): + def make_shape(self, parent): + shape = parent.output_shape + self.input_shape = shape + self.output_shape = (np.prod(shape),) + + def forward(self, X): + self.batch_size = X.shape[0] + return X.reshape(self.batch_size, *self.output_shape) + + def backward(self, dY): + assert dY.shape[0] == self.batch_size + return dY.reshape(self.batch_size, *self.input_shape) + +class ConstAffine(Layer): + def __init__(self, a=1, b=0): + super().__init__() + self.a = _f(a) + self.b = _f(b) + + def forward(self, X): + return self.a * X + self.b + + def backward(self, dY): + return dY * self.a + +class Sum(Layer): + def 
_propagate(self, edges, deterministic): + return np.sum(edges, axis=0) + + def _backpropagate(self, edges): + #assert len(edges) == 1, "unimplemented" + return edges[0] # TODO: does this always work? + +class ActivityRegularizer(Layer): + def __init__(self, reg): + super().__init__() + assert isinstance(reg, Regularizer), reg + self.reg = reg + + def forward(self, X): + self.X = X + self.loss = np.sum(self.reg.forward(X)) + return X + + def backward(self, dY): + return dY + self.reg.backward(self.X) + +class Dropout(Layer): + def __init__(self, dropout=0.0): + super().__init__() + self.p = _f(1 - dropout) + assert 0 <= self.p <= 1 + + def forward(self, X): + self.mask = (np.random.rand(*X.shape) < self.p) / self.p + return X * self.mask + + def forward_deterministic(self, X): + #self.mask = _1 + return X + + def backward(self, dY): + return dY * self.mask + +# more + +class AlphaDropout(Layer): + # to be used alongside Selu activations. + # paper: https://arxiv.org/abs/1706.02515 + + def __init__(self, dropout=0.0, alpha=1.67326324, lamb=1.05070099): + super().__init__() + self.alpha = _f(alpha) + self.lamb = _f(lamb) + self.saturated = -self.lamb * self.alpha + self.dropout = _f(dropout) + + @property + def dropout(self): + return self._dropout + + @dropout.setter + def dropout(self, x): + self._dropout = _f(x) + self.q = 1 - self._dropout + assert 0 <= self.q <= 1 + + sat = self.saturated + + self.a = 1 / np.sqrt(self.q + sat * sat * self.q * self._dropout) + self.b = -self.a * (self._dropout * sat) + + def forward(self, X): + self.mask = np.random.rand(*X.shape) < self.q + return self.a * np.where(self.mask, X, self.saturated) + self.b + + def forward_deterministic(self, X): + return X + + def backward(self, dY): + return dY * self.a * self.mask + +class Decimate(Layer): + # simple decimaton layer that drops every other sample from the last axis. + + def __init__(self, phase='even'): + super().__init__() + # phase is the set of samples we keep in the forward pass. + assert phase in ('even', 'odd'), phase + self.phase = phase + + def make_shape(self, parent): + shape = parent.output_shape + self.input_shape = shape + divy = (shape[-1] + 1) // 2 if self.phase == 'even' else shape[-1] // 2 + self.output_shape = tuple(list(shape[:-1]) + [divy]) + self.dX = np.zeros(self.input_shape, dtype=_f) + + def forward(self, X): + self.batch_size = X.shape[0] + if self.phase == 'even': + return X.ravel()[0::2].reshape(self.batch_size, *self.output_shape) + elif self.phase == 'odd': + return X.ravel()[1::2].reshape(self.batch_size, *self.output_shape) + + def backward(self, dY): + assert dY.shape[0] == self.batch_size + dX = np.zeros((self.batch_size, *self.input_shape), dtype=_f) + if self.phase == 'even': + dX.ravel()[0::2] = dY.ravel() + elif self.phase == 'odd': + dX.ravel()[1::2] = dY.ravel() + return dX + +class Undecimate(Layer): + # inverse operation of Decimate. not quite interpolation. + + def __init__(self, phase='even'): + super().__init__() + # phase is the set of samples we keep in the backward pass. 
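+        # the forward pass zero-stuffs: the input lands in the even (or odd)
+        # slots of an output twice as long, with zeros everywhere else.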
+ assert phase in ('even', 'odd'), phase + self.phase = phase + + def make_shape(self, parent): + shape = parent.output_shape + self.input_shape = shape + mult = shape[-1] * 2 + self.output_shape = tuple(list(shape[:-1]) + [mult]) + + def forward(self, X): + self.batch_size = X.shape[0] + Y = np.zeros((self.batch_size, *self.output_shape), dtype=_f) + if self.phase == 'even': + Y.ravel()[0::2] = X.ravel() + elif self.phase == 'odd': + Y.ravel()[1::2] = X.ravel() + return Y + + def backward(self, dY): + assert dY.shape[0] == self.batch_size + if self.phase == 'even': + return dY.ravel()[0::2].reshape(self.batch_size, *self.input_shape) + elif self.phase == 'odd': + return dY.ravel()[1::2].reshape(self.batch_size, *self.input_shape) diff --git a/onn/layer_base.py b/onn/layer_base.py new file mode 100644 index 0000000..a8cc253 --- /dev/null +++ b/onn/layer_base.py @@ -0,0 +1,143 @@ +import numpy as np + +from collections import defaultdict, OrderedDict + +from .weights import * + +# used for numbering layers like Keras: +_layer_counters = defaultdict(lambda: 0) + +class LayerIncompatibility(Exception): + pass + +class Layer: + def __init__(self): + self.parents = [] + self.children = [] + self.weights = OrderedDict() + self.loss = None # for activity regularizers + self.input_shape = None + self.output_shape = None + kind = self.__class__.__name__ + global _layer_counters + _layer_counters[kind] += 1 + self.name = "{}_{}".format(kind, _layer_counters[kind]) + self.unsafe = False # disables assertions for better performance + self.shared = False # as in weight sharing + + def __str__(self): + return self.name + + # methods we might want to override: + + def forward(self, X): + raise NotImplementedError("unimplemented", self) + + def forward_deterministic(self, X): + return self.forward(X) + + def backward(self, dY): + raise NotImplementedError("unimplemented", self) + + def make_shape(self, parent): + if self.input_shape == None: + self.input_shape = parent.output_shape + if self.output_shape == None: + self.output_shape = self.input_shape + + def do_feed(self, child): + self.children.append(child) + + def be_fed(self, parent): + self.parents.append(parent) + + # TODO: better names for these (still) + def _propagate(self, edges, deterministic): + if not self.unsafe: + assert len(edges) == 1, self + if deterministic: + return self.forward_deterministic(edges[0]) + else: + return self.forward(edges[0]) + + def _backpropagate(self, edges): + if len(edges) == 1: + return self.backward(edges[0]) + return sum((self.backward(dY) for dY in edges)) + + # general utility methods: + + def is_compatible(self, parent): + return np.all(self.input_shape == parent.output_shape) + + def feed(self, child): + assert self.output_shape is not None, self + child.make_shape(self) + if not child.is_compatible(self): + fmt = "{} is incompatible with {}: shape mismatch: {} vs. {}" + raise LayerIncompatibility(fmt.format(self, child, self.output_shape, child.input_shape)) + self.do_feed(child) + child.be_fed(self) + return child + + def validate_input(self, X): + assert X.shape[1:] == self.input_shape, (str(self), X.shape[1:], self.input_shape) + + def validate_output(self, Y): + assert Y.shape[1:] == self.output_shape, (str(self), Y.shape[1:], self.output_shape) + + def _new_weights(self, name, **kwargs): + w = Weights(**kwargs) + assert name not in self.weights, name + self.weights[name] = w + return w + + def share(self, node): + self.weights = node.weights # TODO: this should be all it takes. 
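+        # both layers end up referencing the same Weights objects,
+        # so their values and gradients are shared.
+        # e.g. dense_b.share(dense_a) makes dense_b reuse dense_a's weights.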
+ for k, v in self.weights.items(): + vs = getattr(node, k) # hack: key isn't necessarily attribute name! + setattr(self, k, vs) + self.shared = True + + def clear_grad(self): + for name, w in self.weights.items(): + w.g[:] = 0 + + @property + def size(self): + return sum((w.size for w in self.weights.values())) + + def init(self, allocator): + ins, outs = self.input_shape[0], self.output_shape[0] + for k, w in self.weights.items(): + w.allocate(ins, outs, allocator=allocator) + + def propagate(self, values, deterministic): + if not self.unsafe: + assert self.parents, self + edges = [] + for parent in self.parents: + if parent in values: + X = values[parent] + if not self.unsafe: + self.validate_input(X) + edges.append(X) + Y = self._propagate(edges, deterministic) + if not self.unsafe: + self.validate_output(Y) + return Y + + def backpropagate(self, values): + if not self.unsafe: + assert self.children, self + edges = [] + for child in self.children: + if child in values: + dY = values[child] + if not self.unsafe: + self.validate_output(dY) + edges.append(dY) + dX = self._backpropagate(edges) + if not self.unsafe: + self.validate_input(dX) + return dX diff --git a/onn/learner.py b/onn/learner.py new file mode 100644 index 0000000..f60dc99 --- /dev/null +++ b/onn/learner.py @@ -0,0 +1,180 @@ +from .floats import * +from .optimizer_base import * + +class Learner: + per_batch = False + + def __init__(self, optim, epochs=100, rate=None): + assert isinstance(optim, Optimizer) + self.optim = optim + self.start_rate = rate # None is okay; it'll use optim.lr instead. + self.epochs = int(epochs) + self.reset() + + def reset(self, optim=False): + self.started = False + self.epoch = 0 + if optim: + self.optim.reset() + + @property + def epoch(self): + return self._epoch + + @epoch.setter + def epoch(self, new_epoch): + self._epoch = int(new_epoch) + if 0 <= self.epoch <= self.epochs: + self.rate = self.rate_at(self._epoch) + + @property + def rate(self): + return self.optim.lr + + @rate.setter + def rate(self, new_rate): + self.optim.lr = new_rate + + def rate_at(self, epoch): + if self.start_rate is None: + return self.optim.lr + return self.start_rate + + def next(self): + # prepares the next epoch. returns whether or not to continue training. + if not self.started: + self.started = True + self.epoch += 1 + if self.epoch > self.epochs: + return False + return True + + def batch(self, progress): # TODO: rename + # interpolates rates between epochs. + # unlike epochs, we do not store batch number as a state. + # i.e. calling next() will not respect progress. + assert 0 <= progress <= 1 + self.rate = self.rate_at(self._epoch + progress) + + @property + def final_rate(self): + return self.rate_at(self.epochs - 1e-8) + +class AnnealingLearner(Learner): + def __init__(self, optim, epochs=100, rate=None, halve_every=10): + self.halve_every = _f(halve_every) + self.anneal = _f(0.5**(1/self.halve_every)) + super().__init__(optim, epochs, rate) + + def rate_at(self, epoch): + return super().rate_at(epoch) * self.anneal**epoch + +def cosmod(x): + # plot: https://www.desmos.com/calculator/hlgqmyswy2 + return (_1 + np.cos((x % _1) * _pi)) * _inv2 + +class SGDR(Learner): + # Stochastic Gradient Descent with Restarts + # paper: https://arxiv.org/abs/1608.03983 + # NOTE: this is missing a couple of the proposed features. 
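+    # roughly, the schedule this implements works out to
+    #   rate = base_rate * restart_decay**n * (1 + cos(pi * x)) / 2
+    # where n is the number of completed restarts and x is the fraction
+    # of the current restart period that has elapsed.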
+ + per_batch = True + + def __init__(self, optim, epochs=100, rate=None, + restarts=0, restart_decay=0.5, callback=None, + expando=0): + self.restart_epochs = int(epochs) + self.decay = _f(restart_decay) + self.restarts = int(restarts) + self.restart_callback = callback + + # TODO: rename expando to something not insane + self.expando = expando if expando is not None else lambda i: i + if type(self.expando) == int: + inc = self.expando + self.expando = lambda i: i * inc + + self.splits = [] + epochs = 0 + for i in range(0, self.restarts + 1): + split = epochs + self.restart_epochs + int(self.expando(i)) + self.splits.append(split) + epochs = split + super().__init__(optim, epochs, rate) + + def split_num(self, epoch): + previous = [0] + self.splits + for i, split in enumerate(self.splits): + if epoch - 1 < split: + sub_epoch = epoch - previous[i] + next_restart = split - previous[i] + return i, sub_epoch, next_restart + raise Exception('this should never happen.') + + def rate_at(self, epoch): + base_rate = self.start_rate if self.start_rate is not None else self.optim.lr + restart, sub_epoch, next_restart = self.split_num(max(1, epoch)) + x = _f(sub_epoch - 1) / _f(next_restart) + return base_rate * self.decay**_f(restart) * cosmod(x) + + def next(self): + if not super().next(): + return False + restart, sub_epoch, next_restart = self.split_num(self.epoch) + if restart > 0 and sub_epoch == 1: + if self.restart_callback is not None: + self.restart_callback(restart) + return True + +class TriangularCLR(Learner): + per_batch = True + + def __init__(self, optim, epochs=400, upper_rate=None, lower_rate=0, + frequency=100, callback=None): + # NOTE: start_rate is treated as upper_rate + self.frequency = int(frequency) + assert self.frequency > 0 + self.callback = callback + self.lower_rate = _f(lower_rate) + super().__init__(optim, epochs, upper_rate) + + def _t(self, epoch): + # NOTE: this could probably be simplified + offset = self.frequency / 2 + return np.abs(((epoch - 1 + offset) % self.frequency) - offset) / offset + + def rate_at(self, epoch): + upper_rate = self.start_rate if self.start_rate is not None else self.optim.lr + return self._t(epoch) * (upper_rate - self.lower_rate) + self.lower_rate + + def next(self): + if not super().next(): + return False + e = self.epoch - 1 + if e > 0 and e % self.frequency == 0: + if self.callback is not None: + self.callback(self.epoch // self.frequency) + return True + +class SineCLR(TriangularCLR): + def _t(self, epoch): + return np.sin(_pi * _inv2 * super()._t(epoch)) + +class WaveCLR(TriangularCLR): + def _t(self, epoch): + return _inv2 * (_1 - np.cos(_pi * super()._t(epoch))) + +# more + +class PolyLearner(Learner): + per_batch = True + + def __init__(self, optim, epochs=400, coeffs=(1,)): + self.coeffs = tuple(coeffs) + super().__init__(optim, epochs, np.polyval(self.coeffs, 0)) + + def rate_at(self, epoch): + progress = (epoch - 1) / (self.epochs) + ret = np.polyval(self.coeffs, progress) + return np.abs(ret) + diff --git a/onn/loss.py b/onn/loss.py new file mode 100644 index 0000000..94f8cb1 --- /dev/null +++ b/onn/loss.py @@ -0,0 +1,129 @@ +import numpy as np + +from .floats import * + +class Loss: + def forward(self, p, y): + raise NotImplementedError("unimplemented", self) + + def backward(self, p, y): + raise NotImplementedError("unimplemented", self) + +class NLL(Loss): # Negative Log Likelihood + def forward(self, p, y): + correct = p * y + return np.mean(-correct) + + def backward(self, p, y): + return -y / len(p) + +class 
CategoricalCrossentropy(Loss): + # lifted from theano + + def __init__(self, eps=1e-6): + self.eps = _f(eps) + + def forward(self, p, y): + p = np.clip(p, self.eps, 1 - self.eps) + f = np.sum(-y * np.log(p) - (1 - y) * np.log(1 - p), axis=-1) + return np.mean(f) + + def backward(self, p, y): + p = np.clip(p, self.eps, 1 - self.eps) + df = (p - y) / (p * (1 - p)) + return df / len(y) + +class Accuracy(Loss): + # returns percentage of categories correctly predicted. + # utilizes argmax(), so it cannot be used for gradient descent. + # use CategoricalCrossentropy or NLL for that instead. + + def forward(self, p, y): + correct = np.argmax(p, axis=-1) == np.argmax(y, axis=-1) + return np.mean(correct) + + def backward(self, p, y): + raise NotImplementedError("cannot take the gradient of Accuracy") + +class ResidualLoss(Loss): + def forward(self, p, y): + return np.mean(self.f(p - y)) + + def backward(self, p, y): + ret = self.df(p - y) / len(y) + return ret + +class SquaredHalved(ResidualLoss): + def f(self, r): + return np.square(r) / 2 + + def df(self, r): + return r + +class Squared(ResidualLoss): + def f(self, r): + return np.square(r) + + def df(self, r): + return 2 * r + +class Absolute(ResidualLoss): + def f(self, r): + return np.abs(r) + + def df(self, r): + return np.sign(r) + +class Huber(ResidualLoss): + def __init__(self, delta=1.0): + self.delta = _f(delta) + + def f(self, r): + return np.where(r <= self.delta, + np.square(r) / 2, + self.delta * (np.abs(r) - self.delta / 2)) + + def df(self, r): + return np.where(r <= self.delta, + r, + self.delta * np.sign(r)) + +# more + +class SomethingElse(ResidualLoss): + # generalizes Absolute and SquaredHalved. + # plot: https://www.desmos.com/calculator/fagjg9vuz7 + def __init__(self, a=4/3): + assert 1 <= a <= 2, "parameter out of range" + self.a = _f(a / 2) + self.b = _f(2 / a) + self.c = _f(2 / a - 1) + + def f(self, r): + return self.a * np.abs(r)**self.b + + def df(self, r): + return np.sign(r) * np.abs(r)**self.c + +class Confidence(Loss): + # this isn't "confidence" in any meaningful way; (e.g. Bayesian) + # it's just a metric of how large the value is of the predicted class. + # when using it for loss, it acts like a crappy regularizer. + # it really just measures how much of a hot-shot the network thinks it is. + + def forward(self, p, y=None): + categories = p.shape[-1] + confidence = (np.max(p, axis=-1) - 1/categories) / (1 - 1/categories) + # the exponent in softmax puts a maximum on confidence, + # but we don't compensate for that. if necessary, + # it'd be better to use an activation that doesn't have this limit. + return np.mean(confidence) + + def backward(self, p, y=None): + # in order to agree with the forward pass, + # using this backwards pass as-is will minimize confidence. + categories = p.shape[-1] + detc = p / categories / (1 - 1/categories) + dmax = p == np.max(p, axis=-1, keepdims=True) + return detc * dmax + diff --git a/onn/math.py b/onn/math.py new file mode 100644 index 0000000..794dfe6 --- /dev/null +++ b/onn/math.py @@ -0,0 +1,14 @@ +import numpy as np + +def rolling(a, window): + # http://stackoverflow.com/a/4924433 + shape = (a.size - window + 1, window) + strides = (a.itemsize, a.itemsize) + return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) + +def rolling_batch(a, window): + # same as rolling, but acts on each batch (axis 0). 
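+    # e.g. for a float32 input of shape (1, 5) and window=3, the result
+    # has shape (1, 3, 3), and row i of the output is input[0, i:i+3];
+    # no data is copied, since this is just a strided view.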
+ shape = (a.shape[0], a.shape[-1] - window + 1, window) + strides = (np.prod(a.shape[1:]) * a.itemsize, a.itemsize, a.itemsize) + return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) + diff --git a/onn/model.py b/onn/model.py new file mode 100644 index 0000000..bc5f68c --- /dev/null +++ b/onn/model.py @@ -0,0 +1,197 @@ +import sys + +from .floats import * +from .nodes import * +from .layer_base import * + +class Model: + def __init__(self, nodes_in, nodes_out, loss=None, mloss=None, unsafe=False): + self.loss = loss if loss is not None else SquaredHalved() + self.mloss = mloss if mloss is not None else loss + + nodes_in = [nodes_in] if isinstance(nodes_in, Layer) else nodes_in + nodes_out = [nodes_out] if isinstance(nodes_out, Layer) else nodes_out + assert type(nodes_in) == list, type(nodes_in) + assert type(nodes_out) == list, type(nodes_out) + self.nodes_in = nodes_in + self.nodes_out = nodes_out + + self.nodes = traverse_all(self.nodes_in, self.nodes_out) + self.make_weights() + for node in self.nodes: + node.unsafe = unsafe + # TODO: handle the same layer being in more than one node. + + @property + def ordered_nodes(self): + # deprecated? we don't guarantee an order like we did before. + return self.nodes + + def make_weights(self): + self.param_count = sum((node.size for node in self.nodes if not node.shared)) + self.W = np.zeros(self.param_count, dtype=_f) + self.dW = np.zeros(self.param_count, dtype=_f) + + offset = 0 + for node in self.nodes: + if node.size > 0 and not node.shared: + inner_offset = 0 + + def allocate(size): + nonlocal inner_offset + o = offset + inner_offset + ret = self.W[o:o+size], self.dW[o:o+size] + inner_offset += size + assert len(ret[0]) == len(ret[1]) + assert size == len(ret[0]), (size, len(ret[0])) + return ret + + node.init(allocate) + assert inner_offset <= node.size, "Layer {} allocated more weights than it said it would".format(node) + # i don't care if "less" is grammatically incorrect. + # you're mom is grammatically incorrect. + assert inner_offset >= node.size, "Layer {} allocated less weights than it said it would".format(node) + offset += node.size + + def evaluate(self, input_, deterministic=True): + assert len(self.nodes_in) == 1, "ambiguous input in multi-input network; use evaluate_multi() instead" + assert len(self.nodes_out) == 1, "ambiguous output in multi-output network; use evaluate_multi() instead" + node_in = self.nodes_in[0] + node_out = self.nodes_out[0] + outputs = self.evaluate_multi({node_in: input_}, deterministic) + return outputs[node_out] + + def apply(self, error): # TODO: better name? 
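+        # performs a full backward pass: the given loss gradient is
+        # backpropagated through every node, weight gradients accumulate
+        # into self.dW as a side-effect, and the gradient with respect
+        # to the network's input is returned.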
+ assert len(self.nodes_in) == 1, "ambiguous input in multi-input network; use apply_multi() instead" + assert len(self.nodes_out) == 1, "ambiguous output in multi-output network; use apply_multi() instead" + node_in = self.nodes_in[0] + node_out = self.nodes_out[0] + inputs = self.apply_multi({node_out: error}) + return inputs[node_in] + + def evaluate_multi(self, inputs, deterministic=True): + values = dict() + outputs = dict() + for node in self.nodes: + if node in self.nodes_in: + assert node in inputs, "missing input for node {}".format(node.name) + X = inputs[node] + values[node] = node._propagate(np.expand_dims(X, 0), deterministic) + else: + values[node] = node.propagate(values, deterministic) + if node in self.nodes_out: + outputs[node] = values[node] + return outputs + + def apply_multi(self, outputs): + values = dict() + inputs = dict() + for node in reversed(self.nodes): + if node in self.nodes_out: + assert node in outputs, "missing output for node {}".format(node.name) + X = outputs[node] + values[node] = node._backpropagate(np.expand_dims(X, 0)) + else: + values[node] = node.backpropagate(values) + if node in self.nodes_in: + inputs[node] = values[node] + return inputs + + def forward(self, inputs, outputs, measure=False, deterministic=False): + predicted = self.evaluate(inputs, deterministic=deterministic) + if measure: + error = self.mloss.forward(predicted, outputs) + else: + error = self.loss.forward(predicted, outputs) + return error, predicted + + def backward(self, predicted, outputs, measure=False): + if measure: + error = self.mloss.backward(predicted, outputs) + else: + error = self.loss.backward(predicted, outputs) + # input_delta is rarely useful; it's just to match the forward pass. + input_delta = self.apply(error) + return self.dW, input_delta + + def clear_grad(self): + for node in self.nodes: + node.clear_grad() + + def regulate_forward(self): + loss = _0 + for node in self.nodes: + if node.loss is not None: + loss += node.loss + for k, w in node.weights.items(): + loss += w.forward() + return loss + + def regulate(self): + for node in self.nodes: + for k, w in node.weights.items(): + w.update() + + def load_weights(self, fn): + # seemingly compatible with keras' Dense layers. + import h5py + open(fn) # just ensure the file exists (python's error is better) + f = h5py.File(fn, 'r') + weights = {} + def visitor(name, obj): + if isinstance(obj, h5py.Dataset): + weights[name.split('/')[-1]] = np.array(obj[:], dtype=_f) + f.visititems(visitor) + f.close() + + used = {} + for k in weights.keys(): + used[k] = False + + nodes = [node for node in self.nodes if node.size > 0] + # TODO: support shared weights. + for node in nodes: + full_name = str(node).lower() + for s_name, o_name in node.serialized.items(): + key = full_name + '_' + s_name + data = weights[key] + target = getattr(node, o_name) + target.f[:] = data + used[key] = True + + for k, v in used.items(): + if not v: + lament("WARNING: unused weight", k) + + def save_weights(self, fn, overwrite=False): + import h5py + from collections import defaultdict + + f = h5py.File(fn, 'w') + + counts = defaultdict(lambda: 0) + + nodes = [node for node in self.nodes if node.size > 0] + # TODO: support shared weights. 
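+        # datasets are matched by name: keys look like
+        # '<node name lowercased>_<serialized name>', e.g. 'dense_1_W'
+        # for the coefficients of a layer named Dense_1.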
+ for node in nodes: + full_name = str(node).lower() + grp = f.create_group(full_name) + for s_name, o_name in node.serialized.items(): + key = full_name + '_' + s_name + target = getattr(node, o_name) + data = grp.create_dataset(key, target.shape, dtype=_f) + data[:] = target.f + counts[key] += 1 + if counts[key] > 1: + lament("WARNING: rewrote weight", key) + + f.close() + + def print_graph(self, file=sys.stdout): + print('digraph G {', file=file) + for node in self.nodes: + children = [str(n) for n in node.children] + if children: + sep = '->' + print('\t' + str(node) + sep + (';\n\t' + str(node) + sep).join(children) + ';', file=file) + print('}', file=file) diff --git a/onn/nodes.py b/onn/nodes.py new file mode 100644 index 0000000..081b045 --- /dev/null +++ b/onn/nodes.py @@ -0,0 +1,43 @@ +class DummyNode: + name = "Dummy" + + def __init__(self, children=None, parents=None): + self.children = children if children is not None else [] + self.parents = parents if parents is not None else [] + +def traverse(node_in, node_out, nodes=None, dummy_mode=False): + # i have no idea if this is any algorithm in particular. + nodes = nodes if nodes is not None else [] + + seen_up = {} + q = [node_out] + while len(q) > 0: + node = q.pop(0) + seen_up[node] = True + for parent in node.parents: + q.append(parent) + + if dummy_mode: + seen_up[node_in] = True + + nodes = [] + q = [node_in] + while len(q) > 0: + node = q.pop(0) + if not seen_up[node]: + continue + parents_added = (parent in nodes for parent in node.parents) + if not node in nodes and all(parents_added): + nodes.append(node) + for child in node.children: + q.append(child) + + if dummy_mode: + nodes.remove(node_in) + + return nodes + +def traverse_all(nodes_in, nodes_out, nodes=None): + all_in = DummyNode(children=nodes_in) + all_out = DummyNode(parents=nodes_out) + return traverse(all_in, all_out, nodes, dummy_mode=True) diff --git a/onn/optimizer.py b/onn/optimizer.py new file mode 100644 index 0000000..5a0e91c --- /dev/null +++ b/onn/optimizer.py @@ -0,0 +1,531 @@ +import numpy as np + +from .floats import * +from .optimizer_base import * + +# some of the the following optimizers are blatantly lifted from tiny-dnn: +# https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/optimizers/optimizer.h + +class Momentum(Optimizer): + def __init__(self, lr=0.01, mu=0.9, nesterov=False): + self.mu = _f(mu) # momentum + self.nesterov = bool(nesterov) + + super().__init__(lr) + + def reset(self): + self.Vprev = None + + def compute(self, dW, W): + if self.Vprev is None: + self.Vprev = np.copy(dW) + + V = self.mu * self.Vprev - self.lr * dW + self.Vprev[:] = V + if self.nesterov: + return self.mu * V - self.lr * dW + + return V + +class Adagrad(Optimizer): + def __init__(self, lr=0.01, eps=1e-8): + self.eps = _f(eps) + + super().__init__(lr) + + def reset(self): + self.g = None + + def compute(self, dW, W): + if self.g is None: + self.g = np.zeros_like(dW) + + self.g += np.square(dW) + return -self.lr * dW / (np.sqrt(self.g) + self.eps) + +class RMSprop(Optimizer): + # RMSprop generalizes* Adagrad, etc. + + # * RMSprop == Adagrad when + # RMSprop.mu == 1 + + def __init__(self, lr=1e-4, mu=0.99, eps=1e-8): + self.mu = _f(mu) # decay term + self.eps = _f(eps) + + # one might consider the following equation when specifying mu: + # mu = e**(-1/t) + # default: t = -1/ln(0.99) = ~99.5 + # therefore the default of mu=0.99 means + # an input decays to 1/e its original amplitude over 99.5 batches. 
+ # (this is from DSP, so how relevant it is in SGD is debatable) + + super().__init__(lr) + + def reset(self): + self.g = None + + def compute(self, dW, W): + if self.g is None: + self.g = np.zeros_like(dW) + + # basically apply a first-order low-pass filter to delta squared + self.g += (1 - self.mu) * (np.square(dW) - self.g) + + # finally sqrt it to complete the running root-mean-square approximation + return -self.lr * dW / (np.sqrt(self.g) + self.eps) + +class RMSpropCentered(Optimizer): + # referenced TensorFlow/PyTorch. + # paper: https://arxiv.org/pdf/1308.0850v5.pdf + + def __init__(self, lr=1e-4, aleph=0.95, momentum=0.9, eps=1e-8): + self.aleph = _f(aleph) + self.momentum = _f(momentum) + self.eps = _f(eps) + + super().__init__(lr) + + def reset(self): + self.g = None + self.mt = None + self.vt = None + self.delta = None + + def compute(self, dW, W): + if self.g is None: + self.g = np.zeros_like(dW) + if self.mt is None: + self.mt = np.zeros_like(dW) + if self.vt is None: + self.vt = np.zeros_like(dW) + if self.delta is None: + self.delta = np.zeros_like(dW) + + self.mt += (1 - self.aleph) * (dW - self.mt) + self.vt += (1 - self.aleph) * (np.square(dW) - self.vt) + + # PyTorch has the epsilon outside of the sqrt, + # TensorFlow and the paper have it within. + # in onn, we generally do it outside, as this seems to work better. + temp = dW / (np.sqrt(self.vt - np.square(self.mt)) + self.eps) + + # TensorFlow does it this way. + self.delta[:] = self.momentum * self.delta + self.lr * temp + return -self.delta + # PyTorch does it this way. + #self.delta[:] = self.momentum * self.delta + temp + #return -self.lr * self.delta + # they are equivalent only when LR is constant, which it might not be. + +class Adam(Optimizer): + # paper: https://arxiv.org/abs/1412.6980 + # Adam generalizes* RMSprop, and + # adds a decay term to the regular (non-squared) delta, and performs + # debiasing to compensate for the filtered deltas starting from zero. + + # * Adam == RMSprop when + # Adam.b1 == 0 + # Adam.b2 == RMSprop.mu + + def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8): + self.b1 = _f(b1) # decay term + self.b2 = _f(b2) # decay term + self.b1_t_default = _f(b1) # decay term power t + self.b2_t_default = _f(b2) # decay term power t + self.eps = _f(eps) + + super().__init__(lr) + + def reset(self): + self.mt = None + self.vt = None + self.b1_t = self.b1_t_default + self.b2_t = self.b2_t_default + + def compute(self, dW, W): + if self.mt is None: + self.mt = np.zeros_like(dW) + if self.vt is None: + self.vt = np.zeros_like(dW) + + # decay gain + self.b1_t *= self.b1 + self.b2_t *= self.b2 + + # filter + self.mt += (1 - self.b1) * (dW - self.mt) + self.vt += (1 - self.b2) * (np.square(dW) - self.vt) + + return -self.lr * (self.mt / (1 - self.b1_t)) \ + / (np.sqrt(self.vt / (1 - self.b2_t)) + self.eps) + +class Nadam(Optimizer): + # paper: https://arxiv.org/abs/1412.6980 + # paper: http://cs229.stanford.edu/proj2015/054_report.pdf + # TODO: double-check this implementation. also read the damn paper. 
+ # lifted from https://github.com/fchollet/keras/blob/5d38b04/keras/optimizers.py#L530 + # lifted from https://github.com/jpilaul/IFT6266_project/blob/master/Models/Algo_Momentum.py + + def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8): + self.b1 = _f(b1) # decay term + self.b2 = _f(b2) # decay term + self.eps = _f(eps) + + super().__init__(lr) + + def reset(self): + self.mt = None + self.vt = None + self.t = 0 + self.sched = 1 + + def compute(self, dW, W): + self.t += 1 + + if self.mt is None: + self.mt = np.zeros_like(dW) + if self.vt is None: + self.vt = np.zeros_like(dW) + + ut0 = self.b1 * (1 - 0.5 * 0.96**(self.t + 0)) + ut1 = self.b1 * (1 - 0.5 * 0.96**(self.t + 1)) + + sched0 = self.sched * ut0 + sched1 = self.sched * ut0 * ut1 + self.sched = sched0 + + gp = dW / (1 - sched0) + + self.mt += (1 - self.b1) * (dW - self.mt) + self.vt += (1 - self.b2) * (np.square(dW) - self.vt) + + mtp = self.mt / (1 - sched1) + vtp = self.vt / (1 - self.b2**self.t) + + mt_bar = (1 - ut0) * gp + ut1 * mtp + + return -self.lr * mt_bar / (np.sqrt(vtp) + self.eps) + +# more + +class FTML(Optimizer): + # paper: http://www.cse.ust.hk/~szhengac/papers/icml17.pdf + # author's implementation: https://github.com/szhengac/optim/commit/923555e + + def __init__(self, lr=0.0025, b1=0.6, b2=0.999, eps=1e-8): + self.iterations = _0 + self.b1 = _f(b1) # decay term + self.b2 = _f(b2) # decay term + self.eps = _f(eps) + + super().__init__(lr) + + def reset(self): + self.dt1 = None + self.dt = None + self.vt = None + self.zt = None + self.b1_t = _1 + self.b2_t = _1 + + def compute(self, dW, W): + if self.dt1 is None: self.dt1 = np.zeros_like(dW) + if self.dt is None: self.dt = np.zeros_like(dW) + if self.vt is None: self.vt = np.zeros_like(dW) + if self.zt is None: self.zt = np.zeros_like(dW) + + # NOTE: we could probably rewrite these equations to avoid this copy. + self.dt1[:] = self.dt[:] + + self.b1_t *= self.b1 + self.b2_t *= self.b2 + + # hardly an elegant solution. + lr = max(self.lr, self.eps) + + # same as Adam's vt. + self.vt[:] = self.b2 * self.vt + (1 - self.b2) * dW * dW + + # you can factor "inner" out of Adam as well. + inner = np.sqrt(self.vt / (1 - self.b2_t)) + self.eps + self.dt[:] = (1 - self.b1_t) / lr * inner + + sigma_t = self.dt - self.b1 * self.dt1 + + # Adam's mt minus the sigma term. + self.zt[:] = self.b1 * self.zt + (1 - self.b1) * dW - sigma_t * W + + # subtract by weights to avoid having to override self.update. 
+ return -self.zt / self.dt - W + +class MomentumClip(Optimizer): + def __init__(self, lr=0.01, mu=0.9, nesterov=False, clip=1.0, debug=False): + self.mu = _f(mu) + self.clip = _f(clip) + self.nesterov = bool(nesterov) + self.debug = bool(debug) + + super().__init__(lr) + + def reset(self): + self.accum = None + + def compute(self, dW, W): + if self.accum is None: + self.accum = np.zeros_like(dW) + + total_norm = np.linalg.norm(dW) + clip_scale = self.clip / (total_norm + 1e-6) + if clip_scale < 1: + if self.debug: + lament("clipping gradients; norm: {:10.5f}".format(total_norm)) + dW *= clip_scale + + self.accum[:] = self.accum * self.mu + dW + if self.nesterov: + return -self.lr * (self.accum * self.mu + dW) + else: + return -self.lr * self.accum + +class YellowFin(Optimizer): + # paper: https://arxiv.org/abs/1706.03471 + # knowyourmeme: http://cs.stanford.edu/~zjian/project/YellowFin/ + # author's implementation: https://github.com/JianGoForIt/YellowFin/blob/master/tuner_utils/yellowfin.py + # code lifted: https://gist.github.com/botev/f8b32c00eafee222e47393f7f0747666 + + def __init__(self, lr=0.1, mu=0.0, beta=0.999, window_size=20, + debias=True, clip=1.0): + self.lr_default = _f(lr) + self.mu_default = _f(mu) + self.beta = _f(beta) + self.window_size = int(window_size) # curv_win_width + self.debias_enabled = bool(debias) + self.clip = _f(clip) + + self.mu = _f(mu) # momentum + super().__init__(lr) + + def reset(self): + self.accum = None + + self.lr = self.lr_default + self.mu = self.mu_default + + self.step = 0 + self.beta_t = self.beta + + self.curv_win = np.zeros([self.window_size,], dtype=np.float32) + + self.h_min = None + self.h_max = None + + self.g_lpf = 0 + #self.g_squared_lpf = 0 + self.g_norm_squared_lpf = 0 + self.g_norm_lpf = 0 + self.h_min_lpf = 0 + self.h_max_lpf = 0 + self.dist_lpf = 0 + self.lr_lpf = 0 + self.mu_lpf = 0 + + def get_lr_mu(self): + p = (np.square(self.dist_avg) * np.square(self.h_min)) / (2 * self.g_var) + w3 = p * (np.sqrt(0.25 + p / 27.0) - 0.5) + w = np.power(w3, 1/3) + y = w - p / (3 * w) + sqrt_mu1 = y + 1 + + sqrt_h_min = np.sqrt(self.h_min) + sqrt_h_max = np.sqrt(self.h_max) + sqrt_mu2 = (sqrt_h_max - sqrt_h_min) / (sqrt_h_max + sqrt_h_min) + + sqrt_mu = max(sqrt_mu1, sqrt_mu2) + if sqrt_mu2 > sqrt_mu1: + print('note: taking dr calculation. something may have exploded.') + + lr = np.square(1 - sqrt_mu) / self.h_min + mu = np.square(sqrt_mu) + return lr, mu + + def compute(self, dW, W): + if self.accum is None: + self.accum = np.zeros_like(dW) + + # TODO: prevent allocations everywhere by using [:]. + # assuming that really works. i haven't actually checked. 
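+        # the overall flow per step: clip the gradient, update running
+        # (debiased) averages of its norm and curvature, derive lr and mu
+        # from those averages, then take a classical momentum step.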
+ + total_norm = np.linalg.norm(dW) + clip_scale = self.clip / (total_norm + 1e-6) + if clip_scale < 1: + #print("clipping gradients; norm: {:10.5f}".format(total_norm)) + dW *= clip_scale + + #fmt = 'W std: {:10.7f}e-3, dWstd: {:10.7f}e-3, V std: {:10.7f}e-3' + #print(fmt.format(np.std(W), np.std(dW) * 100, np.std(V) * 100)) + + b = self.beta + m1b = 1 - self.beta + debias = 1 / (1 - self.beta_t) if self.debias_enabled else 1 + + g = dW + g_squared = np.square(g) + g_norm_squared = np.sum(g_squared) + g_norm = np.sqrt(g_norm_squared) + + self.curv_win[self.step % self.window_size] = g_norm_squared + valid_window = self.curv_win[:min(self.window_size, self.step + 1)] + h_min_t = np.min(valid_window) + h_max_t = np.max(valid_window) + + self.g_lpf = b * self.g_lpf + m1b * g + #self.g_squared_lpf = b * self.g_squared_lpf + m1b * g_squared + self.g_norm_squared_lpf = b * self.g_norm_squared_lpf + m1b * g_norm_squared + self.g_norm_lpf = b * self.g_norm_lpf + m1b * g_norm + self.h_min_lpf = b * self.h_min_lpf + m1b * h_min_t + self.h_max_lpf = b * self.h_max_lpf + m1b * h_max_t + + g_avg = debias * self.g_lpf + #g_squared_avg = debias * self.g_squared_lpf + g_norm_squared_avg = debias * self.g_norm_squared_lpf + g_norm_avg = debias * self.g_norm_lpf + self.h_min = debias * self.h_min_lpf + self.h_max = debias * self.h_max_lpf + assert self.h_max >= self.h_min + + dist = g_norm_avg / g_norm_squared_avg + + self.dist_lpf = b * self.dist_lpf + m1b * dist + + self.dist_avg = debias * self.dist_lpf + + self.g_var = g_norm_squared_avg - np.sum(np.square(g_avg)) + # equivalently: + #self.g_var = np.sum(np.abs(g_squared_avg - np.square(g_avg))) + + if self.step > 0: + lr_for_real, mu_for_real = self.get_lr_mu() + self.mu_lpf = b * self.mu_lpf + m1b * mu_for_real + self.lr_lpf = b * self.lr_lpf + m1b * lr_for_real + self.mu = debias * self.mu_lpf + self.lr = debias * self.lr_lpf + + self.accum[:] = self.accum * self.mu - self.lr * dW + V = self.accum + + self.step += 1 + self.beta_t *= self.beta + return V + +class AddSign(Optimizer): + # paper: https://arxiv.org/abs/1709.07417 + + def __init__(self, lr=0.01, mu=0.9, alpha=1): + self.mu = _f(mu) + self.alpha = _f(alpha) + + super().__init__(lr) + + def reset(self): + self.accum = None + + def compute(self, dW, W): + if self.accum is None: + self.accum = np.zeros_like(dW) + + self.accum[:] = self.accum * self.mu + dW + + signed = np.sign(dW) * np.sign(self.accum) + #signed *= decay + + return -self.lr * dW * (self.alpha + signed) + +class PowerSign(Optimizer): + # paper: https://arxiv.org/abs/1709.07417 + + def __init__(self, lr=0.01, mu=0.9, alpha=np.e): + self.mu = _f(mu) + self.alpha = _f(alpha) + self.use_exp = np.isclose(self.alpha, _f(np.e)) + + super().__init__(lr) + + def reset(self): + self.accum = None + + def compute(self, dW, W): + if self.accum is None: + self.accum = np.zeros_like(dW) + + self.accum[:] = self.accum * self.mu + dW + + signed = np.sign(dW) * np.sign(self.accum) + #signed *= decay + + if self.use_exp: + return -self.lr * dW * np.exp(signed) + else: + return -self.lr * dW * np.power(self.alpha, signed) + +class Neumann(Optimizer): + # paper: https://arxiv.org/abs/1712.03298 + # NOTE: this implementation is missing resetting as described in the paper. + # resetting is totally disabled for now. + # NOTE: this implementation does not use vanilla SGD for its first epochs. + # you should do this yourself if you need it. + # it seems like using a Learner like SineCLR makes this unnecessary. 
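+    # NOTE: unlike the other optimizers here, this one applies its update
+    # to W in-place via update() and does not implement compute(), since
+    # it also tracks a smoothed copy of the weights between steps.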
+ + def __init__(self, lr=0.01): + self.alpha = _f(1e-7) # cubic. + self.beta = _f(1e-5) # repulsive. NOTE: multiplied by len(dW) later. + self.gamma = _f(0.99) # EMA, or 1-pole low-pass parameter. same thing. + # momentum is ∝ (in the shape of) 1 - 1/(1 + t) + self.mu_min = _f(0.5) + self.mu_max = _f(0.9) + self.reset_period = 0 # TODO + + super().__init__(lr) + + def reset(self): + # NOTE: mt and vt are different than the pair in Adam-like optimizers. + self.mt = None # momentum accumulator. + self.vt = None # weight accumulator. + self.t = 0 + + def compute(self, dW, W): + raise Exception("compute() is not available for this Optimizer.") + + def update(self, dW, W): + self.t += 1 + + if self.mt is None: + self.mt = np.zeros_like(dW) + if self.vt is None: + self.vt = np.zeros_like(dW) + + if self.reset_period > 0 and (self.t - 1) % self.reset_period == 0: + self.mt = -self.lr * dW + return + + # momentum quantity: + mu = _1 - _1/_f(self.t) # the + 1 is implicit. + mu = (mu + self.mu_min) * (self.mu_max - self.mu_min) + + # smoothed change in weights: + delta = W - self.vt + delta_norm_squared = np.square(delta).sum() + delta_norm = np.sqrt(delta_norm_squared) + + # regularization terms: (push and pull) + cubic_reg = self.alpha * delta_norm_squared + repulsive_reg = self.beta * dW.size / delta_norm_squared + dt = dW + (cubic_reg - repulsive_reg) * (delta / delta_norm) + + # plain momentum: + self.mt = mu * self.mt - self.lr * dt + + # weights and accumulator: + W += mu * self.mt - self.lr * dt + self.vt = W + self.gamma * (self.vt - W) + diff --git a/onn/optimizer_base.py b/onn/optimizer_base.py new file mode 100644 index 0000000..d7251c0 --- /dev/null +++ b/onn/optimizer_base.py @@ -0,0 +1,19 @@ +import numpy as np + +from .floats import * + +class Optimizer: + def __init__(self, lr=0.1): + self.lr = _f(lr) # learning rate + self.reset() + + def reset(self): + pass + + def compute(self, dW, W): + return -self.lr * dW + + def update(self, dW, W): + W += self.compute(dW, W) + + diff --git a/onn/parametric.py b/onn/parametric.py new file mode 100644 index 0000000..b1e6560 --- /dev/null +++ b/onn/parametric.py @@ -0,0 +1,254 @@ +import numpy as np + +from .floats import * +from .layer_base import * +from .initialization import * + +class Bias(Layer): + # TODO: support axes other than -1 and shapes other than 1D. 
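+    # adds a learned offset along the last axis; the gradient is just
+    # the incoming delta summed over the batch.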
+ + serialized = { + 'b': 'biases', + } + + def __init__(self, init=init_zeros, reg_b=None): + super().__init__() + self.biases = self._new_weights('biases', init=init, regularizer=reg_b) + + def make_shape(self, parent): + shape = parent.output_shape + self.input_shape = shape + self.output_shape = shape + self.biases.shape = (shape[-1],) + + def forward(self, X): + return X + self.biases.f + + def backward(self, dY): + self.biases.g += dY.sum(0) + return dY + +class Dense(Layer): + serialized = { + 'W': 'coeffs', + 'b': 'biases', + } + + def __init__(self, dim, init=init_he_uniform, reg_w=None, reg_b=None): + super().__init__() + self.dim = int(dim) + self.output_shape = (dim,) + self.coeffs = self._new_weights('coeffs', init=init, regularizer=reg_w) + self.biases = self._new_weights('biases', init=init_zeros, regularizer=reg_b) + + def make_shape(self, parent): + shape = parent.output_shape + self.input_shape = shape + assert len(shape) == 1, shape + self.coeffs.shape = (shape[0], self.dim) + self.biases.shape = (1, self.dim) + + def forward(self, X): + self.X = X + return X @ self.coeffs.f + self.biases.f + + def backward(self, dY): + self.coeffs.g += self.X.T @ dY + self.biases.g += dY.sum(0, keepdims=True) + return dY @ self.coeffs.f.T + + +# more + +class Conv1Dper(Layer): + # periodic (circular) convolution. + # currently only supports one channel I/O. + # some notes: + # we could use FFTs for larger convolutions. + # i think storing the coefficients backwards would + # eliminate reversal in the critical code. + + serialize = { + 'W': 'coeffs', + } + + def __init__(self, kernel_size, pos=None, + init=init_glorot_uniform, reg_w=None): + super().__init__() + self.kernel_size = int(kernel_size) + self.coeffs = self._new_weights('coeffs', init=init, regularizer=reg_w) + if pos is None: + self.wrap0 = (self.kernel_size - 0) // 2 + self.wrap1 = (self.kernel_size - 1) // 2 + elif pos == 'alt': + self.wrap0 = (self.kernel_size - 1) // 2 + self.wrap1 = (self.kernel_size - 0) // 2 + elif pos == 'left': + self.wrap0 = 0 + self.wrap1 = self.kernel_size - 1 + elif pos == 'right': + self.wrap0 = self.kernel_size - 1 + self.wrap1 = 0 + else: + raise Exception("pos parameter not understood: {}".format(pos)) + + def make_shape(self, parent): + shape = parent.output_shape + self.input_shape = shape + assert len(shape) == 1, shape + self.output_shape = shape + self.coeffs.shape = (1, self.kernel_size) + + def forward(self, X): + if self.wrap0 == 0: + Xper = np.hstack((X,X[:,:self.wrap1])) + elif self.wrap1 == 0: + Xper = np.hstack((X[:,-self.wrap0:],X)) + else: + Xper = np.hstack((X[:,-self.wrap0:],X,X[:,:self.wrap1])) + self.cols = rolling_batch(Xper, self.kernel_size) + convolved = (self.cols * self.coeffs.f[:,::-1]).sum(2) + return convolved + + def backward(self, dY): + self.coeffs.g += (dY[:,:,None] * self.cols).sum(0)[:,::-1].sum(0, keepdims=True) + return (dY[:,:,None] * self.coeffs.f[:,::-1]).sum(2) + +class LayerNorm(Layer): + # paper: https://arxiv.org/abs/1607.06450 + # note: nonparametric when affine == False + + def __init__(self, eps=1e-5, affine=True): + super().__init__() + self.eps = _f(eps) + self.affine = bool(affine) + + if self.affine: + self.gamma = self._new_weights('gamma', init=init_ones) + self.beta = self._new_weights('beta', init=init_zeros) + self.serialized = { + 'gamma': 'gamma', + 'beta': 'beta', + } + + def make_shape(self, parent): + shape = parent.output_shape + self.input_shape = shape + self.output_shape = shape + assert len(shape) == 1, shape + if self.affine: + 
self.gamma.shape = (shape[0],) + self.beta.shape = (shape[0],) + + def forward(self, X): + self.mean = X.mean(0) + self.center = X - self.mean + self.var = self.center.var(0) + self.eps + self.std = np.sqrt(self.var) + + self.Xnorm = self.center / self.std + if self.affine: + return self.gamma.f * self.Xnorm + self.beta.f + return self.Xnorm + + def backward(self, dY): + length = dY.shape[0] + + if self.affine: + dXnorm = dY * self.gamma.f + self.gamma.g += (dY * self.Xnorm).sum(0) + self.beta.g += dY.sum(0) + else: + dXnorm = dY + + dstd = (dXnorm * self.center).sum(0) / -self.var + dcenter = dXnorm / self.std + dstd / self.std * self.center / length + dmean = -dcenter.sum(0) + dX = dcenter + dmean / length + + return dX + +class Denses(Layer): # TODO: rename? + # acts as a separate Dense for each row or column. only for 2D arrays. + + serialized = { + 'W': 'coeffs', + 'b': 'biases', + } + + def __init__(self, dim, init=init_he_uniform, reg_w=None, reg_b=None, axis=-1): + super().__init__() + self.dim = int(dim) + self.weight_init = init + self.axis = int(axis) + self.coeffs = self._new_weights('coeffs', init=init, regularizer=reg_w) + self.biases = self._new_weights('biases', init=init_zeros, regularizer=reg_b) + + def make_shape(self, parent): + shape = parent.output_shape + self.input_shape = shape + assert len(shape) == 2, shape + + assert -len(shape) <= self.axis < len(shape) + self.axis = self.axis % len(shape) + + self.output_shape = list(shape) + self.output_shape[self.axis] = self.dim + self.output_shape = tuple(self.output_shape) + + in_rows = self.input_shape[0] + in_cols = self.input_shape[1] + out_rows = self.output_shape[0] + out_cols = self.output_shape[1] + + self.coeffs.shape = (in_rows, in_cols, self.dim) + self.biases.shape = (1, out_rows, out_cols) + + def forward(self, X): + self.X = X + if self.axis == 0: + return np.einsum('ixj,xjk->ikj', X, self.coeffs.f) + self.biases.f + elif self.axis == 1: + return np.einsum('ijx,jxk->ijk', X, self.coeffs.f) + self.biases.f + + def backward(self, dY): + self.biases.g += dY.sum(0, keepdims=True) + if self.axis == 0: + self.coeffs.g += np.einsum('ixj,ikj->xjk', self.X, dY) + return np.einsum('ikj,xjk->ixj', dY, self.coeffs.f) + elif self.axis == 1: + self.coeffs.g += np.einsum('ijx,ijk->jxk', self.X, dY) + return np.einsum('ijk,jxk->ijx', dY, self.coeffs.f) + +class CosineDense(Dense): + # paper: https://arxiv.org/abs/1702.05870 + # another implementation: https://github.com/farizrahman4u/keras-contrib/pull/36 + # the paper doesn't mention bias, + # so we treat bias as an additional weight with a constant input of 1. + # this is correct in Dense layers, so i hope it's correct here too. 
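+    # in effect, each output is the cosine similarity between the
+    # augmented input [x, 1] and the corresponding augmented weight
+    # column [w, b], with a small epsilon inside the norms for stability.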
+ + eps = 1e-4 + + def forward(self, X): + self.X = X + self.X_norm = np.sqrt(np.square(X).sum(-1, keepdims=True) \ + + 1 + self.eps) + self.W_norm = np.sqrt(np.square(self.coeffs.f).sum(0, keepdims=True) \ + + np.square(self.biases.f) + self.eps) + self.dot = X @ self.coeffs.f + self.biases.f + Y = self.dot / (self.X_norm * self.W_norm) + return Y + + def backward(self, dY): + ddot = dY / self.X_norm / self.W_norm + dX_norm = -(dY * self.dot / self.W_norm).sum(-1, keepdims=True) / self.X_norm**2 + dW_norm = -(dY * self.dot / self.X_norm).sum( 0, keepdims=True) / self.W_norm**2 + + self.coeffs.g += self.X.T @ ddot \ + + dW_norm / self.W_norm * self.coeffs.f + self.biases.g += ddot.sum(0, keepdims=True) \ + + dW_norm / self.W_norm * self.biases.f + dX = ddot @ self.coeffs.f.T + dX_norm / self.X_norm * self.X + + return dX + diff --git a/onn/regularizer.py b/onn/regularizer.py new file mode 100644 index 0000000..b8b1ac7 --- /dev/null +++ b/onn/regularizer.py @@ -0,0 +1,45 @@ +import numpy as np + +from .floats import * + +class Regularizer: + pass + +class L1L2(Regularizer): + def __init__(self, l1=0.0, l2=0.0): + self.l1 = _f(l1) + self.l2 = _f(l2) + + def forward(self, X): + f = _0 + if self.l1: + f += np.sum(self.l1 * np.abs(X)) + if self.l2: + f += np.sum(self.l2 * np.square(X)) + return f + + def backward(self, X): + df = np.zeros_like(X) + if self.l1: + df += self.l1 * np.sign(X) + if self.l2: + df += self.l2 * 2 * X + return df + +# more + +class SaturateRelu(Regularizer): + # paper: https://arxiv.org/abs/1703.09202 + # TODO: test this (and ActivityRegularizer) more thoroughly. + # i've looked at the histogram of the resulting weights. + # it seems like only the layers after this are affected + # the way they should be. + + def __init__(self, lamb=0.0): + self.lamb = _f(lamb) + + def forward(self, X): + return self.lamb * np.where(X >= 0, X, 0) + + def backward(self, X): + return self.lamb * np.where(X >= 0, 1, 0) diff --git a/onn/ritual.py b/onn/ritual.py new file mode 100644 index 0000000..7499e6b --- /dev/null +++ b/onn/ritual.py @@ -0,0 +1,90 @@ +import numpy as np + +from .floats import * +from .initialization import * +from .ritual_base import * + +def stochastic_multiply(W, gamma=0.5, allow_negation=False): + # paper: https://arxiv.org/abs/1606.01981 + + assert W.ndim == 1, W.ndim + assert 0 < gamma < 1, gamma + size = len(W) + alpha = np.max(np.abs(W)) + # NOTE: numpy gives [low, high) but the paper advocates [low, high] + mult = np.random.uniform(gamma, 1/gamma, size=size) + if allow_negation: + # NOTE: i have yet to see this do anything but cause divergence. + # i've referenced the paper several times yet still don't understand + # what i'm doing wrong, so i'm disabling it by default in my code. + # maybe i just need *a lot* more weights to compensate. + prob = (W / alpha + 1) / 2 + samples = np.random.random_sample(size=size) + mult *= np.where(samples < prob, 1, -1) + np.multiply(W, mult, out=W) + +class StochMRitual(Ritual): + # paper: https://arxiv.org/abs/1606.01981 + # this probably doesn't make sense for regression problems, + # let alone small models, but here it is anyway! 
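+    # the idea: temporarily multiply each Dense layer's weights by random
+    # factors drawn from [gamma, 1/gamma] before computing gradients,
+    # then restore the unperturbed weights before the update is applied.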
+ + def __init__(self, learner=None, gamma=0.5): + super().__init__(learner) + self.gamma = _f(gamma) + + def prepare(self, model): + self.W = np.copy(model.W) + super().prepare(model) + + def learn(self, inputs, outputs): + # an experiment: + #assert self.learner.rate < 10, self.learner.rate + #self.gamma = 1 - 1/2**(1 - np.log10(self.learner.rate)) + + self.W[:] = self.model.W + for layer in self.model.ordered_nodes: + if isinstance(layer, Dense): + stochastic_multiply(layer.coeffs.ravel(), gamma=self.gamma) + residual = super().learn(inputs, outputs) + self.model.W[:] = self.W + return residual + + def update(self): + super().update() + f = 0.5 + for layer in self.model.ordered_nodes: + if isinstance(layer, Dense): + np.clip(layer.W, -layer.std * f, layer.std * f, out=layer.W) + # np.clip(layer.W, -1, 1, out=layer.W) + +class NoisyRitual(Ritual): + def __init__(self, learner=None, + input_noise=0, output_noise=0, gradient_noise=0): + self.input_noise = _f(input_noise) + self.output_noise = _f(output_noise) + self.gradient_noise = _f(gradient_noise) + super().__init__(learner) + + def learn(self, inputs, outputs): + # this is pretty crude + if self.input_noise > 0: + s = self.input_noise + inputs = inputs + np.random.normal(0, s, size=inputs.shape) + if self.output_noise > 0: + s = self.output_noise + outputs = outputs + np.random.normal(0, s, size=outputs.shape) + return super().learn(inputs, outputs) + + def update(self): + # gradient noise paper: https://arxiv.org/abs/1511.06807 + if self.gradient_noise > 0: + size = len(self.model.dW) + gamma = 0.55 + #s = self.gradient_noise / (1 + self.bn) ** gamma + # experiments: + s = self.gradient_noise * np.sqrt(self.learner.rate) + #s = np.square(self.learner.rate) + #s = self.learner.rate / self.en + self.model.dW += np.random.normal(0, max(s, 1e-8), size=size) + super().update() + diff --git a/onn/ritual_base.py b/onn/ritual_base.py new file mode 100644 index 0000000..5362a3a --- /dev/null +++ b/onn/ritual_base.py @@ -0,0 +1,132 @@ +import types +import numpy as np + +from .floats import * + +class Ritual: # i'm just making up names at this point. + def __init__(self, learner=None): + self.learner = learner if learner is not None else Learner(Optimizer()) + self.model = None + + def reset(self): + self.learner.reset(optim=True) + self.en = 0 + self.bn = 0 + + def learn(self, inputs, outputs): + error, predicted = self.model.forward(inputs, outputs) + self.model.backward(predicted, outputs) + self.model.regulate() + return error, predicted + + def update(self): + optim = self.learner.optim + optim.model = self.model + optim.update(self.model.dW, self.model.W) + + def prepare(self, model): + self.en = 0 + self.bn = 0 + self.model = model + + def _train_batch(self, batch_inputs, batch_outputs, b, batch_count, + test_only=False, loss_logging=False, mloss_logging=True): + if not test_only and self.learner.per_batch: + self.learner.batch(b / batch_count) + + if test_only: + predicted = self.model.evaluate(batch_inputs, deterministic=True) + else: + error, predicted = self.learn(batch_inputs, batch_outputs) + self.model.regulate_forward() + self.update() + + if loss_logging: + batch_loss = self.model.loss.forward(predicted, batch_outputs) + if np.isnan(batch_loss): + raise Exception("nan") + self.losses.append(batch_loss) + self.cumsum_loss += batch_loss + + if mloss_logging: + # NOTE: this can use the non-deterministic predictions. fixme? 
+ batch_mloss = self.model.mloss.forward(predicted, batch_outputs) + if np.isnan(batch_mloss): + raise Exception("nan") + self.mlosses.append(batch_mloss) + self.cumsum_mloss += batch_mloss + + def train_batched(self, inputs_or_generator, outputs_or_batch_count, + batch_size=None, + return_losses=False, test_only=False, shuffle=True, + clear_grad=True): + assert isinstance(return_losses, bool) or return_losses == 'both' + assert self.model is not None + + gen = isinstance(inputs_or_generator, types.GeneratorType) + if gen: + generator = inputs_or_generator + batch_count = outputs_or_batch_count + assert isinstance(batch_count, int), type(batch_count) + else: + inputs = inputs_or_generator + outputs = outputs_or_batch_count + + if not test_only: + self.en += 1 + + if shuffle: + if gen: + raise Exception("shuffling is incompatibile with using a generator.") + indices = np.arange(inputs.shape[0]) + np.random.shuffle(indices) + inputs = inputs[indices] + outputs = outputs[indices] + + self.cumsum_loss, self.cumsum_mloss = _0, _0 + self.losses, self.mlosses = [], [] + + if not gen: + batch_count = inputs.shape[0] // batch_size + # TODO: lift this restriction + assert inputs.shape[0] % batch_size == 0, \ + "inputs is not evenly divisible by batch_size" + + prev_batch_size = None + for b in range(batch_count): + if not test_only: + self.bn += 1 + + if gen: + batch_inputs, batch_outputs = next(generator) + batch_size = batch_inputs.shape[0] + # TODO: lift this restriction + assert batch_size == prev_batch_size or prev_batch_size is None, \ + "non-constant batch size (got {}, expected {})".format(batch_size, prev_batch_size) + else: + bi = b * batch_size + batch_inputs = inputs[ bi:bi+batch_size] + batch_outputs = outputs[bi:bi+batch_size] + + if clear_grad: + self.model.clear_grad() + self._train_batch(batch_inputs, batch_outputs, b, batch_count, + test_only, return_losses=='both', return_losses) + + prev_batch_size = batch_size + + avg_mloss = self.cumsum_mloss / _f(batch_count) + if return_losses == 'both': + avg_loss = self.cumsum_loss / _f(batch_count) + return avg_loss, avg_mloss, self.losses, self.mlosses + elif return_losses: + return avg_mloss, self.mlosses + return avg_mloss + + def test_batched(self, inputs, outputs, *args, **kwargs): + return self.train_batched(inputs, outputs, *args, + test_only=True, **kwargs) + + def train_batched_gen(self, generator, batch_count, *args, **kwargs): + return self.train_batched(generator, batch_count, *args, + shuffle=False, **kwargs) diff --git a/onn/util.py b/onn/util.py new file mode 100644 index 0000000..edd895c --- /dev/null +++ b/onn/util.py @@ -0,0 +1,37 @@ +import sys + +def lament(*args, **kwargs): + print(*args, file=sys.stderr, **kwargs) + +def lower_priority(): + """Set the priority of the process to below-normal.""" + # via https://stackoverflow.com/a/1023269 + if sys.platform == 'win32': + try: + import win32api, win32process, win32con + pid = win32api.GetCurrentProcessId() + handle = win32api.OpenProcess(win32con.PROCESS_ALL_ACCESS, True, pid) + win32process.SetPriorityClass(handle, win32process.BELOW_NORMAL_PRIORITY_CLASS) + except ImportError: + lament("you do not have pywin32 installed.") + lament("the process priority could not be lowered.") + lament("consider: python -m pip install pypiwin32") + lament("consider: conda install pywin32") + else: + import os + os.nice(1) + +# more + +_log_was_update = False +def log(left, right, update=False): + s = "\x1B[1m {:>20}:\x1B[0m {}".format(left, right) + global _log_was_update + if update 
and _log_was_update: + lament('\x1B[F' + s) + else: + lament(s) + _log_was_update = update + +class Dummy: + pass diff --git a/onn/weights.py b/onn/weights.py new file mode 100644 index 0000000..a531d64 --- /dev/null +++ b/onn/weights.py @@ -0,0 +1,58 @@ +import numpy as np + +class Weights: + # we may or may not contain weights -- or any information, for that matter. + + def __init__(self, **kwargs): + self.f = None # forward weights + self.g = None # backward weights (gradients) + self.shape = None + self.init = None + self.allocator = None + self.regularizer = None + self._allocated = False + + self.configure(**kwargs) + + def configure(self, **kwargs): + for k, v in kwargs.items(): + getattr(self, k) # ensures the key already exists + setattr(self, k, v) + + @property + def size(self): + assert self.shape is not None + return np.prod(self.shape) + + def allocate(self, *args, **kwargs): + if self._allocated: + raise Exception("attempted to allocate existing weights") + self.configure(**kwargs) + + # intentionally not using isinstance + assert type(self.shape) == tuple, self.shape + + f, g = self.allocator(self.size) + assert len(f) == self.size, "{} != {}".format(f.shape, self.size) + assert len(g) == self.size, "{} != {}".format(g.shape, self.size) + f[:] = self.init(self.size, *args) + g[:] = self.init(self.size, *args) + self.f = f.reshape(self.shape) + self.g = g.reshape(self.shape) + + self._allocated = True + + def forward(self): + if self.regularizer is None: + return 0.0 + return self.regularizer.forward(self.f) + + def backward(self): + if self.regularizer is None: + return 0.0 + return self.regularizer.backward(self.f) + + def update(self): + if self.regularizer is None: + return + self.g += self.regularizer.backward(self.f) diff --git a/onn_core.py b/onn_core.py deleted file mode 100644 index 62be569..0000000 --- a/onn_core.py +++ /dev/null @@ -1,1398 +0,0 @@ -import sys -import types - -def lament(*args, **kwargs): - print(*args, file=sys.stderr, **kwargs) - -def lower_priority(): - """Set the priority of the process to below-normal.""" - # via https://stackoverflow.com/a/1023269 - if sys.platform == 'win32': - try: - import win32api, win32process, win32con - pid = win32api.GetCurrentProcessId() - handle = win32api.OpenProcess(win32con.PROCESS_ALL_ACCESS, True, pid) - win32process.SetPriorityClass(handle, win32process.BELOW_NORMAL_PRIORITY_CLASS) - except ImportError: - lament("you do not have pywin32 installed.") - lament("the process priority could not be lowered.") - lament("consider: python -m pip install pypiwin32") - lament("consider: conda install pywin32") - else: - import os - os.nice(1) - -import numpy as np -_f = np.float32 - -# just for speed, not strictly essential: -from scipy.special import expit as sigmoid - -# used for numbering layers like Keras, and keeping initialization consistent: -from collections import defaultdict, OrderedDict -_layer_counters = defaultdict(lambda: 0) - -def _check(a): - assert isinstance(a, np.ndarray) or type(a) == _f, type(a) - assert a.dtype == _f, a.dtype - return a - -_0 = _f(0) -_1 = _f(1) -_2 = _f(2) -_inv2 = _f(1/2) -_sqrt2 = _f(np.sqrt(2)) -_invsqrt2 = _f(1/np.sqrt(2)) -_pi = _f(np.pi) - -class LayerIncompatibility(Exception): - pass - -# Node Traversal {{{1 - -class DummyNode: - name = "Dummy" - - def __init__(self, children=None, parents=None): - self.children = children if children is not None else [] - self.parents = parents if parents is not None else [] - -def traverse(node_in, node_out, nodes=None, dummy_mode=False): 
- # i have no idea if this is any algorithm in particular. - nodes = nodes if nodes is not None else [] - - seen_up = {} - q = [node_out] - while len(q) > 0: - node = q.pop(0) - seen_up[node] = True - for parent in node.parents: - q.append(parent) - - if dummy_mode: - seen_up[node_in] = True - - nodes = [] - q = [node_in] - while len(q) > 0: - node = q.pop(0) - if not seen_up[node]: - continue - parents_added = (parent in nodes for parent in node.parents) - if not node in nodes and all(parents_added): - nodes.append(node) - for child in node.children: - q.append(child) - - if dummy_mode: - nodes.remove(node_in) - - return nodes - -def traverse_all(nodes_in, nodes_out, nodes=None): - all_in = DummyNode(children=nodes_in) - all_out = DummyNode(parents=nodes_out) - return traverse(all_in, all_out, nodes, dummy_mode=True) - -# Initializations {{{1 - -# note: these are currently only implemented for 2D shapes. - -def init_zeros(size, ins=None, outs=None): - return np.zeros(size) - -def init_ones(size, ins=None, outs=None): - return np.ones(size) - -def init_he_normal(size, ins, outs): - s = np.sqrt(2 / ins) - return np.random.normal(0, s, size=size) - -def init_he_uniform(size, ins, outs): - s = np.sqrt(6 / ins) - return np.random.uniform(-s, s, size=size) - -def init_glorot_normal(size, ins, outs): - s = np.sqrt(2 / (ins + outs)) - return np.random.normal(0, s, size=size) - -def init_glorot_uniform(size, ins, outs): - s = np.sqrt(6 / (ins + outs)) - return np.random.uniform(-s, s, size=size) - -# Weight container {{{1 - -class Weights: - # we may or may not contain weights -- or any information, for that matter. - - def __init__(self, **kwargs): - self.f = None # forward weights - self.g = None # backward weights (gradients) - self.shape = None - self.init = None - self.allocator = None - self.regularizer = None - self._allocated = False - - self.configure(**kwargs) - - def configure(self, **kwargs): - for k, v in kwargs.items(): - getattr(self, k) # ensures the key already exists - setattr(self, k, v) - - @property - def size(self): - assert self.shape is not None - return np.prod(self.shape) - - def allocate(self, *args, **kwargs): - if self._allocated: - raise Exception("attempted to allocate existing weights") - self.configure(**kwargs) - - # intentionally not using isinstance - assert type(self.shape) == tuple, self.shape - - f, g = self.allocator(self.size) - assert len(f) == self.size, "{} != {}".format(f.shape, self.size) - assert len(g) == self.size, "{} != {}".format(g.shape, self.size) - f[:] = self.init(self.size, *args) - g[:] = self.init(self.size, *args) - self.f = f.reshape(self.shape) - self.g = g.reshape(self.shape) - - self._allocated = True - - def forward(self): - if self.regularizer is None: - return 0.0 - return self.regularizer.forward(self.f) - - def backward(self): - if self.regularizer is None: - return 0.0 - return self.regularizer.backward(self.f) - - def update(self): - if self.regularizer is None: - return - self.g += self.regularizer.backward(self.f) - -# Loss functions {{{1 - -class Loss: - pass - -class NLL(Loss): # Negative Log Likelihood - def forward(self, p, y): - correct = p * y - return np.mean(-correct) - - def backward(self, p, y): - return -y / len(p) - -class CategoricalCrossentropy(Loss): - # lifted from theano - - def __init__(self, eps=1e-6): - self.eps = _f(eps) - - def forward(self, p, y): - p = np.clip(p, self.eps, 1 - self.eps) - f = np.sum(-y * np.log(p) - (1 - y) * np.log(1 - p), axis=-1) - return np.mean(f) - - def backward(self, p, y): - p 
= np.clip(p, self.eps, 1 - self.eps) - df = (p - y) / (p * (1 - p)) - return df / len(y) - -class Accuracy(Loss): - # returns percentage of categories correctly predicted. - # utilizes argmax(), so it cannot be used for gradient descent. - # use CategoricalCrossentropy or NLL for that instead. - - def forward(self, p, y): - correct = np.argmax(p, axis=-1) == np.argmax(y, axis=-1) - return np.mean(correct) - - def backward(self, p, y): - raise NotImplementedError("cannot take the gradient of Accuracy") - -class ResidualLoss(Loss): - def forward(self, p, y): - return np.mean(self.f(p - y)) - - def backward(self, p, y): - ret = self.df(p - y) / len(y) - return ret - -class SquaredHalved(ResidualLoss): - def f(self, r): - return np.square(r) / 2 - - def df(self, r): - return r - -class Squared(ResidualLoss): - def f(self, r): - return np.square(r) - - def df(self, r): - return 2 * r - -class Absolute(ResidualLoss): - def f(self, r): - return np.abs(r) - - def df(self, r): - return np.sign(r) - -class Huber(ResidualLoss): - def __init__(self, delta=1.0): - self.delta = _f(delta) - - def f(self, r): - return np.where(r <= self.delta, - np.square(r) / 2, - self.delta * (np.abs(r) - self.delta / 2)) - - def df(self, r): - return np.where(r <= self.delta, - r, - self.delta * np.sign(r)) - -# Regularizers {{{1 - -class Regularizer: - pass - -class L1L2(Regularizer): - def __init__(self, l1=0.0, l2=0.0): - self.l1 = _f(l1) - self.l2 = _f(l2) - - def forward(self, X): - f = _0 - if self.l1: - f += np.sum(self.l1 * np.abs(X)) - if self.l2: - f += np.sum(self.l2 * np.square(X)) - return f - - def backward(self, X): - df = np.zeros_like(X) - if self.l1: - df += self.l1 * np.sign(X) - if self.l2: - df += self.l2 * 2 * X - return df - -# Optimizers {{{1 - -class Optimizer: - def __init__(self, lr=0.1): - self.lr = _f(lr) # learning rate - self.reset() - - def reset(self): - pass - - def compute(self, dW, W): - return -self.lr * dW - - def update(self, dW, W): - W += self.compute(dW, W) - -# some of the the following optimizers are blatantly lifted from tiny-dnn: -# https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/optimizers/optimizer.h - -class Momentum(Optimizer): - def __init__(self, lr=0.01, mu=0.9, nesterov=False): - self.mu = _f(mu) # momentum - self.nesterov = bool(nesterov) - - super().__init__(lr) - - def reset(self): - self.Vprev = None - - def compute(self, dW, W): - if self.Vprev is None: - self.Vprev = np.copy(dW) - - V = self.mu * self.Vprev - self.lr * dW - self.Vprev[:] = V - if self.nesterov: - return self.mu * V - self.lr * dW - - return V - -class Adagrad(Optimizer): - def __init__(self, lr=0.01, eps=1e-8): - self.eps = _f(eps) - - super().__init__(lr) - - def reset(self): - self.g = None - - def compute(self, dW, W): - if self.g is None: - self.g = np.zeros_like(dW) - - self.g += np.square(dW) - return -self.lr * dW / (np.sqrt(self.g) + self.eps) - -class RMSprop(Optimizer): - # RMSprop generalizes* Adagrad, etc. - - # * RMSprop == Adagrad when - # RMSprop.mu == 1 - - def __init__(self, lr=1e-4, mu=0.99, eps=1e-8): - self.mu = _f(mu) # decay term - self.eps = _f(eps) - - # one might consider the following equation when specifying mu: - # mu = e**(-1/t) - # default: t = -1/ln(0.99) = ~99.5 - # therefore the default of mu=0.99 means - # an input decays to 1/e its original amplitude over 99.5 batches. 
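- # for example, to average over roughly 200 batches instead,
- # one would pick mu = e**(-1/200) ~= 0.995.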
- # (this is from DSP, so how relevant it is in SGD is debatable) - - super().__init__(lr) - - def reset(self): - self.g = None - - def compute(self, dW, W): - if self.g is None: - self.g = np.zeros_like(dW) - - # basically apply a first-order low-pass filter to delta squared - self.g += (1 - self.mu) * (np.square(dW) - self.g) - - # finally sqrt it to complete the running root-mean-square approximation - return -self.lr * dW / (np.sqrt(self.g) + self.eps) - -class RMSpropCentered(Optimizer): - # referenced TensorFlow/PyTorch. - # paper: https://arxiv.org/pdf/1308.0850v5.pdf - - def __init__(self, lr=1e-4, aleph=0.95, momentum=0.9, eps=1e-8): - self.aleph = _f(aleph) - self.momentum = _f(momentum) - self.eps = _f(eps) - - super().__init__(lr) - - def reset(self): - self.g = None - self.mt = None - self.vt = None - self.delta = None - - def compute(self, dW, W): - if self.g is None: - self.g = np.zeros_like(dW) - if self.mt is None: - self.mt = np.zeros_like(dW) - if self.vt is None: - self.vt = np.zeros_like(dW) - if self.delta is None: - self.delta = np.zeros_like(dW) - - self.mt += (1 - self.aleph) * (dW - self.mt) - self.vt += (1 - self.aleph) * (np.square(dW) - self.vt) - - # PyTorch has the epsilon outside of the sqrt, - # TensorFlow and the paper have it within. - # in onn, we generally do it outside, as this seems to work better. - temp = dW / (np.sqrt(self.vt - np.square(self.mt)) + self.eps) - - # TensorFlow does it this way. - self.delta[:] = self.momentum * self.delta + self.lr * temp - return -self.delta - # PyTorch does it this way. - #self.delta[:] = self.momentum * self.delta + temp - #return -self.lr * self.delta - # they are equivalent only when LR is constant, which it might not be. - -class Adam(Optimizer): - # paper: https://arxiv.org/abs/1412.6980 - # Adam generalizes* RMSprop, and - # adds a decay term to the regular (non-squared) delta, and performs - # debiasing to compensate for the filtered deltas starting from zero. - - # * Adam == RMSprop when - # Adam.b1 == 0 - # Adam.b2 == RMSprop.mu - - def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8): - self.b1 = _f(b1) # decay term - self.b2 = _f(b2) # decay term - self.b1_t_default = _f(b1) # decay term power t - self.b2_t_default = _f(b2) # decay term power t - self.eps = _f(eps) - - super().__init__(lr) - - def reset(self): - self.mt = None - self.vt = None - self.b1_t = self.b1_t_default - self.b2_t = self.b2_t_default - - def compute(self, dW, W): - if self.mt is None: - self.mt = np.zeros_like(dW) - if self.vt is None: - self.vt = np.zeros_like(dW) - - # decay gain - self.b1_t *= self.b1 - self.b2_t *= self.b2 - - # filter - self.mt += (1 - self.b1) * (dW - self.mt) - self.vt += (1 - self.b2) * (np.square(dW) - self.vt) - - return -self.lr * (self.mt / (1 - self.b1_t)) \ - / (np.sqrt(self.vt / (1 - self.b2_t)) + self.eps) - -class Nadam(Optimizer): - # paper: https://arxiv.org/abs/1412.6980 - # paper: http://cs229.stanford.edu/proj2015/054_report.pdf - # TODO: double-check this implementation. also read the damn paper. 
- # lifted from https://github.com/fchollet/keras/blob/5d38b04/keras/optimizers.py#L530 - # lifted from https://github.com/jpilaul/IFT6266_project/blob/master/Models/Algo_Momentum.py - - def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8): - self.b1 = _f(b1) # decay term - self.b2 = _f(b2) # decay term - self.eps = _f(eps) - - super().__init__(lr) - - def reset(self): - self.mt = None - self.vt = None - self.t = 0 - self.sched = 1 - - def compute(self, dW, W): - self.t += 1 - - if self.mt is None: - self.mt = np.zeros_like(dW) - if self.vt is None: - self.vt = np.zeros_like(dW) - - ut0 = self.b1 * (1 - 0.5 * 0.96**(self.t + 0)) - ut1 = self.b1 * (1 - 0.5 * 0.96**(self.t + 1)) - - sched0 = self.sched * ut0 - sched1 = self.sched * ut0 * ut1 - self.sched = sched0 - - gp = dW / (1 - sched0) - - self.mt += (1 - self.b1) * (dW - self.mt) - self.vt += (1 - self.b2) * (np.square(dW) - self.vt) - - mtp = self.mt / (1 - sched1) - vtp = self.vt / (1 - self.b2**self.t) - - mt_bar = (1 - ut0) * gp + ut1 * mtp - - return -self.lr * mt_bar / (np.sqrt(vtp) + self.eps) - -# Abstract Layers {{{1 - -class Layer: - def __init__(self): - self.parents = [] - self.children = [] - self.weights = OrderedDict() - self.loss = None # for activity regularizers - self.input_shape = None - self.output_shape = None - kind = self.__class__.__name__ - global _layer_counters - _layer_counters[kind] += 1 - self.name = "{}_{}".format(kind, _layer_counters[kind]) - self.unsafe = False # disables assertions for better performance - self.shared = False # as in weight sharing - - def __str__(self): - return self.name - - # methods we might want to override: - - def forward(self, X): - raise NotImplementedError("unimplemented", self) - - def forward_deterministic(self, X): - return self.forward(X) - - def backward(self, dY): - raise NotImplementedError("unimplemented", self) - - def make_shape(self, parent): - if self.input_shape == None: - self.input_shape = parent.output_shape - if self.output_shape == None: - self.output_shape = self.input_shape - - def do_feed(self, child): - self.children.append(child) - - def be_fed(self, parent): - self.parents.append(parent) - - # TODO: better names for these (still) - def _propagate(self, edges, deterministic): - if not self.unsafe: - assert len(edges) == 1, self - if deterministic: - return self.forward_deterministic(edges[0]) - else: - return self.forward(edges[0]) - - def _backpropagate(self, edges): - if len(edges) == 1: - return self.backward(edges[0]) - return sum((self.backward(dY) for dY in edges)) - - # general utility methods: - - def is_compatible(self, parent): - return np.all(self.input_shape == parent.output_shape) - - def feed(self, child): - assert self.output_shape is not None, self - child.make_shape(self) - if not child.is_compatible(self): - fmt = "{} is incompatible with {}: shape mismatch: {} vs. {}" - raise LayerIncompatibility(fmt.format(self, child, self.output_shape, child.input_shape)) - self.do_feed(child) - child.be_fed(self) - return child - - def validate_input(self, X): - assert X.shape[1:] == self.input_shape, (str(self), X.shape[1:], self.input_shape) - - def validate_output(self, Y): - assert Y.shape[1:] == self.output_shape, (str(self), Y.shape[1:], self.output_shape) - - def _new_weights(self, name, **kwargs): - w = Weights(**kwargs) - assert name not in self.weights, name - self.weights[name] = w - return w - - def share(self, node): - self.weights = node.weights # TODO: this should be all it takes. 
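- # (it isn't quite, because layers also keep direct attributes such as
- # self.coeffs and self.biases pointing at their Weights objects;
- # the loop below re-binds those onto the shared node's.)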
- for k, v in self.weights.items(): - vs = getattr(node, k) # hack: key isn't necessarily attribute name! - setattr(self, k, vs) - self.shared = True - - def clear_grad(self): - for name, w in self.weights.items(): - w.g[:] = 0 - - @property - def size(self): - return sum((w.size for w in self.weights.values())) - - def init(self, allocator): - ins, outs = self.input_shape[0], self.output_shape[0] - for k, w in self.weights.items(): - w.allocate(ins, outs, allocator=allocator) - - def propagate(self, values, deterministic): - if not self.unsafe: - assert self.parents, self - edges = [] - for parent in self.parents: - if parent in values: - X = values[parent] - if not self.unsafe: - self.validate_input(X) - edges.append(X) - Y = self._propagate(edges, deterministic) - if not self.unsafe: - self.validate_output(Y) - return Y - - def backpropagate(self, values): - if not self.unsafe: - assert self.children, self - edges = [] - for child in self.children: - if child in values: - dY = values[child] - if not self.unsafe: - self.validate_output(dY) - edges.append(dY) - dX = self._backpropagate(edges) - if not self.unsafe: - self.validate_input(dX) - return dX - -# Nonparametric Layers {{{1 - -class Input(Layer): - def __init__(self, shape): - assert shape is not None - super().__init__() - self.shape = tuple(shape) - self.input_shape = self.shape - self.output_shape = self.shape - - def forward(self, X): - return X - - def backward(self, dY): - #self.dY = dY - return np.zeros_like(dY) - -class Reshape(Layer): - def __init__(self, new_shape): - super().__init__() - self.shape = tuple(new_shape) - self.output_shape = self.shape - - def forward(self, X): - self.batch_size = X.shape[0] - return X.reshape(self.batch_size, *self.output_shape) - - def backward(self, dY): - assert dY.shape[0] == self.batch_size - return dY.reshape(self.batch_size, *self.input_shape) - -class Flatten(Layer): - def make_shape(self, parent): - shape = parent.output_shape - self.input_shape = shape - self.output_shape = (np.prod(shape),) - - def forward(self, X): - self.batch_size = X.shape[0] - return X.reshape(self.batch_size, *self.output_shape) - - def backward(self, dY): - assert dY.shape[0] == self.batch_size - return dY.reshape(self.batch_size, *self.input_shape) - -class ConstAffine(Layer): - def __init__(self, a=1, b=0): - super().__init__() - self.a = _f(a) - self.b = _f(b) - - def forward(self, X): - return self.a * X + self.b - - def backward(self, dY): - return dY * self.a - -class Sum(Layer): - def _propagate(self, edges, deterministic): - return np.sum(edges, axis=0) - - def _backpropagate(self, edges): - #assert len(edges) == 1, "unimplemented" - return edges[0] # TODO: does this always work? 
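- # (the gradient of a sum with respect to each addend is just dY,
- # so passing the upstream edge through unchanged is correct for a
- # single child; with several children this would need to be sum(edges),
- # as the base class does.)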
- -class ActivityRegularizer(Layer): - def __init__(self, reg): - super().__init__() - assert isinstance(reg, Regularizer), reg - self.reg = reg - - def forward(self, X): - self.X = X - self.loss = np.sum(self.reg.forward(X)) - return X - - def backward(self, dY): - return dY + self.reg.backward(self.X) - -class Dropout(Layer): - def __init__(self, dropout=0.0): - super().__init__() - self.p = _f(1 - dropout) - assert 0 <= self.p <= 1 - - def forward(self, X): - self.mask = (np.random.rand(*X.shape) < self.p) / self.p - return X * self.mask - - def forward_deterministic(self, X): - #self.mask = _1 - return X - - def backward(self, dY): - return dY * self.mask - -# Activation Layers {{{2 - -class Identity(Layer): - def forward(self, X): - return X - - def backward(self, dY): - return dY - -class Sigmoid(Layer): # aka Logistic, Expit (inverse of Logit) - def forward(self, X): - self.sig = sigmoid(X) - return self.sig - - def backward(self, dY): - return dY * self.sig * (1 - self.sig) - -class Softplus(Layer): - # integral of Sigmoid. - - def forward(self, X): - self.X = X - return np.log(1 + np.exp(X)) - - def backward(self, dY): - return dY * sigmoid(self.X) - -class Tanh(Layer): - def forward(self, X): - self.sig = np.tanh(X) - return self.sig - - def backward(self, dY): - return dY * (1 - self.sig * self.sig) - -class LeCunTanh(Layer): - # paper: http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf - # paper: http://yann.lecun.com/exdb/publis/pdf/lecun-89.pdf - # scaled such that f([-1, 1]) = [-1, 1]. - # helps preserve an input variance of 1. - # second derivative peaks around an input of ±1. - - def forward(self, X): - self.sig = np.tanh(2 / 3 * X) - return 1.7159 * self.sig - - def backward(self, dY): - return dY * (2 / 3 * 1.7159) * (1 - self.sig * self.sig) - -class Relu(Layer): - def forward(self, X): - self.cond = X >= 0 - return np.where(self.cond, X, 0) - - def backward(self, dY): - return np.where(self.cond, dY, 0) - -class Elu(Layer): - # paper: https://arxiv.org/abs/1511.07289 - - def __init__(self, alpha=1): - super().__init__() - self.alpha = _f(alpha) # FIXME: unused - - def forward(self, X): - self.cond = X >= 0 - self.neg = np.exp(X) - 1 - return np.where(self.cond, X, self.neg) - - def backward(self, dY): - return dY * np.where(self.cond, 1, self.neg + 1) - -class GeluApprox(Layer): - # paper: https://arxiv.org/abs/1606.08415 - # plot: https://www.desmos.com/calculator/ydzgtccsld - - def forward(self, X): - self.a = 1.704 * X - self.sig = sigmoid(self.a) - return X * self.sig - - def backward(self, dY): - return dY * self.sig * (1 + self.a * (1 - self.sig)) - -class Softmax(Layer): - def forward(self, X): - alpha = np.max(X, axis=-1, keepdims=True) - num = np.exp(X - alpha) - den = np.sum(num, axis=-1, keepdims=True) - self.sm = num / den - return self.sm - - def backward(self, dY): - return (dY - np.sum(dY * self.sm, axis=-1, keepdims=True)) * self.sm - -class LogSoftmax(Softmax): - def __init__(self, eps=1e-6): - super().__init__() - self.eps = _f(eps) - - def forward(self, X): - return np.log(super().forward(X) + self.eps) - - def backward(self, dY): - return dY - np.sum(dY, axis=-1, keepdims=True) * self.sm - -class Cos(Layer): - # performs well on MNIST for some strange reason. - - def forward(self, X): - self.X = X - return np.cos(X) - - def backward(self, dY): - return dY * -np.sin(self.X) - -# Parametric Layers {{{1 - -class Bias(Layer): - # TODO: support axes other than -1 and shapes other than 1D. 
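- # (forward broadcasting already handles N-D inputs, but backward's
- # dY.sum(0) only reduces the batch axis, so anything beyond
- # (batch, features) would need e.g. dY.sum(axis=(0, 1, 2)) there.)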
- - serialized = { - 'b': 'biases', - } - - def __init__(self, init=init_zeros, reg_b=None): - super().__init__() - self.biases = self._new_weights('biases', init=init, regularizer=reg_b) - - def make_shape(self, parent): - shape = parent.output_shape - self.input_shape = shape - self.output_shape = shape - self.biases.shape = (shape[-1],) - - def forward(self, X): - return X + self.biases.f - - def backward(self, dY): - self.biases.g += dY.sum(0) - return dY - -class Dense(Layer): - serialized = { - 'W': 'coeffs', - 'b': 'biases', - } - - def __init__(self, dim, init=init_he_uniform, reg_w=None, reg_b=None): - super().__init__() - self.dim = int(dim) - self.output_shape = (dim,) - self.coeffs = self._new_weights('coeffs', init=init, regularizer=reg_w) - self.biases = self._new_weights('biases', init=init_zeros, regularizer=reg_b) - - def make_shape(self, parent): - shape = parent.output_shape - self.input_shape = shape - assert len(shape) == 1, shape - self.coeffs.shape = (shape[0], self.dim) - self.biases.shape = (1, self.dim) - - def forward(self, X): - self.X = X - return X @ self.coeffs.f + self.biases.f - - def backward(self, dY): - self.coeffs.g += self.X.T @ dY - self.biases.g += dY.sum(0, keepdims=True) - return dY @ self.coeffs.f.T - -# Models {{{1 - -class Model: - def __init__(self, nodes_in, nodes_out, loss=None, mloss=None, unsafe=False): - self.loss = loss if loss is not None else SquaredHalved() - self.mloss = mloss if mloss is not None else loss - - nodes_in = [nodes_in] if isinstance(nodes_in, Layer) else nodes_in - nodes_out = [nodes_out] if isinstance(nodes_out, Layer) else nodes_out - assert type(nodes_in) == list, type(nodes_in) - assert type(nodes_out) == list, type(nodes_out) - self.nodes_in = nodes_in - self.nodes_out = nodes_out - - self.nodes = traverse_all(self.nodes_in, self.nodes_out) - self.make_weights() - for node in self.nodes: - node.unsafe = unsafe - # TODO: handle the same layer being in more than one node. - - @property - def ordered_nodes(self): - # deprecated? we don't guarantee an order like we did before. - return self.nodes - - def make_weights(self): - self.param_count = sum((node.size for node in self.nodes if not node.shared)) - self.W = np.zeros(self.param_count, dtype=_f) - self.dW = np.zeros(self.param_count, dtype=_f) - - offset = 0 - for node in self.nodes: - if node.size > 0 and not node.shared: - inner_offset = 0 - - def allocate(size): - nonlocal inner_offset - o = offset + inner_offset - ret = self.W[o:o+size], self.dW[o:o+size] - inner_offset += size - assert len(ret[0]) == len(ret[1]) - assert size == len(ret[0]), (size, len(ret[0])) - return ret - - node.init(allocate) - assert inner_offset <= node.size, "Layer {} allocated more weights than it said it would".format(node) - # i don't care if "less" is grammatically incorrect. - # you're mom is grammatically incorrect. - assert inner_offset >= node.size, "Layer {} allocated less weights than it said it would".format(node) - offset += node.size - - def evaluate(self, input_, deterministic=True): - assert len(self.nodes_in) == 1, "ambiguous input in multi-input network; use evaluate_multi() instead" - assert len(self.nodes_out) == 1, "ambiguous output in multi-output network; use evaluate_multi() instead" - node_in = self.nodes_in[0] - node_out = self.nodes_out[0] - outputs = self.evaluate_multi({node_in: input_}, deterministic) - return outputs[node_out] - - def apply(self, error): # TODO: better name? 
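- # back-propagates an error signal (dloss/doutput) through the whole
- # graph and returns dloss/dinput; the single-input, single-output
- # convenience wrapper around apply_multi(), mirroring evaluate().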
- assert len(self.nodes_in) == 1, "ambiguous input in multi-input network; use apply_multi() instead" - assert len(self.nodes_out) == 1, "ambiguous output in multi-output network; use apply_multi() instead" - node_in = self.nodes_in[0] - node_out = self.nodes_out[0] - inputs = self.apply_multi({node_out: error}) - return inputs[node_in] - - def evaluate_multi(self, inputs, deterministic=True): - values = dict() - outputs = dict() - for node in self.nodes: - if node in self.nodes_in: - assert node in inputs, "missing input for node {}".format(node.name) - X = inputs[node] - values[node] = node._propagate(np.expand_dims(X, 0), deterministic) - else: - values[node] = node.propagate(values, deterministic) - if node in self.nodes_out: - outputs[node] = values[node] - return outputs - - def apply_multi(self, outputs): - values = dict() - inputs = dict() - for node in reversed(self.nodes): - if node in self.nodes_out: - assert node in outputs, "missing output for node {}".format(node.name) - X = outputs[node] - values[node] = node._backpropagate(np.expand_dims(X, 0)) - else: - values[node] = node.backpropagate(values) - if node in self.nodes_in: - inputs[node] = values[node] - return inputs - - def forward(self, inputs, outputs, measure=False, deterministic=False): - predicted = self.evaluate(inputs, deterministic=deterministic) - if measure: - error = self.mloss.forward(predicted, outputs) - else: - error = self.loss.forward(predicted, outputs) - return error, predicted - - def backward(self, predicted, outputs, measure=False): - if measure: - error = self.mloss.backward(predicted, outputs) - else: - error = self.loss.backward(predicted, outputs) - # input_delta is rarely useful; it's just to match the forward pass. - input_delta = self.apply(error) - return self.dW, input_delta - - def clear_grad(self): - for node in self.nodes: - node.clear_grad() - - def regulate_forward(self): - loss = _0 - for node in self.nodes: - if node.loss is not None: - loss += node.loss - for k, w in node.weights.items(): - loss += w.forward() - return loss - - def regulate(self): - for node in self.nodes: - for k, w in node.weights.items(): - w.update() - - def load_weights(self, fn): - # seemingly compatible with keras' Dense layers. - import h5py - open(fn) # just ensure the file exists (python's error is better) - f = h5py.File(fn, 'r') - weights = {} - def visitor(name, obj): - if isinstance(obj, h5py.Dataset): - weights[name.split('/')[-1]] = np.array(obj[:], dtype=_f) - f.visititems(visitor) - f.close() - - used = {} - for k in weights.keys(): - used[k] = False - - nodes = [node for node in self.nodes if node.size > 0] - # TODO: support shared weights. - for node in nodes: - full_name = str(node).lower() - for s_name, o_name in node.serialized.items(): - key = full_name + '_' + s_name - data = weights[key] - target = getattr(node, o_name) - target.f[:] = data - used[key] = True - - for k, v in used.items(): - if not v: - lament("WARNING: unused weight", k) - - def save_weights(self, fn, overwrite=False): - import h5py - f = h5py.File(fn, 'w') - - counts = defaultdict(lambda: 0) - - nodes = [node for node in self.nodes if node.size > 0] - # TODO: support shared weights. 
- for node in nodes: - full_name = str(node).lower() - grp = f.create_group(full_name) - for s_name, o_name in node.serialized.items(): - key = full_name + '_' + s_name - target = getattr(node, o_name) - data = grp.create_dataset(key, target.shape, dtype=_f) - data[:] = target.f - counts[key] += 1 - if counts[key] > 1: - lament("WARNING: rewrote weight", key) - - f.close() - - def print_graph(self, file=sys.stdout): - print('digraph G {', file=file) - for node in self.nodes: - children = [str(n) for n in node.children] - if children: - sep = '->' - print('\t' + str(node) + sep + (';\n\t' + str(node) + sep).join(children) + ';', file=file) - print('}', file=file) - -# Rituals {{{1 - -class Ritual: # i'm just making up names at this point. - def __init__(self, learner=None): - self.learner = learner if learner is not None else Learner(Optimizer()) - self.model = None - - def reset(self): - self.learner.reset(optim=True) - self.en = 0 - self.bn = 0 - - def learn(self, inputs, outputs): - error, predicted = self.model.forward(inputs, outputs) - self.model.backward(predicted, outputs) - self.model.regulate() - return error, predicted - - def update(self): - optim = self.learner.optim - optim.model = self.model - optim.update(self.model.dW, self.model.W) - - def prepare(self, model): - self.en = 0 - self.bn = 0 - self.model = model - - def _train_batch(self, batch_inputs, batch_outputs, b, batch_count, - test_only=False, loss_logging=False, mloss_logging=True): - if not test_only and self.learner.per_batch: - self.learner.batch(b / batch_count) - - if test_only: - predicted = self.model.evaluate(batch_inputs, deterministic=True) - else: - error, predicted = self.learn(batch_inputs, batch_outputs) - self.model.regulate_forward() - self.update() - - if loss_logging: - batch_loss = self.model.loss.forward(predicted, batch_outputs) - if np.isnan(batch_loss): - raise Exception("nan") - self.losses.append(batch_loss) - self.cumsum_loss += batch_loss - - if mloss_logging: - # NOTE: this can use the non-deterministic predictions. fixme? 
- batch_mloss = self.model.mloss.forward(predicted, batch_outputs) - if np.isnan(batch_mloss): - raise Exception("nan") - self.mlosses.append(batch_mloss) - self.cumsum_mloss += batch_mloss - - def train_batched(self, inputs_or_generator, outputs_or_batch_count, - batch_size=None, - return_losses=False, test_only=False, shuffle=True, - clear_grad=True): - assert isinstance(return_losses, bool) or return_losses == 'both' - assert self.model is not None - - gen = isinstance(inputs_or_generator, types.GeneratorType) - if gen: - generator = inputs_or_generator - batch_count = outputs_or_batch_count - assert isinstance(batch_count, int), type(batch_count) - else: - inputs = inputs_or_generator - outputs = outputs_or_batch_count - - if not test_only: - self.en += 1 - - if shuffle: - if gen: - raise Exception("shuffling is incompatibile with using a generator.") - indices = np.arange(inputs.shape[0]) - np.random.shuffle(indices) - inputs = inputs[indices] - outputs = outputs[indices] - - self.cumsum_loss, self.cumsum_mloss = _0, _0 - self.losses, self.mlosses = [], [] - - if not gen: - batch_count = inputs.shape[0] // batch_size - # TODO: lift this restriction - assert inputs.shape[0] % batch_size == 0, \ - "inputs is not evenly divisible by batch_size" - - prev_batch_size = None - for b in range(batch_count): - if not test_only: - self.bn += 1 - - if gen: - batch_inputs, batch_outputs = next(generator) - batch_size = batch_inputs.shape[0] - # TODO: lift this restriction - assert batch_size == prev_batch_size or prev_batch_size is None, \ - "non-constant batch size (got {}, expected {})".format(batch_size, prev_batch_size) - else: - bi = b * batch_size - batch_inputs = inputs[ bi:bi+batch_size] - batch_outputs = outputs[bi:bi+batch_size] - - if clear_grad: - self.model.clear_grad() - self._train_batch(batch_inputs, batch_outputs, b, batch_count, - test_only, return_losses=='both', return_losses) - - prev_batch_size = batch_size - - avg_mloss = self.cumsum_mloss / _f(batch_count) - if return_losses == 'both': - avg_loss = self.cumsum_loss / _f(batch_count) - return avg_loss, avg_mloss, self.losses, self.mlosses - elif return_losses: - return avg_mloss, self.mlosses - return avg_mloss - - def test_batched(self, inputs, outputs, *args, **kwargs): - return self.train_batched(inputs, outputs, *args, - test_only=True, **kwargs) - - def train_batched_gen(self, generator, batch_count, *args, **kwargs): - return self.train_batched(generator, batch_count, *args, - shuffle=False, **kwargs) - -# Learners {{{1 - -class Learner: - per_batch = False - - def __init__(self, optim, epochs=100, rate=None): - assert isinstance(optim, Optimizer) - self.optim = optim - self.start_rate = rate # None is okay; it'll use optim.lr instead. - self.epochs = int(epochs) - self.reset() - - def reset(self, optim=False): - self.started = False - self.epoch = 0 - if optim: - self.optim.reset() - - @property - def epoch(self): - return self._epoch - - @epoch.setter - def epoch(self, new_epoch): - self._epoch = int(new_epoch) - if 0 <= self.epoch <= self.epochs: - self.rate = self.rate_at(self._epoch) - - @property - def rate(self): - return self.optim.lr - - @rate.setter - def rate(self, new_rate): - self.optim.lr = new_rate - - def rate_at(self, epoch): - if self.start_rate is None: - return self.optim.lr - return self.start_rate - - def next(self): - # prepares the next epoch. returns whether or not to continue training. 
- if not self.started: - self.started = True - self.epoch += 1 - if self.epoch > self.epochs: - return False - return True - - def batch(self, progress): # TODO: rename - # interpolates rates between epochs. - # unlike epochs, we do not store batch number as a state. - # i.e. calling next() will not respect progress. - assert 0 <= progress <= 1 - self.rate = self.rate_at(self._epoch + progress) - - @property - def final_rate(self): - return self.rate_at(self.epochs - 1e-8) - -class AnnealingLearner(Learner): - def __init__(self, optim, epochs=100, rate=None, halve_every=10): - self.halve_every = _f(halve_every) - self.anneal = _f(0.5**(1/self.halve_every)) - super().__init__(optim, epochs, rate) - - def rate_at(self, epoch): - return super().rate_at(epoch) * self.anneal**epoch - -def cosmod(x): - # plot: https://www.desmos.com/calculator/hlgqmyswy2 - return (_1 + np.cos((x % _1) * _pi)) * _inv2 - -class SGDR(Learner): - # Stochastic Gradient Descent with Restarts - # paper: https://arxiv.org/abs/1608.03983 - # NOTE: this is missing a couple of the proposed features. - - per_batch = True - - def __init__(self, optim, epochs=100, rate=None, - restarts=0, restart_decay=0.5, callback=None, - expando=0): - self.restart_epochs = int(epochs) - self.decay = _f(restart_decay) - self.restarts = int(restarts) - self.restart_callback = callback - - # TODO: rename expando to something not insane - self.expando = expando if expando is not None else lambda i: i - if type(self.expando) == int: - inc = self.expando - self.expando = lambda i: i * inc - - self.splits = [] - epochs = 0 - for i in range(0, self.restarts + 1): - split = epochs + self.restart_epochs + int(self.expando(i)) - self.splits.append(split) - epochs = split - super().__init__(optim, epochs, rate) - - def split_num(self, epoch): - previous = [0] + self.splits - for i, split in enumerate(self.splits): - if epoch - 1 < split: - sub_epoch = epoch - previous[i] - next_restart = split - previous[i] - return i, sub_epoch, next_restart - raise Exception('this should never happen.') - - def rate_at(self, epoch): - base_rate = self.start_rate if self.start_rate is not None else self.optim.lr - restart, sub_epoch, next_restart = self.split_num(max(1, epoch)) - x = _f(sub_epoch - 1) / _f(next_restart) - return base_rate * self.decay**_f(restart) * cosmod(x) - - def next(self): - if not super().next(): - return False - restart, sub_epoch, next_restart = self.split_num(self.epoch) - if restart > 0 and sub_epoch == 1: - if self.restart_callback is not None: - self.restart_callback(restart) - return True - -class TriangularCLR(Learner): - per_batch = True - - def __init__(self, optim, epochs=400, upper_rate=None, lower_rate=0, - frequency=100, callback=None): - # NOTE: start_rate is treated as upper_rate - self.frequency = int(frequency) - assert self.frequency > 0 - self.callback = callback - self.lower_rate = _f(lower_rate) - super().__init__(optim, epochs, upper_rate) - - def _t(self, epoch): - # NOTE: this could probably be simplified - offset = self.frequency / 2 - return np.abs(((epoch - 1 + offset) % self.frequency) - offset) / offset - - def rate_at(self, epoch): - upper_rate = self.start_rate if self.start_rate is not None else self.optim.lr - return self._t(epoch) * (upper_rate - self.lower_rate) + self.lower_rate - - def next(self): - if not super().next(): - return False - e = self.epoch - 1 - if e > 0 and e % self.frequency == 0: - if self.callback is not None: - self.callback(self.epoch // self.frequency) - return True - -class 
SineCLR(TriangularCLR): - def _t(self, epoch): - return np.sin(_pi * _inv2 * super()._t(epoch)) - -class WaveCLR(TriangularCLR): - def _t(self, epoch): - return _inv2 * (_1 - np.cos(_pi * super()._t(epoch)))
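-
- # note on the three CLR variants: TriangularCLR's _t() is a triangle
- # wave in [0, 1] with a period of `frequency` epochs; SineCLR warps it
- # with sin(pi/2 * t) and WaveCLR with (1 - cos(pi * t)) / 2, so all
- # three map 0 -> 0 and 1 -> 1, but SineCLR flattens the curve near the
- # upper rate and WaveCLR flattens it at both extremes.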