# optim/onn.py
# (page metadata from the source viewer: 1430 lines, 47 KiB, Python, executable file)
#!/usr/bin/env python3
# external packages required for full functionality:
# numpy scipy h5py sklearn dotmap
# BIG TODO: ensure numpy isn't upcasting to float64 *anywhere*.
# this is gonna take some work.
from onn_core import *
from onn_core import _check, _f, _0, _1
import sys
# tracks whether the previous log() call was an in-place update,
# so consecutive updates overwrite one another on the terminal.
_log_was_update = False
def log(left, right, update=False):
    """Print a bold, right-aligned label and a value via lament().

    When update=True and the previous call was also an update,
    move the cursor up one line and overwrite the previous entry.
    """
    global _log_was_update
    line = "\x1B[1m {:>20}:\x1B[0m {}".format(left, right)
    if update and _log_was_update:
        # '\x1B[F' moves the cursor to the start of the previous line.
        lament('\x1B[F' + line)
    else:
        lament(line)
    _log_was_update = update
class Dummy:
    # empty class; presumably used as an ad-hoc attribute namespace.
    pass
# Math Utilities {{{1
def rolling(a, window):
    """Return a zero-copy (a.size - window + 1, window) view of
    overlapping windows over the 1-d array `a`.

    Each row is offset from the previous by one element.
    http://stackoverflow.com/a/4924433
    """
    rows = a.size - window + 1
    step = a.itemsize
    return np.lib.stride_tricks.as_strided(
        a, shape=(rows, window), strides=(step, step))
def rolling_batch(a, window):
    """Like `rolling`, but windows the last axis of each batch element
    (axis 0) of a contiguous array, as a zero-copy strided view."""
    out_shape = (a.shape[0], a.shape[-1] - window + 1, window)
    batch_stride = np.prod(a.shape[1:]) * a.itemsize
    out_strides = (batch_stride, a.itemsize, a.itemsize)
    return np.lib.stride_tricks.as_strided(a, shape=out_shape, strides=out_strides)
# Initializations {{{1
def init_gaussian_unit(size, ins, outs):
    """Gaussian init with variance 1/ins (LeCun-style; `outs` is unused,
    kept for signature parity with the other initializers).

    BUGFIX: np.random.normal always returns float64; cast the result to
    _f to honor the file-wide "no float64 upcasting" policy.
    """
    s = np.sqrt(1 / ins)
    return np.random.normal(0, s, size=size).astype(_f)
# Loss functions {{{1
class SomethingElse(ResidualLoss):
    # generalizes Absolute and SquaredHalved:
    # loss is (a/2) * |r|^(2/a), so a=2 gives MAE and a=1 gives SquaredHalved.
    # plot: https://www.desmos.com/calculator/fagjg9vuz7
    def __init__(self, a=4/3):
        assert 1 <= a <= 2, "parameter out of range"
        self.a = _f(a / 2)      # overall scale.
        self.b = _f(2 / a)      # exponent in the loss.
        self.c = _f(2 / a - 1)  # exponent in the derivative.
    def f(self, r):
        mag = np.abs(r)
        return self.a * mag**self.b
    def df(self, r):
        mag = np.abs(r)
        return np.sign(r) * mag**self.c
class Confidence(Loss):
    # this isn't "confidence" in any meaningful way; (e.g. Bayesian)
    # it's just a metric of how large the value is of the predicted class.
    # when using it for loss, it acts like a crappy regularizer.
    # it really just measures how much of a hot-shot the network thinks it is.
    def forward(self, p, y=None):
        # p: predicted distribution; y is ignored (this is unsupervised).
        categories = p.shape[-1]
        # rescaled so that a uniform prediction scores 0 and a one-hot scores 1.
        confidence = (np.max(p, axis=-1) - 1/categories) / (1 - 1/categories)
        # the exponent in softmax puts a maximum on confidence,
        # but we don't compensate for that. if necessary,
        # it'd be better to use an activation that doesn't have this limit.
        return np.mean(confidence)
    def backward(self, p, y=None):
        # in order to agree with the forward pass,
        # using this backwards pass as-is will minimize confidence.
        categories = p.shape[-1]
        # NOTE(review): this is not the exact gradient of forward() — it is
        # scaled by p rather than constant on the max element; presumably
        # intentional, per the comment above. confirm before "fixing".
        detc = p / categories / (1 - 1/categories)
        # boolean mask selecting the argmax element(s) of each row.
        dmax = p == np.max(p, axis=-1, keepdims=True)
        return detc * dmax
# Regularizers {{{1
class SaturateRelu(Regularizer):
    # activity regularizer that penalizes positive (relu-passing) activations.
    # paper: https://arxiv.org/abs/1703.09202
    # TODO: test this (and ActivityRegularizer) more thoroughly.
    # i've looked at the histogram of the resulting weights.
    # it seems like only the layers after this are affected
    # the way they should be.
    def __init__(self, lamb=0.0):
        # lamb: regularization strength.
        self.lamb = _f(lamb)
    def forward(self, X):
        # penalty: lamb * relu(X); negative activations are free.
        return self.lamb * np.where(X >= 0, X, 0)
    def backward(self, X):
        # subgradient: lamb on the nonnegative side, zero elsewhere.
        return self.lamb * np.where(X >= 0, 1, 0)
# Optimizers {{{1
class FTML(Optimizer):
    # Follow The Moving Leader.
    # paper: http://www.cse.ust.hk/~szhengac/papers/icml17.pdf
    # author's implementation: https://github.com/szhengac/optim/commit/923555e
    def __init__(self, lr=0.0025, b1=0.6, b2=0.999, eps=1e-8):
        self.iterations = _0
        self.b1 = _f(b1) # decay term
        self.b2 = _f(b2) # decay term
        self.eps = _f(eps)
        super().__init__(lr)
    def reset(self):
        # per-weight state; lazily allocated on the first compute() call.
        self.dt1 = None  # previous step's dt.
        self.dt = None
        self.vt = None   # second-moment accumulator, as in Adam.
        self.zt = None   # the paper's "z" accumulator.
        self.b1_t = _1   # running powers of b1/b2 for bias correction.
        self.b2_t = _1
    def compute(self, dW, W):
        # allocate state lazily to match the shape of the weight vector.
        if self.dt1 is None: self.dt1 = np.zeros_like(dW)
        if self.dt is None: self.dt = np.zeros_like(dW)
        if self.vt is None: self.vt = np.zeros_like(dW)
        if self.zt is None: self.zt = np.zeros_like(dW)
        # NOTE: we could probably rewrite these equations to avoid this copy.
        self.dt1[:] = self.dt[:]
        self.b1_t *= self.b1
        self.b2_t *= self.b2
        # avoid division by zero when lr is 0.
        # hardly an elegant solution.
        lr = max(self.lr, self.eps)
        # same as Adam's vt.
        self.vt[:] = self.b2 * self.vt + (1 - self.b2) * dW * dW
        # bias-corrected RMS term.
        # you can factor "inner" out of Adam as well.
        inner = np.sqrt(self.vt / (1 - self.b2_t)) + self.eps
        self.dt[:] = (1 - self.b1_t) / lr * inner
        sigma_t = self.dt - self.b1 * self.dt1
        # Adam's mt minus the sigma term.
        self.zt[:] = self.b1 * self.zt + (1 - self.b1) * dW - sigma_t * W
        # subtract by weights to avoid having to override self.update.
        return -self.zt / self.dt - W
class MomentumClip(Optimizer):
    # SGD with classical or Nesterov momentum, plus global-norm
    # gradient clipping applied before the momentum update.
    def __init__(self, lr=0.01, mu=0.9, nesterov=False, clip=1.0, debug=False):
        self.mu = _f(mu)
        self.clip = _f(clip)
        self.nesterov = bool(nesterov)
        self.debug = bool(debug)
        super().__init__(lr)
    def reset(self):
        self.accum = None
    def compute(self, dW, W):
        if self.accum is None:
            self.accum = np.zeros_like(dW)
        # rescale the gradient in-place when its norm exceeds the threshold.
        norm = np.linalg.norm(dW)
        scale = self.clip / (norm + 1e-6)
        if scale < 1:
            if self.debug:
                lament("clipping gradients; norm: {:10.5f}".format(norm))
            dW *= scale
        self.accum[:] = self.accum * self.mu + dW
        if self.nesterov:
            # look one momentum step ahead.
            return -self.lr * (self.accum * self.mu + dW)
        return -self.lr * self.accum
class YellowFin(Optimizer):
    # tunes momentum-SGD's lr and mu automatically from gradient statistics.
    # paper: https://arxiv.org/abs/1706.03471
    # knowyourmeme: http://cs.stanford.edu/~zjian/project/YellowFin/
    # author's implementation: https://github.com/JianGoForIt/YellowFin/blob/master/tuner_utils/yellowfin.py
    # code lifted: https://gist.github.com/botev/f8b32c00eafee222e47393f7f0747666
    def __init__(self, lr=0.1, mu=0.0, beta=0.999, window_size=20,
                 debias=True, clip=1.0):
        self.lr_default = _f(lr)
        self.mu_default = _f(mu)
        self.beta = _f(beta)  # decay factor shared by all the low-pass filters.
        self.window_size = int(window_size) # curv_win_width
        self.debias_enabled = bool(debias)
        self.clip = _f(clip)  # gradient-norm clipping threshold.
        self.mu = _f(mu) # momentum
        super().__init__(lr)
    def reset(self):
        self.accum = None
        self.lr = self.lr_default
        self.mu = self.mu_default
        self.step = 0
        self.beta_t = self.beta
        # rolling window of recent squared gradient norms (curvature range).
        self.curv_win = np.zeros([self.window_size,], dtype=np.float32)
        self.h_min = None
        self.h_max = None
        # exponential moving average (1-pole low-pass) states.
        self.g_lpf = 0
        #self.g_squared_lpf = 0
        self.g_norm_squared_lpf = 0
        self.g_norm_lpf = 0
        self.h_min_lpf = 0
        self.h_max_lpf = 0
        self.dist_lpf = 0
        self.lr_lpf = 0
        self.mu_lpf = 0
    def get_lr_mu(self):
        # solve the paper's cubic for the optimal momentum, then derive
        # the learning rate from it.
        p = (np.square(self.dist_avg) * np.square(self.h_min)) / (2 * self.g_var)
        w3 = p * (np.sqrt(0.25 + p / 27.0) - 0.5)
        w = np.power(w3, 1/3)
        y = w - p / (3 * w)
        sqrt_mu1 = y + 1
        # alternative momentum bound from the curvature ("dynamic range") ratio.
        sqrt_h_min = np.sqrt(self.h_min)
        sqrt_h_max = np.sqrt(self.h_max)
        sqrt_mu2 = (sqrt_h_max - sqrt_h_min) / (sqrt_h_max + sqrt_h_min)
        sqrt_mu = max(sqrt_mu1, sqrt_mu2)
        if sqrt_mu2 > sqrt_mu1:
            print('note: taking dr calculation. something may have exploded.')
        lr = np.square(1 - sqrt_mu) / self.h_min
        mu = np.square(sqrt_mu)
        return lr, mu
    def compute(self, dW, W):
        if self.accum is None:
            self.accum = np.zeros_like(dW)
        # TODO: prevent allocations everywhere by using [:].
        # assuming that really works. i haven't actually checked.
        # clip the gradient by global norm before gathering statistics.
        total_norm = np.linalg.norm(dW)
        clip_scale = self.clip / (total_norm + 1e-6)
        if clip_scale < 1:
            #print("clipping gradients; norm: {:10.5f}".format(total_norm))
            dW *= clip_scale
        #fmt = 'W std: {:10.7f}e-3, dWstd: {:10.7f}e-3, V std: {:10.7f}e-3'
        #print(fmt.format(np.std(W), np.std(dW) * 100, np.std(V) * 100))
        b = self.beta
        m1b = 1 - self.beta
        # Adam-style bias correction for the EMAs.
        debias = 1 / (1 - self.beta_t) if self.debias_enabled else 1
        g = dW
        g_squared = np.square(g)
        g_norm_squared = np.sum(g_squared)
        g_norm = np.sqrt(g_norm_squared)
        # track min/max curvature estimates over the rolling window.
        self.curv_win[self.step % self.window_size] = g_norm_squared
        valid_window = self.curv_win[:min(self.window_size, self.step + 1)]
        h_min_t = np.min(valid_window)
        h_max_t = np.max(valid_window)
        # update all the low-pass filters with this step's measurements.
        self.g_lpf = b * self.g_lpf + m1b * g
        #self.g_squared_lpf = b * self.g_squared_lpf + m1b * g_squared
        self.g_norm_squared_lpf = b * self.g_norm_squared_lpf + m1b * g_norm_squared
        self.g_norm_lpf = b * self.g_norm_lpf + m1b * g_norm
        self.h_min_lpf = b * self.h_min_lpf + m1b * h_min_t
        self.h_max_lpf = b * self.h_max_lpf + m1b * h_max_t
        g_avg = debias * self.g_lpf
        #g_squared_avg = debias * self.g_squared_lpf
        g_norm_squared_avg = debias * self.g_norm_squared_lpf
        g_norm_avg = debias * self.g_norm_lpf
        self.h_min = debias * self.h_min_lpf
        self.h_max = debias * self.h_max_lpf
        assert self.h_max >= self.h_min
        # estimated distance to the optimum, and gradient variance.
        dist = g_norm_avg / g_norm_squared_avg
        self.dist_lpf = b * self.dist_lpf + m1b * dist
        self.dist_avg = debias * self.dist_lpf
        self.g_var = g_norm_squared_avg - np.sum(np.square(g_avg))
        # equivalently:
        #self.g_var = np.sum(np.abs(g_squared_avg - np.square(g_avg)))
        # tune lr and mu from the statistics, skipping the very first step
        # where the estimates are not yet defined.
        if self.step > 0:
            lr_for_real, mu_for_real = self.get_lr_mu()
            self.mu_lpf = b * self.mu_lpf + m1b * mu_for_real
            self.lr_lpf = b * self.lr_lpf + m1b * lr_for_real
            self.mu = debias * self.mu_lpf
            self.lr = debias * self.lr_lpf
        # plain momentum step using the tuned lr and mu.
        self.accum[:] = self.accum * self.mu - self.lr * dW
        V = self.accum
        self.step += 1
        self.beta_t *= self.beta
        return V
class AddSign(Optimizer):
    # scales the gradient by (alpha + sign agreement with its momentum).
    # paper: https://arxiv.org/abs/1709.07417
    def __init__(self, lr=0.01, mu=0.9, alpha=1):
        self.mu = _f(mu)
        self.alpha = _f(alpha)
        super().__init__(lr)
    def reset(self):
        self.accum = None
    def compute(self, dW, W):
        if self.accum is None:
            self.accum = np.zeros_like(dW)
        self.accum[:] = self.accum * self.mu + dW
        # +1 where the gradient agrees in sign with its running average,
        # -1 where it disagrees, 0 where either is zero.
        agreement = np.sign(dW) * np.sign(self.accum)
        #agreement *= decay
        return -self.lr * dW * (self.alpha + agreement)
class PowerSign(Optimizer):
    # scales the gradient by alpha^(sign agreement with its momentum).
    # paper: https://arxiv.org/abs/1709.07417
    def __init__(self, lr=0.01, mu=0.9, alpha=np.e):
        self.mu = _f(mu)
        self.alpha = _f(alpha)
        # alpha == e lets us use np.exp directly.
        self.use_exp = np.isclose(self.alpha, _f(np.e))
        super().__init__(lr)
    def reset(self):
        self.accum = None
    def compute(self, dW, W):
        if self.accum is None:
            self.accum = np.zeros_like(dW)
        self.accum[:] = self.accum * self.mu + dW
        # +1 on sign agreement with the running average, -1 on disagreement.
        agreement = np.sign(dW) * np.sign(self.accum)
        #agreement *= decay
        if self.use_exp:
            scale = np.exp(agreement)
        else:
            scale = np.power(self.alpha, agreement)
        return -self.lr * dW * scale
class Neumann(Optimizer):
    # paper: https://arxiv.org/abs/1712.03298
    # NOTE: this implementation is missing resetting as described in the paper.
    #       resetting is totally disabled for now.
    # NOTE: this implementation does not use vanilla SGD for its first epochs.
    #       you should do this yourself if you need it.
    #       it seems like using a Learner like SineCLR makes this unnecessary.
    def __init__(self, lr=0.01):
        self.alpha = _f(1e-7)  # cubic.
        self.beta = _f(1e-5)  # repulsive. NOTE: multiplied by len(dW) later.
        self.gamma = _f(0.99)  # EMA, or 1-pole low-pass parameter. same thing.
        # momentum is ∝ (in the shape of) 1 - 1/(1 + t)
        self.mu_min = _f(0.5)
        self.mu_max = _f(0.9)
        self.reset_period = 0  # TODO
        super().__init__(lr)
    def reset(self):
        # NOTE: mt and vt are different than the pair in Adam-like optimizers.
        self.mt = None  # momentum accumulator.
        self.vt = None  # weight accumulator.
        self.t = 0
    def compute(self, dW, W):
        # this optimizer mutates W directly in update() instead.
        raise Exception("compute() is not available for this Optimizer.")
    def update(self, dW, W):
        self.t += 1
        if self.mt is None:
            self.mt = np.zeros_like(dW)
        if self.vt is None:
            self.vt = np.zeros_like(dW)
        # periodic reset (disabled while reset_period == 0).
        if self.reset_period > 0 and (self.t - 1) % self.reset_period == 0:
            self.mt = -self.lr * dW
            return
        # momentum ramps from mu_min (at t=1) toward mu_max (as t grows),
        # following the shape of 1 - 1/(1 + t).
        # BUGFIX: this used to be (mu + mu_min) * (mu_max - mu_min), which
        # maps t=1..inf onto [0.2, 0.6) instead of the intended
        # [mu_min, mu_max) = [0.5, 0.9) range.
        mu = _1 - _1/_f(self.t) # the + 1 is implicit.
        mu = self.mu_min + mu * (self.mu_max - self.mu_min)
        # regularized descent direction: the cubic term pulls W toward the
        # weight average vt, the repulsive term pushes away when too close.
        delta = W - self.vt
        delta_norm_squared = np.square(delta).sum()
        delta_norm = np.sqrt(delta_norm_squared)
        alpha = self.alpha
        beta = self.beta * dW.size
        cubic_reg = alpha * delta_norm_squared
        repulsive_reg = beta / delta_norm_squared
        dt = dW + (cubic_reg - repulsive_reg) * (delta / delta_norm)
        # momentum step, applied to W in-place.
        self.mt = mu * self.mt - self.lr * dt
        W += mu * self.mt - self.lr * dt
        # EMA of the weights, used by the regularizers above.
        self.vt = W + self.gamma * (self.vt - W)
# Nonparametric Layers {{{1
class AlphaDropout(Layer):
    # to be used alongside Selu activations.
    # drops units to the selu saturation value (-lamb * alpha) rather than
    # zero, then applies an affine correction (a, b) to the whole output.
    # paper: https://arxiv.org/abs/1706.02515
    def __init__(self, dropout=0.0, alpha=1.67326324, lamb=1.05070099):
        super().__init__()
        self.alpha = _f(alpha)
        self.lamb = _f(lamb)
        self.saturated = -self.lamb * self.alpha
        # NOTE: this assignment runs the property setter below, which
        # depends on self.saturated having been set first.
        self.dropout = _f(dropout)
    @property
    def dropout(self):
        return self._dropout
    @dropout.setter
    def dropout(self, x):
        self._dropout = _f(x)
        # q is the keep probability.
        self.q = 1 - self._dropout
        assert 0 <= self.q <= 1
        sat = self.saturated
        # affine coefficients derived in the paper to keep the output
        # distribution self-normalizing under dropout.
        self.a = 1 / np.sqrt(self.q + sat * sat * self.q * self._dropout)
        self.b = -self.a * (self._dropout * sat)
    def forward(self, X):
        # keep each unit with probability q; dropped units saturate.
        self.mask = np.random.rand(*X.shape) < self.q
        return self.a * np.where(self.mask, X, self.saturated) + self.b
    def forward_deterministic(self, X):
        # at inference time, alpha-dropout is the identity.
        return X
    def backward(self, dY):
        return dY * self.a * self.mask
class Decimate(Layer):
    # simple decimaton layer that drops every other sample from the last axis.
    def __init__(self, phase='even'):
        super().__init__()
        # phase is the set of samples we keep in the forward pass.
        assert phase in ('even', 'odd'), phase
        self.phase = phase
    def make_shape(self, parent):
        shape = parent.output_shape
        self.input_shape = shape
        # 'even' keeps indices 0,2,4,..., so it gets the extra sample
        # when the axis length is odd.
        divy = (shape[-1] + 1) // 2 if self.phase == 'even' else shape[-1] // 2
        self.output_shape = tuple(list(shape[:-1]) + [divy])
        # NOTE(review): self.dX appears unused — backward() allocates its own.
        self.dX = np.zeros(self.input_shape, dtype=_f)
    def forward(self, X):
        self.batch_size = X.shape[0]
        # NOTE(review): the flat [::2] stride only lines up per-sample when
        # the per-batch-element size is even; confirm behavior otherwise.
        if self.phase == 'even':
            return X.ravel()[0::2].reshape(self.batch_size, *self.output_shape)
        elif self.phase == 'odd':
            return X.ravel()[1::2].reshape(self.batch_size, *self.output_shape)
    def backward(self, dY):
        assert dY.shape[0] == self.batch_size
        # scatter the gradient back into the kept positions; dropped
        # positions get zero gradient.
        dX = np.zeros((self.batch_size, *self.input_shape), dtype=_f)
        if self.phase == 'even':
            dX.ravel()[0::2] = dY.ravel()
        elif self.phase == 'odd':
            dX.ravel()[1::2] = dY.ravel()
        return dX
class Undecimate(Layer):
    # inverse operation of Decimate. not quite interpolation.
    # doubles the last axis, filling the skipped positions with zeros.
    def __init__(self, phase='even'):
        super().__init__()
        # phase is the set of samples we keep in the backward pass.
        assert phase in ('even', 'odd'), phase
        self.phase = phase
    def make_shape(self, parent):
        shape = parent.output_shape
        self.input_shape = shape
        mult = shape[-1] * 2
        self.output_shape = tuple(list(shape[:-1]) + [mult])
    def forward(self, X):
        self.batch_size = X.shape[0]
        # scatter the input into every other position of a zeroed output.
        Y = np.zeros((self.batch_size, *self.output_shape), dtype=_f)
        if self.phase == 'even':
            Y.ravel()[0::2] = X.ravel()
        elif self.phase == 'odd':
            Y.ravel()[1::2] = X.ravel()
        return Y
    def backward(self, dY):
        assert dY.shape[0] == self.batch_size
        # gather the gradient back from the positions we populated.
        if self.phase == 'even':
            return dY.ravel()[0::2].reshape(self.batch_size, *self.input_shape)
        elif self.phase == 'odd':
            return dY.ravel()[1::2].reshape(self.batch_size, *self.input_shape)
# Activations {{{2
class Selu(Layer):
    # scaled exponential linear unit.
    # paper: https://arxiv.org/abs/1706.02515
    def __init__(self, alpha=1.67326324, lamb=1.05070099):
        super().__init__()
        self.alpha = _f(alpha)
        self.lamb = _f(lamb)
    def forward(self, X):
        # remember the sign mask and exponential term for backward().
        self.cond = X >= 0
        self.neg = self.alpha * np.exp(X)
        return self.lamb * np.where(self.cond, X, self.neg - self.alpha)
    def backward(self, dY):
        # derivative: lamb where X >= 0, lamb * alpha * exp(X) elsewhere.
        return dY * self.lamb * np.where(self.cond, 1, self.neg)
class TanhTest(Layer):
    # experimental scaled tanh: y = 2.4004 * tanh(x / 2).
    # NOTE(review): the 2.4004 constant's origin isn't documented here;
    # presumably tuned empirically — confirm before relying on it.
    def forward(self, X):
        self.sig = np.tanh(1 / 2 * X)
        return 2.4004 * self.sig
    def backward(self, dY):
        # d/dx [c * tanh(x/2)] = (c/2) * (1 - tanh(x/2)^2)
        return dY * (1 / 2 * 2.4004) * (1 - self.sig * self.sig)
class ExpGB(Layer):
    # an output layer for one-hot classification problems.
    # use with MSE (SquaredHalved), not CategoricalCrossentropy!
    # paper: https://arxiv.org/abs/1707.04199
    def __init__(self, alpha=0.1, beta=0.0):
        super().__init__()
        self.alpha = _f(alpha)  # scale of the exponential.
        self.beta = _f(beta)  # constant offset.
    def forward(self, X):
        return self.alpha * np.exp(X) + self.beta
    def backward(self, dY):
        # this gradient is intentionally incorrect.
        # (the layer passes gradients through as if it were the identity.)
        return dY
class CubicGB(Layer):
    # an output layer for one-hot classification problems.
    # use with MSE (SquaredHalved), not CategoricalCrossentropy!
    # paper: https://arxiv.org/abs/1707.04199
    # note: in the paper, it's called pow3GB, which is ugly.
    def __init__(self, alpha=0.1, beta=0.0):
        # note: the paper suggests defaults of 0.001 and 0.0,
        # but these didn't seem to work as well in my limited testing.
        super().__init__()
        self.alpha = _f(alpha)  # scale of the cubic.
        self.beta = _f(beta)  # constant offset.
    def forward(self, X):
        return self.alpha * X**3 + self.beta
    def backward(self, dY):
        # this gradient is intentionally incorrect.
        # (the layer passes gradients through as if it were the identity.)
        return dY
# Parametric Layers {{{1
class Conv1Dper(Layer):
    # periodic (circular) convolution.
    # currently only supports one channel I/O.
    # some notes:
    # we could use FFTs for larger convolutions.
    # i think storing the coefficients backwards would
    # eliminate reversal in the critical code.
    serialize = {
        'W': 'coeffs',
    }
    def __init__(self, kernel_size, pos=None,
                 init=init_glorot_uniform, reg_w=None):
        super().__init__()
        self.kernel_size = int(kernel_size)
        self.coeffs = self._new_weights('coeffs', init=init, regularizer=reg_w)
        # wrap0/wrap1: how many samples to wrap around from the end/start
        # of the signal, i.e. where the kernel sits relative to its output.
        if pos is None:
            # roughly centered; for even kernels, the extra tap comes
            # from the end of the signal.
            self.wrap0 = (self.kernel_size - 0) // 2
            self.wrap1 = (self.kernel_size - 1) // 2
        elif pos == 'alt':
            # roughly centered; for even kernels, the extra tap comes
            # from the start of the signal.
            self.wrap0 = (self.kernel_size - 1) // 2
            self.wrap1 = (self.kernel_size - 0) // 2
        elif pos == 'left':
            self.wrap0 = 0
            self.wrap1 = self.kernel_size - 1
        elif pos == 'right':
            self.wrap0 = self.kernel_size - 1
            self.wrap1 = 0
        else:
            raise Exception("pos parameter not understood: {}".format(pos))
    def make_shape(self, parent):
        shape = parent.output_shape
        self.input_shape = shape
        assert len(shape) == 1, shape
        # circular convolution preserves the signal length.
        self.output_shape = shape
        self.coeffs.shape = (1, self.kernel_size)
    def forward(self, X):
        # build the periodically-extended signal, then take all sliding
        # windows of kernel_size over it.
        if self.wrap0 == 0:
            Xper = np.hstack((X,X[:,:self.wrap1]))
        elif self.wrap1 == 0:
            Xper = np.hstack((X[:,-self.wrap0:],X))
        else:
            Xper = np.hstack((X[:,-self.wrap0:],X,X[:,:self.wrap1]))
        self.cols = rolling_batch(Xper, self.kernel_size)
        # reversed coefficients make this a convolution, not a correlation.
        convolved = (self.cols * self.coeffs.f[:,::-1]).sum(2)
        return convolved
    def backward(self, dY):
        # gradient w.r.t. coefficients: correlate dY with the saved windows.
        self.coeffs.g += (dY[:,:,None] * self.cols).sum(0)[:,::-1].sum(0, keepdims=True)
        # gradient w.r.t. the input.
        return (dY[:,:,None] * self.coeffs.f[:,::-1]).sum(2)
class LayerNorm(Layer):
    # normalization with optional learned scale (gamma) and shift (beta).
    # paper: https://arxiv.org/abs/1607.06450
    # note: nonparametric when affine == False
    # NOTE(review): statistics are computed over axis 0 (the batch axis);
    # the cited paper normalizes over the feature axis instead — confirm
    # this is intended before changing anything.
    def __init__(self, eps=1e-5, affine=True):
        super().__init__()
        self.eps = _f(eps)  # numerical stability floor for the variance.
        self.affine = bool(affine)
        if self.affine:
            self.gamma = self._new_weights('gamma', init=init_ones)
            self.beta = self._new_weights('beta', init=init_zeros)
            self.serialized = {
                'gamma': 'gamma',
                'beta': 'beta',
            }
    def make_shape(self, parent):
        shape = parent.output_shape
        self.input_shape = shape
        self.output_shape = shape
        assert len(shape) == 1, shape
        if self.affine:
            self.gamma.shape = (shape[0],)
            self.beta.shape = (shape[0],)
    def forward(self, X):
        self.mean = X.mean(0)
        self.center = X - self.mean
        # eps is folded into var so std is always safe to divide by.
        self.var = self.center.var(0) + self.eps
        self.std = np.sqrt(self.var)
        self.Xnorm = self.center / self.std
        if self.affine:
            return self.gamma.f * self.Xnorm + self.beta.f
        return self.Xnorm
    def backward(self, dY):
        length = dY.shape[0]
        if self.affine:
            dXnorm = dY * self.gamma.f
            self.gamma.g += (dY * self.Xnorm).sum(0)
            self.beta.g += dY.sum(0)
        else:
            dXnorm = dY
        # backprop through the normalization (chain rule through std
        # and mean, both of which depend on every element).
        dstd = (dXnorm * self.center).sum(0) / -self.var
        dcenter = dXnorm / self.std + dstd / self.std * self.center / length
        dmean = -dcenter.sum(0)
        dX = dcenter + dmean / length
        return dX
class Denses(Layer): # TODO: rename?
    # acts as a separate Dense for each row or column. only for 2D arrays.
    serialized = {
        'W': 'coeffs',
        'b': 'biases',
    }
    def __init__(self, dim, init=init_he_uniform, reg_w=None, reg_b=None, axis=-1):
        super().__init__()
        self.dim = int(dim)  # output length along `axis`.
        self.weight_init = init
        self.axis = int(axis)  # which axis gets transformed (-1 = last).
        self.coeffs = self._new_weights('coeffs', init=init, regularizer=reg_w)
        self.biases = self._new_weights('biases', init=init_zeros, regularizer=reg_b)
    def make_shape(self, parent):
        shape = parent.output_shape
        self.input_shape = shape
        assert len(shape) == 2, shape
        assert -len(shape) <= self.axis < len(shape)
        # normalize the axis to {0, 1}.
        self.axis = self.axis % len(shape)
        self.output_shape = list(shape)
        self.output_shape[self.axis] = self.dim
        self.output_shape = tuple(self.output_shape)
        in_rows = self.input_shape[0]
        in_cols = self.input_shape[1]
        out_rows = self.output_shape[0]
        out_cols = self.output_shape[1]
        # one (len-of-axis, dim) weight matrix per index of the other axis.
        self.coeffs.shape = (in_rows, in_cols, self.dim)
        self.biases.shape = (1, out_rows, out_cols)
    def forward(self, X):
        self.X = X
        # einsum indices: i = batch, x = the contracted (transformed) axis,
        # k = its output, j = the per-slice axis keeping its own matrix.
        if self.axis == 0:
            return np.einsum('ixj,xjk->ikj', X, self.coeffs.f) + self.biases.f
        elif self.axis == 1:
            return np.einsum('ijx,jxk->ijk', X, self.coeffs.f) + self.biases.f
    def backward(self, dY):
        self.biases.g += dY.sum(0, keepdims=True)
        # transposes of the forward einsums: one for the weight gradient,
        # one for the input gradient.
        if self.axis == 0:
            self.coeffs.g += np.einsum('ixj,ikj->xjk', self.X, dY)
            return np.einsum('ikj,xjk->ixj', dY, self.coeffs.f)
        elif self.axis == 1:
            self.coeffs.g += np.einsum('ijx,ijk->jxk', self.X, dY)
            return np.einsum('ijk,jxk->ijx', dY, self.coeffs.f)
class CosineDense(Dense):
    # Dense whose output is the cosine similarity between the input and
    # each weight column, with the bias folded in as a constant-1 input.
    # paper: https://arxiv.org/abs/1702.05870
    # another implementation: https://github.com/farizrahman4u/keras-contrib/pull/36
    # the paper doesn't mention bias,
    # so we treat bias as an additional weight with a constant input of 1.
    # this is correct in Dense layers, so i hope it's correct here too.
    eps = 1e-4  # keeps both norms strictly positive.
    def forward(self, X):
        self.X = X
        # the "+ 1" accounts for the constant-1 bias input.
        self.X_norm = np.sqrt(np.square(X).sum(-1, keepdims=True) \
          + 1 + self.eps)
        self.W_norm = np.sqrt(np.square(self.coeffs.f).sum(0, keepdims=True) \
          + np.square(self.biases.f) + self.eps)
        self.dot = X @ self.coeffs.f + self.biases.f
        Y = self.dot / (self.X_norm * self.W_norm)
        return Y
    def backward(self, dY):
        # quotient rule applied to dot / (|X| * |W|): one term for the
        # numerator, one each for the two norms in the denominator.
        ddot = dY / self.X_norm / self.W_norm
        dX_norm = -(dY * self.dot / self.W_norm).sum(-1, keepdims=True) / self.X_norm**2
        dW_norm = -(dY * self.dot / self.X_norm).sum( 0, keepdims=True) / self.W_norm**2
        self.coeffs.g += self.X.T @ ddot \
          + dW_norm / self.W_norm * self.coeffs.f
        self.biases.g += ddot.sum(0, keepdims=True) \
          + dW_norm / self.W_norm * self.biases.f
        dX = ddot @ self.coeffs.f.T + dX_norm / self.X_norm * self.X
        return dX
# Rituals {{{1
def stochastic_multiply(W, gamma=0.5, allow_negation=False):
    """Scale every entry of the 1-d array W in-place by a random factor
    drawn from [gamma, 1/gamma).

    paper: https://arxiv.org/abs/1606.01981
    """
    assert W.ndim == 1, W.ndim
    assert 0 < gamma < 1, gamma
    count = len(W)
    # largest magnitude; used to bias sign flips toward positive weights.
    alpha = np.max(np.abs(W))
    # NOTE: numpy gives [low, high) but the paper advocates [low, high]
    factors = np.random.uniform(gamma, 1/gamma, size=count)
    if allow_negation:
        # NOTE: i have yet to see this do anything but cause divergence.
        # i've referenced the paper several times yet still don't understand
        # what i'm doing wrong, so i'm disabling it by default in my code.
        # maybe i just need *a lot* more weights to compensate.
        flip_prob = (W / alpha + 1) / 2
        draws = np.random.random_sample(size=count)
        factors *= np.where(draws < flip_prob, 1, -1)
    np.multiply(W, factors, out=W)
class StochMRitual(Ritual):
    # stochastically multiplies Dense weights while learning, restoring
    # the real weights afterwards.
    # paper: https://arxiv.org/abs/1606.01981
    # this probably doesn't make sense for regression problems,
    # let alone small models, but here it is anyway!
    def __init__(self, learner=None, gamma=0.5):
        super().__init__(learner)
        self.gamma = _f(gamma)
    def prepare(self, model):
        # scratch copy of the full weight vector, used to stash/restore.
        self.W = np.copy(model.W)
        super().prepare(model)
    def learn(self, inputs, outputs):
        # an experiment:
        #assert self.learner.rate < 10, self.learner.rate
        #self.gamma = 1 - 1/2**(1 - np.log10(self.learner.rate))
        # stash the real weights, jitter every Dense layer, learn on the
        # jittered weights, then restore the originals.
        self.W[:] = self.model.W
        for layer in self.model.ordered_nodes:
            if isinstance(layer, Dense):
                # NOTE(review): elsewhere in this file coeffs is accessed
                # via .f/.g — confirm this shouldn't be coeffs.f.ravel().
                stochastic_multiply(layer.coeffs.ravel(), gamma=self.gamma)
        residual = super().learn(inputs, outputs)
        self.model.W[:] = self.W
        return residual
    def update(self):
        super().update()
        f = 0.5
        # clamp Dense weights to a band proportional to their init std.
        # NOTE(review): layer.W / layer.std aren't referenced elsewhere in
        # this file — confirm Dense actually exposes them.
        for layer in self.model.ordered_nodes:
            if isinstance(layer, Dense):
                np.clip(layer.W, -layer.std * f, layer.std * f, out=layer.W)
        #       np.clip(layer.W, -1, 1, out=layer.W)
class NoisyRitual(Ritual):
    # training ritual that injects gaussian noise into the inputs,
    # outputs, and/or gradients.
    def __init__(self, learner=None,
                 input_noise=0, output_noise=0, gradient_noise=0):
        self.input_noise = _f(input_noise)
        self.output_noise = _f(output_noise)
        self.gradient_noise = _f(gradient_noise)
        super().__init__(learner)
    def learn(self, inputs, outputs):
        # this is pretty crude
        # BUGFIX: cast the noise down to _f before adding; np.random.normal
        # always returns float64, which silently upcast inputs/outputs
        # (see the float64 TODO at the top of the file).
        if self.input_noise > 0:
            s = self.input_noise
            inputs = inputs + np.random.normal(0, s, size=inputs.shape).astype(_f)
        if self.output_noise > 0:
            s = self.output_noise
            outputs = outputs + np.random.normal(0, s, size=outputs.shape).astype(_f)
        return super().learn(inputs, outputs)
    def update(self):
        # gradient noise paper: https://arxiv.org/abs/1511.06807
        if self.gradient_noise > 0:
            size = len(self.model.dW)
            gamma = 0.55  # only used by the commented-out annealing formula.
            #s = self.gradient_noise / (1 + self.bn) ** gamma
            # experiments:
            s = self.gradient_noise * np.sqrt(self.learner.rate)
            #s = np.square(self.learner.rate)
            #s = self.learner.rate / self.en
            # note: += is in-place, so dW keeps its dtype here.
            self.model.dW += np.random.normal(0, max(s, 1e-8), size=size)
        super().update()
# Learners {{{1
class PolyLearner(Learner):
    # learning-rate schedule given by an arbitrary polynomial evaluated
    # at normalized training progress, re-evaluated every batch.
    per_batch = True
    def __init__(self, optim, epochs=400, coeffs=(1,)):
        self.coeffs = tuple(coeffs)
        # the starting rate is the polynomial at zero progress.
        super().__init__(optim, epochs, np.polyval(self.coeffs, 0))
    def rate_at(self, epoch):
        # progress runs from 0 (first epoch) toward 1 (final epoch).
        t = (epoch - 1) / (self.epochs)
        # magnitude only; the schedule never goes negative.
        return np.abs(np.polyval(self.coeffs, t))
# Components {{{1
def _mr_make_norm(norm, *args, **kwargs):
    # factory: returns a resnet-style block builder that inserts `norm`
    # layers (constructed with *args/**kwargs) before each activation.
    def _mr_norm(y, width, depth, block, multi, activation, style, FC, d):
        skip = y
        merger = Sum()
        skip.feed(merger)
        # shared stem: normalize and activate the skip input once.
        z_start = skip
        z_start = z_start.feed(norm(*args, **kwargs))
        z_start = z_start.feed(activation())
        # `multi` parallel branches, each `block` FC layers deep,
        # all summed together with the skip connection.
        for _ in range(multi):
            z = z_start
            for j in range(block):
                if j > 0:
                    z = z.feed(norm(*args, **kwargs))
                    z = z.feed(activation())
                z = z.feed(FC())
            z.feed(merger)
        y = merger
        return y
    return _mr_norm
def _mr_batchless(y, width, depth, block, multi, activation, style, FC, d):
    # residual block with no normalization layers: the skip input feeds a
    # Sum directly, and `multi` parallel branches of `block` FC layers
    # (with activations between them) are summed with it.
    merger = Sum()
    y.feed(merger)
    trunk = y.feed(activation())
    for _ in range(multi):
        branch = trunk
        for j in range(block):
            if j > 0:
                branch = branch.feed(activation())
            branch = branch.feed(FC())
        branch.feed(merger)
    return merger
def _mr_onelesssum(y, width, depth, block, multi, activation, style, FC, d):
    # this is my own awful contraption.
    # like batchless, but drops the final residual Sum at the last depth
    # level when there's only a single branch to merge.
    is_last = d + 1 == depth
    needs_sum = not is_last or multi > 1
    skip = y
    if needs_sum:
        merger = Sum()
    if not is_last:
        # only non-final levels keep their skip connection.
        skip.feed(merger)
    z_start = skip.feed(activation())
    for _ in range(multi):
        z = z_start
        for j in range(block):
            if j > 0:
                z = z.feed(activation())
            z = z.feed(FC())
        if needs_sum:
            z.feed(merger)
    if needs_sum:
        y = merger
    else:
        y = z
    return y
# registry of resnet block builders, keyed by the `style` argument
# of multiresnet below.
_mr_styles = dict(
    lnorm=_mr_make_norm(LayerNorm),
    batchless=_mr_batchless,
    onelesssum=_mr_onelesssum,
)
def multiresnet(x, width, depth, block=2, multi=1,
                activation=Relu, style='batchless',
                init=init_he_normal):
    """Build a multi-residual network body on top of node x.

    'cossim' is ordinary 'batchless' built from CosineDense layers;
    everything else is looked up in _mr_styles.
    """
    if style == 'cossim':
        style = 'batchless'
        DenseClass = CosineDense
    else:
        DenseClass = Dense
    if style not in _mr_styles:
        raise Exception('unknown resnet style', style)
    y = x
    last_size = x.output_shape[0]
    # every level is `width` wide, so the FC factory is loop-invariant.
    size = width
    FC = lambda: DenseClass(size, init)
    for d in range(depth):
        # match dimensions before entering the residual block.
        if last_size != size:
            y = y.feed(FC())
        y = _mr_styles[style](y, width, depth, block, multi, activation, style, FC, d)
        last_size = size
    return y
# Toy Data {{{1
# lookup tables mapping config names to initializer / activation constructors.
inits = dict(he_normal=init_he_normal, he_uniform=init_he_uniform,
             glorot_normal=init_glorot_normal, glorot_uniform=init_glorot_uniform,
             gaussian_unit=init_gaussian_unit)
activations = dict(sigmoid=Sigmoid, tanh=Tanh, lecun=LeCunTanh,
                   relu=Relu, elu=Elu, gelu=GeluApprox, selu=Selu,
                   softplus=Softplus)
def prettyize(data):
    """Format a scalar or 1-d ndarray in aligned scientific notation."""
    fmt = '{:8.2e}'.format
    if isinstance(data, np.ndarray):
        return '[' + ', '.join(fmt(n) for n in data) + ']'
    return fmt(data)
def normalize_data(data, mean=None, std=None):
    """Standardize `data` in-place with the given statistics.

    When mean/std are omitted, compute them from the data, print a
    ready-to-paste call via lament(), and abort the program — the intent
    is to hard-code the resulting constants at the call site.
    """
    if mean is None or std is None:
        mean = np.mean(data, axis=0)
        std = np.std(data, axis=0)
        lament('nod(...,\n {},\n {})'.format(prettyize(mean), prettyize(std)))
        sys.exit(1)
    data -= _f(mean)
    data /= _f(std)
def toy_data(train_samples, valid_samples, problem=2):
    # generate (or load) one of a few small regression datasets,
    # standardized with hard-coded statistics, and return
    # (train_inputs, train_outputs), (valid_inputs, valid_outputs).
    total_samples = train_samples + valid_samples
    nod = normalize_data # shorthand to keep a sane indentation
    if problem == 0:
        # project-local dataset; it ships its own train/valid split.
        from ml.cie_mlp_data import inputs, outputs, valid_inputs, valid_outputs
        inputs, outputs = _f(inputs), _f(outputs)
        valid_inputs, valid_outputs = _f(valid_inputs), _f(valid_outputs)
        nod(inputs, 127.5, 73.9)
        nod(outputs, 44.8, 21.7)
        nod(valid_inputs, 127.5, 73.9)
        nod(valid_outputs, 44.8, 21.7)
    elif problem == 1:
        from sklearn.datasets import make_friedman1
        inputs, outputs = make_friedman1(total_samples)
        inputs, outputs = _f(inputs), _f(outputs)
        outputs = np.expand_dims(outputs, -1)
        # friedman1 inputs are uniform on [0, 1): mean 0.5, std 1/sqrt(12).
        nod(inputs, 0.5, 1/np.sqrt(12))
        nod(outputs, 14.4, 4.9)
    elif problem == 2:
        from sklearn.datasets import make_friedman2
        inputs, outputs = make_friedman2(total_samples)
        inputs, outputs = _f(inputs), _f(outputs)
        outputs = np.expand_dims(outputs, -1)
        nod(inputs,
            [5.00e+01, 9.45e+02, 5.01e-01, 5.98e+00],
            [2.89e+01, 4.72e+02, 2.89e-01, 2.87e+00])
        nod(outputs, [482], [380])
    elif problem == 3:
        from sklearn.datasets import make_friedman3
        inputs, outputs = make_friedman3(total_samples)
        inputs, outputs = _f(inputs), _f(outputs)
        outputs = np.expand_dims(outputs, -1)
        nod(inputs,
            [4.98e+01, 9.45e+02, 4.99e-01, 6.02e+00],
            [2.88e+01, 4.73e+02, 2.90e-01, 2.87e+00])
        nod(outputs, [1.32327931], [0.31776295])
    else:
        raise Exception("unknown toy data set", problem)
    if problem != 0:
        # split off a validation set
        indices = np.arange(inputs.shape[0])
        np.random.shuffle(indices)
        valid_inputs = inputs[indices][-valid_samples:]
        valid_outputs = outputs[indices][-valid_samples:]
        inputs = inputs[indices][:-valid_samples]
        outputs = outputs[indices][:-valid_samples]
    return (inputs, outputs), (valid_inputs, valid_outputs)
# Model Creation {{{1
def optim_from_config(config):
    # construct the Optimizer named by config.optim.
    # the optim_decay* options are time constants given in batches;
    # they're converted into EMA decay factors via exp(-1/d).
    if config.optim == 'adam':
        d1 = config.optim_decay1 if 'optim_decay1' in config else 9.5
        d2 = config.optim_decay2 if 'optim_decay2' in config else 999.5
        b1 = np.exp(-1/d1)
        b2 = np.exp(-1/d2)
        o = Nadam if config.nesterov else Adam
        optim = o(b1=b1, b2=b2)
    elif config.optim == 'ftml':
        d1 = config.optim_decay1 if 'optim_decay1' in config else 2
        d2 = config.optim_decay2 if 'optim_decay2' in config else 999.5
        b1 = np.exp(-1/d1)
        b2 = np.exp(-1/d2)
        optim = FTML(b1=b1, b2=b2)
    elif config.optim == 'yf':
        d1 = config.optim_decay1 if 'optim_decay1' in config else 999.5
        d2 = config.optim_decay2 if 'optim_decay2' in config else 999.5
        if d1 != d2:
            raise Exception("yellowfin only uses one decay term.")
        beta = np.exp(-1/d1)
        optim = YellowFin(beta=beta)
    elif config.optim in ('ag', 'adagrad'):
        optim = Adagrad()
    elif config.optim in ('rms', 'rmsprop'):
        d2 = config.optim_decay2 if 'optim_decay2' in config else 99.5
        mu = np.exp(-1/d2)
        optim = RMSprop(mu=mu)
    elif config.optim == 'rmsc':
        d2 = config.optim_decay2 if 'optim_decay2' in config else 9.5
        mu = np.exp(-1/d2)
        optim = RMSpropCentered(momentum=mu)
    elif config.optim == 'sgd':
        d1 = config.optim_decay1 if 'optim_decay1' in config else 0
        clip = config.gradient_clip if 'gradient_clip' in config else 0.0
        # plain SGD when no momentum or clipping is requested; otherwise
        # pick the momentum variant that supports clipping if needed.
        if d1 > 0 or clip > 0:
            b1 = np.exp(-1/d1) if d1 > 0 else 0
            if clip > 0:
                optim = MomentumClip(mu=b1, nesterov=config.nesterov, clip=clip)
            else:
                optim = Momentum(mu=b1, nesterov=config.nesterov)
        else:
            optim = Optimizer()
    elif config.optim == 'ps':
        d1 = config.optim_decay1 if 'optim_decay1' in config else 9.5
        b1 = np.exp(-1/d1)
        optim = PowerSign(mu=b1)
    elif config.optim == 'as':
        d1 = config.optim_decay1 if 'optim_decay1' in config else 9.5
        b1 = np.exp(-1/d1)
        optim = AddSign(mu=b1)
    else:
        raise Exception('unknown optimizer', config.optim)
    return optim
def learner_from_config(config, optim, rscb):
    # construct the learning-rate schedule (Learner) named by
    # config.learner around `optim`. rscb is passed through as the
    # restart callback for schedules that support restarts.
    if config.learner == 'sgdr':
        expando = config.expando if 'expando' in config else None
        learner = SGDR(optim, epochs=config.epochs, rate=config.learn,
                       restart_decay=config.restart_decay, restarts=config.restarts,
                       callback=rscb, expando=expando)
        # final learning rate isn't of interest here; it's gonna be close to 0.
        log('total epochs', learner.epochs)
    elif config.learner in ('sin', 'sine'):
        lower_rate = config.learn * 1e-5 # TODO: allow access to this.
        # restarts extend the total epoch count; one cycle per restart.
        epochs = config.epochs * (config.restarts + 1)
        frequency = config.epochs
        learner = SineCLR(optim, epochs=epochs, frequency=frequency,
                          upper_rate=config.learn, lower_rate=lower_rate,
                          callback=rscb)
    elif config.learner == 'wave':
        lower_rate = config.learn * 1e-5 # TODO: allow access to this.
        epochs = config.epochs * (config.restarts + 1)
        frequency = config.epochs
        learner = WaveCLR(optim, epochs=epochs, frequency=frequency,
                          upper_rate=config.learn, lower_rate=lower_rate,
                          callback=rscb)
    elif config.learner == 'anneal':
        learner = AnnealingLearner(optim, epochs=config.epochs, rate=config.learn,
                                   halve_every=config.learn_halve_every)
        log("final learning rate", "{:10.8f}".format(learner.final_rate))
    elif config.learner == 'sgd':
        # constant learning rate.
        learner = Learner(optim, epochs=config.epochs, rate=config.learn)
    else:
        raise Exception('unknown learner', config.learner)
    return learner
def lookup_loss(maybe_name):
    """Map a short config name to a Loss instance; Loss objects pass through."""
    if isinstance(maybe_name, Loss):
        return maybe_name
    constructors = {
        'mse': Squared,
        'mshe': SquaredHalved,  # mushy
        'mae': Absolute,
        'msee': SomethingElse,
        'huber': lambda: Huber(delta=0.1),
    }
    if isinstance(maybe_name, str) and maybe_name in constructors:
        return constructors[maybe_name]()
    raise Exception('unknown objective', maybe_name)
def ritual_from_config(config, learner):
    # Build the training "ritual" selected by config.ritual,
    # driven by the given learner. Guard-clause returns keep this flat.
    name = config.ritual
    if name == 'default':
        return Ritual(learner=learner)
    if name == 'stochm':
        return StochMRitual(learner=learner)
    if name == 'noisy':
        return NoisyRitual(learner=learner,
                           input_noise=1e-1, output_noise=1e-2,
                           gradient_noise=2e-7)
    raise Exception('unknown ritual', config.ritual)
def model_from_config(config, input_features, output_features, callbacks=None):
    # Assemble the full training stack described by config:
    # the model itself, plus its optimizer, learner, and ritual.
    weight_init = inits[config.init]
    act = activations[config.activation]

    x = Input(shape=(input_features,))
    y = multiresnet(x,
                    config.res_width, config.res_depth,
                    config.res_block, config.res_multi,
                    activation=act, init=weight_init,
                    style=config.parallel_style)
    # project onto the requested output width if the resnet didn't end there.
    if y.output_shape[0] != output_features:
        y = y.feed(Dense(output_features, weight_init))

    loss = lookup_loss(config.loss)
    mloss = lookup_loss(config.mloss) if config.mloss else loss
    model = Model(x, y, loss=loss, mloss=mloss, unsafe=config.unsafe)

    if config.fn_load is not None:
        log('loading weights', config.fn_load)
        model.load_weights(config.fn_load)

    optim = optim_from_config(config)

    def rscb(restart):
        # invoked by the learner at every warm restart.
        if callbacks:
            callbacks.restart()
        log("restarting", restart)
        if config.restart_optim:
            optim.reset()

    learner = learner_from_config(config, optim, rscb)
    ritual = ritual_from_config(config, learner)

    return model, learner, ritual
# main program {{{1
def run(program, args=None):
    """Train the toy multi-resnet model end-to-end.

    Builds a model from a hard-coded config, trains it on toy data,
    logs losses, and saves weights and loss history to disk.

    program: executable name (argv[0]); currently unused.
    args: command-line arguments; currently unused.
    Returns 0 on success (suitable for sys.exit).
    """
    args = args if args else []

    lower_priority()
    np.random.seed(42069)

    # Config {{{2

    from dotmap import DotMap
    config = DotMap(
        fn_load = None,
        fn_save = 'optim_nn.h5',
        log_fn = 'losses.npz',

        # multi-residual network parameters
        res_width = 28,
        res_depth = 2,
        res_block = 3, # normally 2 for plain resnet
        res_multi = 2, # normally 1 for plain resnet

        # style of resnet (order of layers, which layers, etc.)
        parallel_style = 'onelesssum',
        activation = 'gelu',

        #optim = 'ftml',
        #optim_decay1 = 2,
        #optim_decay2 = 100,
        optim = 'adam', # note: most features only implemented for Adam
        optim_decay1 = 24,  # first momentum given in batches (optional)
        optim_decay2 = 100, # second momentum given in batches (optional)
        nesterov = True, # not available for all optimizers.
        batch_size = 64,

        # learning parameters
        learner = 'sgdr',
        learn = 0.00125,
        epochs = 24,
        learn_halve_every = 16, # only used with anneal
        restarts = 4,
        restart_decay = 0.25, # only used with SGDR
        expando = lambda i: 24 * i,

        # misc
        init = 'he_normal',
        loss = 'mse',
        mloss = 'mse',
        ritual = 'default',
        restart_optim = False, # restarts also reset internal state of optimizer
        warmup = False, # train a couple epochs on gaussian noise and reset

        # logging/output
        log10_loss = True, # personally, i'm sick of looking linear loss values!
        #fancy_logs = True, # unimplemented (can't turn it off yet)

        problem = 2,
        compare = (
            # best results for ~10,000 parameters
            # training/validation pairs for each problem (starting from problem 0):
            (10**-3.120, 10**-2.901),
            # 1080 epochs on these...
            (10**-6.747, 10**-6.555),
            (10**-7.774, 10**-7.626),
            (10**-6.278, 10**-5.234), # overfitting? bad valid set?
        ),

        unsafe = True, # aka gotta go fast mode
    )

    # normalize every name-like option so lookups below are case-insensitive.
    for k in ['parallel_style', 'activation', 'optim', 'learner',
              'init', 'loss', 'mloss', 'ritual']:
        config[k] = config[k].lower()

    # scale the learning rate with sqrt(batch size) (linear-scaling variant).
    config.learn *= np.sqrt(config.batch_size)

    config.pprint()

    # Toy Data {{{2

    (inputs, outputs), (valid_inputs, valid_outputs) = \
        toy_data(2**14, 2**11, problem=config.problem)
    input_features  =  inputs.shape[-1]
    output_features = outputs.shape[-1]

    # Our Test Model

    callbacks = Dummy()

    model, learner, ritual = \
        model_from_config(config, input_features, output_features, callbacks)

    # Model Information {{{2

    model.print_graph()
    log('parameters', model.param_count)

    # Training {{{2

    batch_losses = []
    train_losses = []
    valid_losses = []

    def measure_error():
        # log train and valid losses (against the per-problem baselines in
        # config.compare) and append them to the history lists above.
        def print_error(name, inputs, outputs, comparison=None):
            predicted = model.evaluate(inputs)
            err = model.mloss.forward(predicted, outputs)

            if config.log10_loss:
                # also dump the raw (linear) loss for reference.
                print(name, "{:12.6e}".format(err))
                if comparison:
                    err10 = np.log10(err)
                    cmp10 = np.log10(comparison)

                    # red when worse than the baseline, green when better.
                    color = '\x1B[31m' if err10 > cmp10 else '\x1B[32m'

                    log(name + " log10-loss", "{:+6.3f} {}({:+6.3f})\x1B[0m".format(err10, color, err10 - cmp10))
                else:
                    # BUGFIX: this used to be format(err, np.log10(err)), which
                    # printed the *raw* loss under a "log10-loss" label —
                    # str.format silently ignores extra positional arguments.
                    log(name + " log10-loss", "{:+6.3f}".format(np.log10(err)))
            else:
                log(name + " loss", "{:12.6e}".format(err))
                if comparison:
                    fmt = "10**({:+7.4f}) times"
                    log("improvement", fmt.format(np.log10(comparison / err)))

            return err

        train_err = print_error("train",
                                inputs, outputs,
                                config.compare[config.problem][0])
        valid_err = print_error("valid",
                                valid_inputs, valid_outputs,
                                config.compare[config.problem][1])
        train_losses.append(train_err)
        valid_losses.append(valid_err)

    callbacks.restart = measure_error

    training = config.epochs > 0 and config.restarts >= 0

    ritual.prepare(model)

    if training and config.warmup and not config.fn_load:
        log("warming", "up")

        # use plain SGD in warmup to prevent (or possibly cause?) numeric issues
        temp_optim = learner.optim
        temp_loss = model.loss
        learner.optim = MomentumClip(lr=0.01, mu=0)
        # NOTE(review): this swap/restore is asymmetric — model.loss is saved,
        # ritual.loss is overwritten, and model.loss is restored below. if the
        # ritual reads its own .loss, it stays Absolute() after warmup;
        # confirm whether that is intentional.
        ritual.loss = Absolute() # less likely to blow up; more general

        # NOTE: experiment: trying const batches and batch_size
        bs = 256
        target = 1 * 1024 * 1024
        # 4 being sizeof(float); pick the largest multiple of bs that fits
        # the target byte budget.
        batches = (target / 4 / np.prod(inputs.shape[1:])) // bs * bs
        ins  = [int(batches)] + list( inputs.shape[1:])
        outs = [int(batches)] + list(outputs.shape[1:])

        # train on pure gaussian noise a few times, then reset the ritual.
        for _ in range(4):
            ritual.train_batched(
                np.random.normal(size=ins),
                np.random.normal(size=outs),
                batch_size=bs)

        ritual.reset()
        learner.optim = temp_optim
        model.loss = temp_loss

    if training:
        measure_error()

    while training and learner.next():
        avg_loss, losses = ritual.train_batched(
            inputs, outputs,
            config.batch_size,
            return_losses=True)
        batch_losses += losses

        if config.log10_loss:
            fmt = "epoch {:4.0f}, rate {:10.8f}, log10-loss {:+6.3f}"
            log("info", fmt.format(learner.epoch, learner.rate, np.log10(avg_loss)),
                update=True)
        else:
            fmt = "epoch {:4.0f}, rate {:10.8f}, loss {:12.6e}"
            log("info", fmt.format(learner.epoch, learner.rate, avg_loss),
                update=True)

    measure_error()

    if training and config.fn_save is not None:
        log('saving weights', config.fn_save)
        model.save_weights(config.fn_save, overwrite=True)

    if training and config.log_fn is not None:
        log('saving losses', config.log_fn)
        np.savez_compressed(config.log_fn,
                            batch_losses=np.array(batch_losses, dtype=_f),
                            train_losses=np.array(train_losses, dtype=_f),
                            valid_losses=np.array(valid_losses, dtype=_f))

    # Evaluation {{{2
    # TODO: write this portion again

    return 0
# run main program {{{1
if __name__ == '__main__':
    # propagate run()'s return code (0 on success) as the process exit status.
    sys.exit(run(sys.argv[0], sys.argv[1:]))