From bbdb91fcb1d48e74c2989e200be7e154fec02399 Mon Sep 17 00:00:00 2001 From: Connor Olding Date: Sun, 21 Jan 2018 22:04:25 +0000 Subject: [PATCH] merge and split modules into a package --- .gitignore | 7 + README.md | 2 +- onn_mnist.py => mnist_example.py | 2 +- onn.py | 1429 ------------------------------ onn/__init__.py | 28 + onn/activation.py | 185 ++++ onn/floats.py | 18 + onn/initialization.py | 31 + onn/layer.py | 200 +++++ onn/layer_base.py | 143 +++ onn/learner.py | 180 ++++ onn/loss.py | 129 +++ onn/math.py | 14 + onn/model.py | 197 ++++ onn/nodes.py | 43 + onn/optimizer.py | 531 +++++++++++ onn/optimizer_base.py | 19 + onn/parametric.py | 254 ++++++ onn/regularizer.py | 45 + onn/ritual.py | 90 ++ onn/ritual_base.py | 132 +++ onn/util.py | 37 + onn/weights.py | 58 ++ onn_core.py | 1398 ----------------------------- 24 files changed, 2343 insertions(+), 2829 deletions(-) create mode 100644 .gitignore rename onn_mnist.py => mnist_example.py (99%) mode change 100755 => 100644 delete mode 100755 onn.py create mode 100644 onn/__init__.py create mode 100644 onn/activation.py create mode 100644 onn/floats.py create mode 100644 onn/initialization.py create mode 100644 onn/layer.py create mode 100644 onn/layer_base.py create mode 100644 onn/learner.py create mode 100644 onn/loss.py create mode 100644 onn/math.py create mode 100644 onn/model.py create mode 100644 onn/nodes.py create mode 100644 onn/optimizer.py create mode 100644 onn/optimizer_base.py create mode 100644 onn/parametric.py create mode 100644 onn/regularizer.py create mode 100644 onn/ritual.py create mode 100644 onn/ritual_base.py create mode 100644 onn/util.py create mode 100644 onn/weights.py delete mode 100644 onn_core.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..667fa90 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +__pycache__ +*.pyc + +# work in progress: +onn/component.py +onn/extra.py +onn/run.py diff --git a/README.md b/README.md index 08bf202..c12b612 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ numpy scipy h5py sklearn dotmap ```python #!/usr/bin/env python3 -from onn_core import * +from onn import * bs = 500 lr = 0.01 reg = L1L2(3.2e-5, 3.2e-4) diff --git a/onn_mnist.py b/mnist_example.py old mode 100755 new mode 100644 similarity index 99% rename from onn_mnist.py rename to mnist_example.py index c321bfe..d1a98ff --- a/onn_mnist.py +++ b/mnist_example.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 from onn import * -from onn_core import _f +from onn.floats import * from dotmap import DotMap lower_priority() diff --git a/onn.py b/onn.py deleted file mode 100755 index 126e2f3..0000000 --- a/onn.py +++ /dev/null @@ -1,1429 +0,0 @@ -#!/usr/bin/env python3 - -# external packages required for full functionality: -# numpy scipy h5py sklearn dotmap - -# BIG TODO: ensure numpy isn't upcasting to float64 *anywhere*. -# this is gonna take some work. 
- -from onn_core import * -from onn_core import _check, _f, _0, _1 - -import sys - -_log_was_update = False -def log(left, right, update=False): - s = "\x1B[1m {:>20}:\x1B[0m {}".format(left, right) - global _log_was_update - if update and _log_was_update: - lament('\x1B[F' + s) - else: - lament(s) - _log_was_update = update - -class Dummy: - pass - -# Math Utilities {{{1 - -def rolling(a, window): - # http://stackoverflow.com/a/4924433 - shape = (a.size - window + 1, window) - strides = (a.itemsize, a.itemsize) - return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) - -def rolling_batch(a, window): - # same as rolling, but acts on each batch (axis 0). - shape = (a.shape[0], a.shape[-1] - window + 1, window) - strides = (np.prod(a.shape[1:]) * a.itemsize, a.itemsize, a.itemsize) - return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) - -# Initializations {{{1 - -def init_gaussian_unit(size, ins, outs): - s = np.sqrt(1 / ins) - return np.random.normal(0, s, size=size) - -# Loss functions {{{1 - -class SomethingElse(ResidualLoss): - # generalizes Absolute and SquaredHalved. - # plot: https://www.desmos.com/calculator/fagjg9vuz7 - def __init__(self, a=4/3): - assert 1 <= a <= 2, "parameter out of range" - self.a = _f(a / 2) - self.b = _f(2 / a) - self.c = _f(2 / a - 1) - - def f(self, r): - return self.a * np.abs(r)**self.b - - def df(self, r): - return np.sign(r) * np.abs(r)**self.c - -class Confidence(Loss): - # this isn't "confidence" in any meaningful way; (e.g. Bayesian) - # it's just a metric of how large the value is of the predicted class. - # when using it for loss, it acts like a crappy regularizer. - # it really just measures how much of a hot-shot the network thinks it is. - - def forward(self, p, y=None): - categories = p.shape[-1] - confidence = (np.max(p, axis=-1) - 1/categories) / (1 - 1/categories) - # the exponent in softmax puts a maximum on confidence, - # but we don't compensate for that. if necessary, - # it'd be better to use an activation that doesn't have this limit. - return np.mean(confidence) - - def backward(self, p, y=None): - # in order to agree with the forward pass, - # using this backwards pass as-is will minimize confidence. - categories = p.shape[-1] - detc = p / categories / (1 - 1/categories) - dmax = p == np.max(p, axis=-1, keepdims=True) - return detc * dmax - -# Regularizers {{{1 - -class SaturateRelu(Regularizer): - # paper: https://arxiv.org/abs/1703.09202 - # TODO: test this (and ActivityRegularizer) more thoroughly. - # i've looked at the histogram of the resulting weights. - # it seems like only the layers after this are affected - # the way they should be. 
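-    #       in effect this is a one-sided L1 penalty, lamb * max(X, 0),
-    #       meant for activations (e.g. wrapped in an ActivityRegularizer).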
- - def __init__(self, lamb=0.0): - self.lamb = _f(lamb) - - def forward(self, X): - return self.lamb * np.where(X >= 0, X, 0) - - def backward(self, X): - return self.lamb * np.where(X >= 0, 1, 0) - -# Optimizers {{{1 - -class FTML(Optimizer): - # paper: http://www.cse.ust.hk/~szhengac/papers/icml17.pdf - # author's implementation: https://github.com/szhengac/optim/commit/923555e - - def __init__(self, lr=0.0025, b1=0.6, b2=0.999, eps=1e-8): - self.iterations = _0 - self.b1 = _f(b1) # decay term - self.b2 = _f(b2) # decay term - self.eps = _f(eps) - - super().__init__(lr) - - def reset(self): - self.dt1 = None - self.dt = None - self.vt = None - self.zt = None - self.b1_t = _1 - self.b2_t = _1 - - def compute(self, dW, W): - if self.dt1 is None: self.dt1 = np.zeros_like(dW) - if self.dt is None: self.dt = np.zeros_like(dW) - if self.vt is None: self.vt = np.zeros_like(dW) - if self.zt is None: self.zt = np.zeros_like(dW) - - # NOTE: we could probably rewrite these equations to avoid this copy. - self.dt1[:] = self.dt[:] - - self.b1_t *= self.b1 - self.b2_t *= self.b2 - - # hardly an elegant solution. - lr = max(self.lr, self.eps) - - # same as Adam's vt. - self.vt[:] = self.b2 * self.vt + (1 - self.b2) * dW * dW - - # you can factor "inner" out of Adam as well. - inner = np.sqrt(self.vt / (1 - self.b2_t)) + self.eps - self.dt[:] = (1 - self.b1_t) / lr * inner - - sigma_t = self.dt - self.b1 * self.dt1 - - # Adam's mt minus the sigma term. - self.zt[:] = self.b1 * self.zt + (1 - self.b1) * dW - sigma_t * W - - # subtract by weights to avoid having to override self.update. - return -self.zt / self.dt - W - -class MomentumClip(Optimizer): - def __init__(self, lr=0.01, mu=0.9, nesterov=False, clip=1.0, debug=False): - self.mu = _f(mu) - self.clip = _f(clip) - self.nesterov = bool(nesterov) - self.debug = bool(debug) - - super().__init__(lr) - - def reset(self): - self.accum = None - - def compute(self, dW, W): - if self.accum is None: - self.accum = np.zeros_like(dW) - - total_norm = np.linalg.norm(dW) - clip_scale = self.clip / (total_norm + 1e-6) - if clip_scale < 1: - if self.debug: - lament("clipping gradients; norm: {:10.5f}".format(total_norm)) - dW *= clip_scale - - self.accum[:] = self.accum * self.mu + dW - if self.nesterov: - return -self.lr * (self.accum * self.mu + dW) - else: - return -self.lr * self.accum - -class YellowFin(Optimizer): - # paper: https://arxiv.org/abs/1706.03471 - # knowyourmeme: http://cs.stanford.edu/~zjian/project/YellowFin/ - # author's implementation: https://github.com/JianGoForIt/YellowFin/blob/master/tuner_utils/yellowfin.py - # code lifted: https://gist.github.com/botev/f8b32c00eafee222e47393f7f0747666 - - def __init__(self, lr=0.1, mu=0.0, beta=0.999, window_size=20, - debias=True, clip=1.0): - self.lr_default = _f(lr) - self.mu_default = _f(mu) - self.beta = _f(beta) - self.window_size = int(window_size) # curv_win_width - self.debias_enabled = bool(debias) - self.clip = _f(clip) - - self.mu = _f(mu) # momentum - super().__init__(lr) - - def reset(self): - self.accum = None - - self.lr = self.lr_default - self.mu = self.mu_default - - self.step = 0 - self.beta_t = self.beta - - self.curv_win = np.zeros([self.window_size,], dtype=np.float32) - - self.h_min = None - self.h_max = None - - self.g_lpf = 0 - #self.g_squared_lpf = 0 - self.g_norm_squared_lpf = 0 - self.g_norm_lpf = 0 - self.h_min_lpf = 0 - self.h_max_lpf = 0 - self.dist_lpf = 0 - self.lr_lpf = 0 - self.mu_lpf = 0 - - def get_lr_mu(self): - p = (np.square(self.dist_avg) * 
np.square(self.h_min)) / (2 * self.g_var) - w3 = p * (np.sqrt(0.25 + p / 27.0) - 0.5) - w = np.power(w3, 1/3) - y = w - p / (3 * w) - sqrt_mu1 = y + 1 - - sqrt_h_min = np.sqrt(self.h_min) - sqrt_h_max = np.sqrt(self.h_max) - sqrt_mu2 = (sqrt_h_max - sqrt_h_min) / (sqrt_h_max + sqrt_h_min) - - sqrt_mu = max(sqrt_mu1, sqrt_mu2) - if sqrt_mu2 > sqrt_mu1: - print('note: taking dr calculation. something may have exploded.') - - lr = np.square(1 - sqrt_mu) / self.h_min - mu = np.square(sqrt_mu) - return lr, mu - - def compute(self, dW, W): - if self.accum is None: - self.accum = np.zeros_like(dW) - - # TODO: prevent allocations everywhere by using [:]. - # assuming that really works. i haven't actually checked. - - total_norm = np.linalg.norm(dW) - clip_scale = self.clip / (total_norm + 1e-6) - if clip_scale < 1: - #print("clipping gradients; norm: {:10.5f}".format(total_norm)) - dW *= clip_scale - - #fmt = 'W std: {:10.7f}e-3, dWstd: {:10.7f}e-3, V std: {:10.7f}e-3' - #print(fmt.format(np.std(W), np.std(dW) * 100, np.std(V) * 100)) - - b = self.beta - m1b = 1 - self.beta - debias = 1 / (1 - self.beta_t) if self.debias_enabled else 1 - - g = dW - g_squared = np.square(g) - g_norm_squared = np.sum(g_squared) - g_norm = np.sqrt(g_norm_squared) - - self.curv_win[self.step % self.window_size] = g_norm_squared - valid_window = self.curv_win[:min(self.window_size, self.step + 1)] - h_min_t = np.min(valid_window) - h_max_t = np.max(valid_window) - - self.g_lpf = b * self.g_lpf + m1b * g - #self.g_squared_lpf = b * self.g_squared_lpf + m1b * g_squared - self.g_norm_squared_lpf = b * self.g_norm_squared_lpf + m1b * g_norm_squared - self.g_norm_lpf = b * self.g_norm_lpf + m1b * g_norm - self.h_min_lpf = b * self.h_min_lpf + m1b * h_min_t - self.h_max_lpf = b * self.h_max_lpf + m1b * h_max_t - - g_avg = debias * self.g_lpf - #g_squared_avg = debias * self.g_squared_lpf - g_norm_squared_avg = debias * self.g_norm_squared_lpf - g_norm_avg = debias * self.g_norm_lpf - self.h_min = debias * self.h_min_lpf - self.h_max = debias * self.h_max_lpf - assert self.h_max >= self.h_min - - dist = g_norm_avg / g_norm_squared_avg - - self.dist_lpf = b * self.dist_lpf + m1b * dist - - self.dist_avg = debias * self.dist_lpf - - self.g_var = g_norm_squared_avg - np.sum(np.square(g_avg)) - # equivalently: - #self.g_var = np.sum(np.abs(g_squared_avg - np.square(g_avg))) - - if self.step > 0: - lr_for_real, mu_for_real = self.get_lr_mu() - self.mu_lpf = b * self.mu_lpf + m1b * mu_for_real - self.lr_lpf = b * self.lr_lpf + m1b * lr_for_real - self.mu = debias * self.mu_lpf - self.lr = debias * self.lr_lpf - - self.accum[:] = self.accum * self.mu - self.lr * dW - V = self.accum - - self.step += 1 - self.beta_t *= self.beta - return V - -class AddSign(Optimizer): - # paper: https://arxiv.org/abs/1709.07417 - - def __init__(self, lr=0.01, mu=0.9, alpha=1): - self.mu = _f(mu) - self.alpha = _f(alpha) - - super().__init__(lr) - - def reset(self): - self.accum = None - - def compute(self, dW, W): - if self.accum is None: - self.accum = np.zeros_like(dW) - - self.accum[:] = self.accum * self.mu + dW - - signed = np.sign(dW) * np.sign(self.accum) - #signed *= decay - - return -self.lr * dW * (self.alpha + signed) - -class PowerSign(Optimizer): - # paper: https://arxiv.org/abs/1709.07417 - - def __init__(self, lr=0.01, mu=0.9, alpha=np.e): - self.mu = _f(mu) - self.alpha = _f(alpha) - self.use_exp = np.isclose(self.alpha, _f(np.e)) - - super().__init__(lr) - - def reset(self): - self.accum = None - - def compute(self, dW, W): - if 
self.accum is None: - self.accum = np.zeros_like(dW) - - self.accum[:] = self.accum * self.mu + dW - - signed = np.sign(dW) * np.sign(self.accum) - #signed *= decay - - if self.use_exp: - return -self.lr * dW * np.exp(signed) - else: - return -self.lr * dW * np.power(self.alpha, signed) - -class Neumann(Optimizer): - # paper: https://arxiv.org/abs/1712.03298 - # NOTE: this implementation is missing resetting as described in the paper. - # resetting is totally disabled for now. - # NOTE: this implementation does not use vanilla SGD for its first epochs. - # you should do this yourself if you need it. - # it seems like using a Learner like SineCLR makes this unnecessary. - - def __init__(self, lr=0.01): - self.alpha = _f(1e-7) # cubic. - self.beta = _f(1e-5) # repulsive. NOTE: multiplied by len(dW) later. - self.gamma = _f(0.99) # EMA, or 1-pole low-pass parameter. same thing. - # momentum is ∝ (in the shape of) 1 - 1/(1 + t) - self.mu_min = _f(0.5) - self.mu_max = _f(0.9) - self.reset_period = 0 # TODO - - super().__init__(lr) - - def reset(self): - # NOTE: mt and vt are different than the pair in Adam-like optimizers. - self.mt = None # momentum accumulator. - self.vt = None # weight accumulator. - self.t = 0 - - def compute(self, dW, W): - raise Exception("compute() is not available for this Optimizer.") - - def update(self, dW, W): - self.t += 1 - - if self.mt is None: - self.mt = np.zeros_like(dW) - if self.vt is None: - self.vt = np.zeros_like(dW) - - if self.reset_period > 0 and (self.t - 1) % self.reset_period == 0: - self.mt = -self.lr * dW - return - - mu = _1 - _1/_f(self.t) # the + 1 is implicit. - mu = (mu + self.mu_min) * (self.mu_max - self.mu_min) - - delta = W - self.vt - delta_norm_squared = np.square(delta).sum() - delta_norm = np.sqrt(delta_norm_squared) - - alpha = self.alpha - beta = self.beta * dW.size - - cubic_reg = alpha * delta_norm_squared - repulsive_reg = beta / delta_norm_squared - dt = dW + (cubic_reg - repulsive_reg) * (delta / delta_norm) - - self.mt = mu * self.mt - self.lr * dt - - W += mu * self.mt - self.lr * dt - self.vt = W + self.gamma * (self.vt - W) - -# Nonparametric Layers {{{1 - -class AlphaDropout(Layer): - # to be used alongside Selu activations. - # paper: https://arxiv.org/abs/1706.02515 - - def __init__(self, dropout=0.0, alpha=1.67326324, lamb=1.05070099): - super().__init__() - self.alpha = _f(alpha) - self.lamb = _f(lamb) - self.saturated = -self.lamb * self.alpha - self.dropout = _f(dropout) - - @property - def dropout(self): - return self._dropout - - @dropout.setter - def dropout(self, x): - self._dropout = _f(x) - self.q = 1 - self._dropout - assert 0 <= self.q <= 1 - - sat = self.saturated - - self.a = 1 / np.sqrt(self.q + sat * sat * self.q * self._dropout) - self.b = -self.a * (self._dropout * sat) - - def forward(self, X): - self.mask = np.random.rand(*X.shape) < self.q - return self.a * np.where(self.mask, X, self.saturated) + self.b - - def forward_deterministic(self, X): - return X - - def backward(self, dY): - return dY * self.a * self.mask - -class Decimate(Layer): - # simple decimaton layer that drops every other sample from the last axis. - - def __init__(self, phase='even'): - super().__init__() - # phase is the set of samples we keep in the forward pass. 
- assert phase in ('even', 'odd'), phase - self.phase = phase - - def make_shape(self, parent): - shape = parent.output_shape - self.input_shape = shape - divy = (shape[-1] + 1) // 2 if self.phase == 'even' else shape[-1] // 2 - self.output_shape = tuple(list(shape[:-1]) + [divy]) - self.dX = np.zeros(self.input_shape, dtype=_f) - - def forward(self, X): - self.batch_size = X.shape[0] - if self.phase == 'even': - return X.ravel()[0::2].reshape(self.batch_size, *self.output_shape) - elif self.phase == 'odd': - return X.ravel()[1::2].reshape(self.batch_size, *self.output_shape) - - def backward(self, dY): - assert dY.shape[0] == self.batch_size - dX = np.zeros((self.batch_size, *self.input_shape), dtype=_f) - if self.phase == 'even': - dX.ravel()[0::2] = dY.ravel() - elif self.phase == 'odd': - dX.ravel()[1::2] = dY.ravel() - return dX - -class Undecimate(Layer): - # inverse operation of Decimate. not quite interpolation. - - def __init__(self, phase='even'): - super().__init__() - # phase is the set of samples we keep in the backward pass. - assert phase in ('even', 'odd'), phase - self.phase = phase - - def make_shape(self, parent): - shape = parent.output_shape - self.input_shape = shape - mult = shape[-1] * 2 - self.output_shape = tuple(list(shape[:-1]) + [mult]) - - def forward(self, X): - self.batch_size = X.shape[0] - Y = np.zeros((self.batch_size, *self.output_shape), dtype=_f) - if self.phase == 'even': - Y.ravel()[0::2] = X.ravel() - elif self.phase == 'odd': - Y.ravel()[1::2] = X.ravel() - return Y - - def backward(self, dY): - assert dY.shape[0] == self.batch_size - if self.phase == 'even': - return dY.ravel()[0::2].reshape(self.batch_size, *self.input_shape) - elif self.phase == 'odd': - return dY.ravel()[1::2].reshape(self.batch_size, *self.input_shape) - -# Activations {{{2 - -class Selu(Layer): - # paper: https://arxiv.org/abs/1706.02515 - - def __init__(self, alpha=1.67326324, lamb=1.05070099): - super().__init__() - self.alpha = _f(alpha) - self.lamb = _f(lamb) - - def forward(self, X): - self.cond = X >= 0 - self.neg = self.alpha * np.exp(X) - return self.lamb * np.where(self.cond, X, self.neg - self.alpha) - - def backward(self, dY): - return dY * self.lamb * np.where(self.cond, 1, self.neg) - -class TanhTest(Layer): - def forward(self, X): - self.sig = np.tanh(1 / 2 * X) - return 2.4004 * self.sig - - def backward(self, dY): - return dY * (1 / 2 * 2.4004) * (1 - self.sig * self.sig) - -class ExpGB(Layer): - # an output layer for one-hot classification problems. - # use with MSE (SquaredHalved), not CategoricalCrossentropy! - # paper: https://arxiv.org/abs/1707.04199 - - def __init__(self, alpha=0.1, beta=0.0): - super().__init__() - self.alpha = _f(alpha) - self.beta = _f(beta) - - def forward(self, X): - return self.alpha * np.exp(X) + self.beta - - def backward(self, dY): - # this gradient is intentionally incorrect. - return dY - -class CubicGB(Layer): - # an output layer for one-hot classification problems. - # use with MSE (SquaredHalved), not CategoricalCrossentropy! - # paper: https://arxiv.org/abs/1707.04199 - # note: in the paper, it's called pow3GB, which is ugly. - - def __init__(self, alpha=0.1, beta=0.0): - # note: the paper suggests defaults of 0.001 and 0.0, - # but these didn't seem to work as well in my limited testing. - super().__init__() - self.alpha = _f(alpha) - self.beta = _f(beta) - - def forward(self, X): - return self.alpha * X**3 + self.beta - - def backward(self, dY): - # this gradient is intentionally incorrect. 
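-        # (dY is passed straight through, as if this layer were the identity.)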
- return dY - -# Parametric Layers {{{1 - -class Conv1Dper(Layer): - # periodic (circular) convolution. - # currently only supports one channel I/O. - # some notes: - # we could use FFTs for larger convolutions. - # i think storing the coefficients backwards would - # eliminate reversal in the critical code. - - serialize = { - 'W': 'coeffs', - } - - def __init__(self, kernel_size, pos=None, - init=init_glorot_uniform, reg_w=None): - super().__init__() - self.kernel_size = int(kernel_size) - self.coeffs = self._new_weights('coeffs', init=init, regularizer=reg_w) - if pos is None: - self.wrap0 = (self.kernel_size - 0) // 2 - self.wrap1 = (self.kernel_size - 1) // 2 - elif pos == 'alt': - self.wrap0 = (self.kernel_size - 1) // 2 - self.wrap1 = (self.kernel_size - 0) // 2 - elif pos == 'left': - self.wrap0 = 0 - self.wrap1 = self.kernel_size - 1 - elif pos == 'right': - self.wrap0 = self.kernel_size - 1 - self.wrap1 = 0 - else: - raise Exception("pos parameter not understood: {}".format(pos)) - - def make_shape(self, parent): - shape = parent.output_shape - self.input_shape = shape - assert len(shape) == 1, shape - self.output_shape = shape - self.coeffs.shape = (1, self.kernel_size) - - def forward(self, X): - if self.wrap0 == 0: - Xper = np.hstack((X,X[:,:self.wrap1])) - elif self.wrap1 == 0: - Xper = np.hstack((X[:,-self.wrap0:],X)) - else: - Xper = np.hstack((X[:,-self.wrap0:],X,X[:,:self.wrap1])) - self.cols = rolling_batch(Xper, self.kernel_size) - convolved = (self.cols * self.coeffs.f[:,::-1]).sum(2) - return convolved - - def backward(self, dY): - self.coeffs.g += (dY[:,:,None] * self.cols).sum(0)[:,::-1].sum(0, keepdims=True) - return (dY[:,:,None] * self.coeffs.f[:,::-1]).sum(2) - -class LayerNorm(Layer): - # paper: https://arxiv.org/abs/1607.06450 - # note: nonparametric when affine == False - - def __init__(self, eps=1e-5, affine=True): - super().__init__() - self.eps = _f(eps) - self.affine = bool(affine) - - if self.affine: - self.gamma = self._new_weights('gamma', init=init_ones) - self.beta = self._new_weights('beta', init=init_zeros) - self.serialized = { - 'gamma': 'gamma', - 'beta': 'beta', - } - - def make_shape(self, parent): - shape = parent.output_shape - self.input_shape = shape - self.output_shape = shape - assert len(shape) == 1, shape - if self.affine: - self.gamma.shape = (shape[0],) - self.beta.shape = (shape[0],) - - def forward(self, X): - self.mean = X.mean(0) - self.center = X - self.mean - self.var = self.center.var(0) + self.eps - self.std = np.sqrt(self.var) - - self.Xnorm = self.center / self.std - if self.affine: - return self.gamma.f * self.Xnorm + self.beta.f - return self.Xnorm - - def backward(self, dY): - length = dY.shape[0] - - if self.affine: - dXnorm = dY * self.gamma.f - self.gamma.g += (dY * self.Xnorm).sum(0) - self.beta.g += dY.sum(0) - else: - dXnorm = dY - - dstd = (dXnorm * self.center).sum(0) / -self.var - dcenter = dXnorm / self.std + dstd / self.std * self.center / length - dmean = -dcenter.sum(0) - dX = dcenter + dmean / length - - return dX - -class Denses(Layer): # TODO: rename? - # acts as a separate Dense for each row or column. only for 2D arrays. 
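-    # e.g. with axis=1 (what the default of -1 becomes on 2D input), each row
-    # gets its own dense mapping of (in_cols -> dim); with axis=0, each
-    # column gets its own dense mapping of (in_rows -> dim).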
- - serialized = { - 'W': 'coeffs', - 'b': 'biases', - } - - def __init__(self, dim, init=init_he_uniform, reg_w=None, reg_b=None, axis=-1): - super().__init__() - self.dim = int(dim) - self.weight_init = init - self.axis = int(axis) - self.coeffs = self._new_weights('coeffs', init=init, regularizer=reg_w) - self.biases = self._new_weights('biases', init=init_zeros, regularizer=reg_b) - - def make_shape(self, parent): - shape = parent.output_shape - self.input_shape = shape - assert len(shape) == 2, shape - - assert -len(shape) <= self.axis < len(shape) - self.axis = self.axis % len(shape) - - self.output_shape = list(shape) - self.output_shape[self.axis] = self.dim - self.output_shape = tuple(self.output_shape) - - in_rows = self.input_shape[0] - in_cols = self.input_shape[1] - out_rows = self.output_shape[0] - out_cols = self.output_shape[1] - - self.coeffs.shape = (in_rows, in_cols, self.dim) - self.biases.shape = (1, out_rows, out_cols) - - def forward(self, X): - self.X = X - if self.axis == 0: - return np.einsum('ixj,xjk->ikj', X, self.coeffs.f) + self.biases.f - elif self.axis == 1: - return np.einsum('ijx,jxk->ijk', X, self.coeffs.f) + self.biases.f - - def backward(self, dY): - self.biases.g += dY.sum(0, keepdims=True) - if self.axis == 0: - self.coeffs.g += np.einsum('ixj,ikj->xjk', self.X, dY) - return np.einsum('ikj,xjk->ixj', dY, self.coeffs.f) - elif self.axis == 1: - self.coeffs.g += np.einsum('ijx,ijk->jxk', self.X, dY) - return np.einsum('ijk,jxk->ijx', dY, self.coeffs.f) - -class CosineDense(Dense): - # paper: https://arxiv.org/abs/1702.05870 - # another implementation: https://github.com/farizrahman4u/keras-contrib/pull/36 - # the paper doesn't mention bias, - # so we treat bias as an additional weight with a constant input of 1. - # this is correct in Dense layers, so i hope it's correct here too. - - eps = 1e-4 - - def forward(self, X): - self.X = X - self.X_norm = np.sqrt(np.square(X).sum(-1, keepdims=True) \ - + 1 + self.eps) - self.W_norm = np.sqrt(np.square(self.coeffs.f).sum(0, keepdims=True) \ - + np.square(self.biases.f) + self.eps) - self.dot = X @ self.coeffs.f + self.biases.f - Y = self.dot / (self.X_norm * self.W_norm) - return Y - - def backward(self, dY): - ddot = dY / self.X_norm / self.W_norm - dX_norm = -(dY * self.dot / self.W_norm).sum(-1, keepdims=True) / self.X_norm**2 - dW_norm = -(dY * self.dot / self.X_norm).sum( 0, keepdims=True) / self.W_norm**2 - - self.coeffs.g += self.X.T @ ddot \ - + dW_norm / self.W_norm * self.coeffs.f - self.biases.g += ddot.sum(0, keepdims=True) \ - + dW_norm / self.W_norm * self.biases.f - dX = ddot @ self.coeffs.f.T + dX_norm / self.X_norm * self.X - - return dX - -# Rituals {{{1 - -def stochastic_multiply(W, gamma=0.5, allow_negation=False): - # paper: https://arxiv.org/abs/1606.01981 - - assert W.ndim == 1, W.ndim - assert 0 < gamma < 1, gamma - size = len(W) - alpha = np.max(np.abs(W)) - # NOTE: numpy gives [low, high) but the paper advocates [low, high] - mult = np.random.uniform(gamma, 1/gamma, size=size) - if allow_negation: - # NOTE: i have yet to see this do anything but cause divergence. - # i've referenced the paper several times yet still don't understand - # what i'm doing wrong, so i'm disabling it by default in my code. - # maybe i just need *a lot* more weights to compensate. 
- prob = (W / alpha + 1) / 2 - samples = np.random.random_sample(size=size) - mult *= np.where(samples < prob, 1, -1) - np.multiply(W, mult, out=W) - -class StochMRitual(Ritual): - # paper: https://arxiv.org/abs/1606.01981 - # this probably doesn't make sense for regression problems, - # let alone small models, but here it is anyway! - - def __init__(self, learner=None, gamma=0.5): - super().__init__(learner) - self.gamma = _f(gamma) - - def prepare(self, model): - self.W = np.copy(model.W) - super().prepare(model) - - def learn(self, inputs, outputs): - # an experiment: - #assert self.learner.rate < 10, self.learner.rate - #self.gamma = 1 - 1/2**(1 - np.log10(self.learner.rate)) - - self.W[:] = self.model.W - for layer in self.model.ordered_nodes: - if isinstance(layer, Dense): - stochastic_multiply(layer.coeffs.ravel(), gamma=self.gamma) - residual = super().learn(inputs, outputs) - self.model.W[:] = self.W - return residual - - def update(self): - super().update() - f = 0.5 - for layer in self.model.ordered_nodes: - if isinstance(layer, Dense): - np.clip(layer.W, -layer.std * f, layer.std * f, out=layer.W) - # np.clip(layer.W, -1, 1, out=layer.W) - -class NoisyRitual(Ritual): - def __init__(self, learner=None, - input_noise=0, output_noise=0, gradient_noise=0): - self.input_noise = _f(input_noise) - self.output_noise = _f(output_noise) - self.gradient_noise = _f(gradient_noise) - super().__init__(learner) - - def learn(self, inputs, outputs): - # this is pretty crude - if self.input_noise > 0: - s = self.input_noise - inputs = inputs + np.random.normal(0, s, size=inputs.shape) - if self.output_noise > 0: - s = self.output_noise - outputs = outputs + np.random.normal(0, s, size=outputs.shape) - return super().learn(inputs, outputs) - - def update(self): - # gradient noise paper: https://arxiv.org/abs/1511.06807 - if self.gradient_noise > 0: - size = len(self.model.dW) - gamma = 0.55 - #s = self.gradient_noise / (1 + self.bn) ** gamma - # experiments: - s = self.gradient_noise * np.sqrt(self.learner.rate) - #s = np.square(self.learner.rate) - #s = self.learner.rate / self.en - self.model.dW += np.random.normal(0, max(s, 1e-8), size=size) - super().update() - -# Learners {{{1 - -class PolyLearner(Learner): - per_batch = True - - def __init__(self, optim, epochs=400, coeffs=(1,)): - self.coeffs = tuple(coeffs) - super().__init__(optim, epochs, np.polyval(self.coeffs, 0)) - - def rate_at(self, epoch): - progress = (epoch - 1) / (self.epochs) - ret = np.polyval(self.coeffs, progress) - return np.abs(ret) - -# Components {{{1 - -def _mr_make_norm(norm, *args, **kwargs): - def _mr_norm(y, width, depth, block, multi, activation, style, FC, d): - skip = y - merger = Sum() - skip.feed(merger) - z_start = skip - z_start = z_start.feed(norm(*args, **kwargs)) - z_start = z_start.feed(activation()) - for _ in range(multi): - z = z_start - for j in range(block): - if j > 0: - z = z.feed(norm(*args, **kwargs)) - z = z.feed(activation()) - z = z.feed(FC()) - z.feed(merger) - y = merger - return y - return _mr_norm - -def _mr_batchless(y, width, depth, block, multi, activation, style, FC, d): - skip = y - merger = Sum() - skip.feed(merger) - z_start = skip.feed(activation()) - for _ in range(multi): - z = z_start - for j in range(block): - if j > 0: - z = z.feed(activation()) - z = z.feed(FC()) - z.feed(merger) - y = merger - return y - -def _mr_onelesssum(y, width, depth, block, multi, activation, style, FC, d): - # this is my own awful contraption. 
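-    # like the batchless style, but the final depth level omits the residual
-    # skip connection (and omits the Sum node entirely when multi == 1).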
- is_last = d + 1 == depth - needs_sum = not is_last or multi > 1 - skip = y - if needs_sum: - merger = Sum() - if not is_last: - skip.feed(merger) - z_start = skip.feed(activation()) - for _ in range(multi): - z = z_start - for j in range(block): - if j > 0: - z = z.feed(activation()) - z = z.feed(FC()) - if needs_sum: - z.feed(merger) - if needs_sum: - y = merger - else: - y = z - return y - -_mr_styles = dict( - lnorm=_mr_make_norm(LayerNorm), - batchless=_mr_batchless, - onelesssum=_mr_onelesssum, -) - -def multiresnet(x, width, depth, block=2, multi=1, - activation=Relu, style='batchless', - init=init_he_normal): - if style == 'cossim': - style = 'batchless' - DenseClass = CosineDense - else: - DenseClass = Dense - if style not in _mr_styles: - raise Exception('unknown resnet style', style) - - y = x - last_size = x.output_shape[0] - - for d in range(depth): - size = width - FC = lambda: DenseClass(size, init) - - if last_size != size: - y = y.feed(FC()) - - y = _mr_styles[style](y, width, depth, block, multi, activation, style, FC, d) - - last_size = size - - return y - -# Toy Data {{{1 - -inits = dict(he_normal=init_he_normal, he_uniform=init_he_uniform, - glorot_normal=init_glorot_normal, glorot_uniform=init_glorot_uniform, - gaussian_unit=init_gaussian_unit) -activations = dict(sigmoid=Sigmoid, tanh=Tanh, lecun=LeCunTanh, - relu=Relu, elu=Elu, gelu=GeluApprox, selu=Selu, - softplus=Softplus) - -def prettyize(data): - if isinstance(data, np.ndarray): - s = ', '.join(('{:8.2e}'.format(n) for n in data)) - s = '[' + s + ']' - else: - s = '{:8.2e}'.format(data) - return s - -def normalize_data(data, mean=None, std=None): - # in-place - if mean is None or std is None: - mean = np.mean(data, axis=0) - std = np.std(data, axis=0) - mean_str = prettyize(mean) - std_str = prettyize(std) - lament('nod(...,\n {},\n {})'.format(mean_str, std_str)) - sys.exit(1) - data -= _f(mean) - data /= _f(std) - -def toy_data(train_samples, valid_samples, problem=2): - total_samples = train_samples + valid_samples - - nod = normalize_data # shorthand to keep a sane indentation - - if problem == 0: - from ml.cie_mlp_data import inputs, outputs, valid_inputs, valid_outputs - inputs, outputs = _f(inputs), _f(outputs) - valid_inputs, valid_outputs = _f(valid_inputs), _f(valid_outputs) - - nod(inputs, 127.5, 73.9) - nod(outputs, 44.8, 21.7) - nod(valid_inputs, 127.5, 73.9) - nod(valid_outputs, 44.8, 21.7) - - elif problem == 1: - from sklearn.datasets import make_friedman1 - inputs, outputs = make_friedman1(total_samples) - inputs, outputs = _f(inputs), _f(outputs) - outputs = np.expand_dims(outputs, -1) - - nod(inputs, 0.5, 1/np.sqrt(12)) - nod(outputs, 14.4, 4.9) - - elif problem == 2: - from sklearn.datasets import make_friedman2 - inputs, outputs = make_friedman2(total_samples) - inputs, outputs = _f(inputs), _f(outputs) - outputs = np.expand_dims(outputs, -1) - - nod(inputs, - [5.00e+01, 9.45e+02, 5.01e-01, 5.98e+00], - [2.89e+01, 4.72e+02, 2.89e-01, 2.87e+00]) - - nod(outputs, [482], [380]) - - elif problem == 3: - from sklearn.datasets import make_friedman3 - inputs, outputs = make_friedman3(total_samples) - inputs, outputs = _f(inputs), _f(outputs) - outputs = np.expand_dims(outputs, -1) - - nod(inputs, - [4.98e+01, 9.45e+02, 4.99e-01, 6.02e+00], - [2.88e+01, 4.73e+02, 2.90e-01, 2.87e+00]) - - nod(outputs, [1.32327931], [0.31776295]) - - else: - raise Exception("unknown toy data set", problem) - - if problem != 0: - # split off a validation set - indices = np.arange(inputs.shape[0]) - 
np.random.shuffle(indices) - valid_inputs = inputs[indices][-valid_samples:] - valid_outputs = outputs[indices][-valid_samples:] - inputs = inputs[indices][:-valid_samples] - outputs = outputs[indices][:-valid_samples] - - return (inputs, outputs), (valid_inputs, valid_outputs) - -# Model Creation {{{1 - -def optim_from_config(config): - if config.optim == 'adam': - d1 = config.optim_decay1 if 'optim_decay1' in config else 9.5 - d2 = config.optim_decay2 if 'optim_decay2' in config else 999.5 - b1 = np.exp(-1/d1) - b2 = np.exp(-1/d2) - o = Nadam if config.nesterov else Adam - optim = o(b1=b1, b2=b2) - elif config.optim == 'ftml': - d1 = config.optim_decay1 if 'optim_decay1' in config else 2 - d2 = config.optim_decay2 if 'optim_decay2' in config else 999.5 - b1 = np.exp(-1/d1) - b2 = np.exp(-1/d2) - optim = FTML(b1=b1, b2=b2) - elif config.optim == 'yf': - d1 = config.optim_decay1 if 'optim_decay1' in config else 999.5 - d2 = config.optim_decay2 if 'optim_decay2' in config else 999.5 - if d1 != d2: - raise Exception("yellowfin only uses one decay term.") - beta = np.exp(-1/d1) - optim = YellowFin(beta=beta) - elif config.optim in ('ag', 'adagrad'): - optim = Adagrad() - elif config.optim in ('rms', 'rmsprop'): - d2 = config.optim_decay2 if 'optim_decay2' in config else 99.5 - mu = np.exp(-1/d2) - optim = RMSprop(mu=mu) - elif config.optim == 'rmsc': - d2 = config.optim_decay2 if 'optim_decay2' in config else 9.5 - mu = np.exp(-1/d2) - optim = RMSpropCentered(momentum=mu) - elif config.optim == 'sgd': - d1 = config.optim_decay1 if 'optim_decay1' in config else 0 - clip = config.gradient_clip if 'gradient_clip' in config else 0.0 - if d1 > 0 or clip > 0: - b1 = np.exp(-1/d1) if d1 > 0 else 0 - if clip > 0: - optim = MomentumClip(mu=b1, nesterov=config.nesterov, clip=clip) - else: - optim = Momentum(mu=b1, nesterov=config.nesterov) - else: - optim = Optimizer() - elif config.optim == 'ps': - d1 = config.optim_decay1 if 'optim_decay1' in config else 9.5 - b1 = np.exp(-1/d1) - optim = PowerSign(mu=b1) - elif config.optim == 'as': - d1 = config.optim_decay1 if 'optim_decay1' in config else 9.5 - b1 = np.exp(-1/d1) - optim = AddSign(mu=b1) - else: - raise Exception('unknown optimizer', config.optim) - - return optim - -def learner_from_config(config, optim, rscb): - if config.learner == 'sgdr': - expando = config.expando if 'expando' in config else None - learner = SGDR(optim, epochs=config.epochs, rate=config.learn, - restart_decay=config.restart_decay, restarts=config.restarts, - callback=rscb, expando=expando) - # final learning rate isn't of interest here; it's gonna be close to 0. - log('total epochs', learner.epochs) - elif config.learner in ('sin', 'sine'): - lower_rate = config.learn * 1e-5 # TODO: allow access to this. - epochs = config.epochs * (config.restarts + 1) - frequency = config.epochs - learner = SineCLR(optim, epochs=epochs, frequency=frequency, - upper_rate=config.learn, lower_rate=lower_rate, - callback=rscb) - elif config.learner == 'wave': - lower_rate = config.learn * 1e-5 # TODO: allow access to this. 
- epochs = config.epochs * (config.restarts + 1) - frequency = config.epochs - learner = WaveCLR(optim, epochs=epochs, frequency=frequency, - upper_rate=config.learn, lower_rate=lower_rate, - callback=rscb) - elif config.learner == 'anneal': - learner = AnnealingLearner(optim, epochs=config.epochs, rate=config.learn, - halve_every=config.learn_halve_every) - log("final learning rate", "{:10.8f}".format(learner.final_rate)) - elif config.learner == 'sgd': - learner = Learner(optim, epochs=config.epochs, rate=config.learn) - else: - raise Exception('unknown learner', config.learner) - - return learner - -def lookup_loss(maybe_name): - if isinstance(maybe_name, Loss): - return maybe_name - elif maybe_name == 'mse': - return Squared() - elif maybe_name == 'mshe': # mushy - return SquaredHalved() - elif maybe_name == 'mae': - return Absolute() - elif maybe_name == 'msee': - return SomethingElse() - elif maybe_name == 'huber': - return Huber(delta=0.1) - raise Exception('unknown objective', maybe_name) - -def ritual_from_config(config, learner): - if config.ritual == 'default': - ritual = Ritual(learner=learner) - elif config.ritual == 'stochm': - ritual = StochMRitual(learner=learner) - elif config.ritual == 'noisy': - ritual = NoisyRitual(learner=learner, - input_noise=1e-1, output_noise=1e-2, - gradient_noise=2e-7) - else: - raise Exception('unknown ritual', config.ritual) - - return ritual - -def model_from_config(config, input_features, output_features, callbacks=None): - init = inits[config.init] - activation = activations[config.activation] - - x = Input(shape=(input_features,)) - y = x - y = multiresnet(y, - config.res_width, config.res_depth, - config.res_block, config.res_multi, - activation=activation, init=init, - style=config.parallel_style) - if y.output_shape[0] != output_features: - y = y.feed(Dense(output_features, init)) - - loss = lookup_loss(config.loss) - mloss = lookup_loss(config.mloss) if config.mloss else loss - - model = Model(x, y, loss=loss, mloss=mloss, unsafe=config.unsafe) - - if config.fn_load is not None: - log('loading weights', config.fn_load) - model.load_weights(config.fn_load) - - optim = optim_from_config(config) - - def rscb(restart): - if callbacks: - callbacks.restart() - log("restarting", restart) - if config.restart_optim: - optim.reset() - - learner = learner_from_config(config, optim, rscb) - - ritual = ritual_from_config(config, learner) - - return model, learner, ritual - -# main program {{{1 - -def run(program, args=None): - args = args if args else [] - - lower_priority() - np.random.seed(42069) - - # Config {{{2 - - from dotmap import DotMap - config = DotMap( - fn_load = None, - fn_save = 'optim_nn.h5', - log_fn = 'losses.npz', - - # multi-residual network parameters - res_width = 28, - res_depth = 2, - res_block = 3, # normally 2 for plain resnet - res_multi = 2, # normally 1 for plain resnet - - # style of resnet (order of layers, which layers, etc.) - parallel_style = 'onelesssum', - activation = 'gelu', - - #optim = 'ftml', - #optim_decay1 = 2, - #optim_decay2 = 100, - optim = 'adam', # note: most features only implemented for Adam - optim_decay1 = 24, # first momentum given in batches (optional) - optim_decay2 = 100, # second momentum given in batches (optional) - nesterov = True, # not available for all optimizers. 
- batch_size = 64, - - # learning parameters - learner = 'sgdr', - learn = 0.00125, - epochs = 24, - learn_halve_every = 16, # only used with anneal - restarts = 4, - restart_decay = 0.25, # only used with SGDR - expando = lambda i: 24 * i, - - # misc - init = 'he_normal', - loss = 'mse', - mloss = 'mse', - ritual = 'default', - restart_optim = False, # restarts also reset internal state of optimizer - warmup = False, # train a couple epochs on gaussian noise and reset - - # logging/output - log10_loss = True, # personally, i'm sick of looking linear loss values! - #fancy_logs = True, # unimplemented (can't turn it off yet) - - problem = 2, - compare = ( - # best results for ~10,000 parameters - # training/validation pairs for each problem (starting from problem 0): - (10**-3.120, 10**-2.901), - # 1080 epochs on these... - (10**-6.747, 10**-6.555), - (10**-7.774, 10**-7.626), - (10**-6.278, 10**-5.234), # overfitting? bad valid set? - ), - - unsafe = True, # aka gotta go fast mode - ) - - for k in ['parallel_style', 'activation', 'optim', 'learner', - 'init', 'loss', 'mloss', 'ritual']: - config[k] = config[k].lower() - - config.learn *= np.sqrt(config.batch_size) - - config.pprint() - - # Toy Data {{{2 - - (inputs, outputs), (valid_inputs, valid_outputs) = \ - toy_data(2**14, 2**11, problem=config.problem) - input_features = inputs.shape[-1] - output_features = outputs.shape[-1] - - # Our Test Model - - callbacks = Dummy() - - model, learner, ritual = \ - model_from_config(config, input_features, output_features, callbacks) - - # Model Information {{{2 - - model.print_graph() - log('parameters', model.param_count) - - # Training {{{2 - - batch_losses = [] - train_losses = [] - valid_losses = [] - - def measure_error(): - def print_error(name, inputs, outputs, comparison=None): - predicted = model.evaluate(inputs) - err = model.mloss.forward(predicted, outputs) - if config.log10_loss: - print(name, "{:12.6e}".format(err)) - if comparison: - err10 = np.log10(err) - cmp10 = np.log10(comparison) - color = '\x1B[31m' if err10 > cmp10 else '\x1B[32m' - log(name + " log10-loss", "{:+6.3f} {}({:+6.3f})\x1B[0m".format(err10, color, err10 - cmp10)) - else: - log(name + " log10-loss", "{:+6.3f}".format(err, np.log10(err))) - else: - log(name + " loss", "{:12.6e}".format(err)) - if comparison: - fmt = "10**({:+7.4f}) times" - log("improvement", fmt.format(np.log10(comparison / err))) - return err - - train_err = print_error("train", - inputs, outputs, - config.compare[config.problem][0]) - valid_err = print_error("valid", - valid_inputs, valid_outputs, - config.compare[config.problem][1]) - train_losses.append(train_err) - valid_losses.append(valid_err) - - callbacks.restart = measure_error - - training = config.epochs > 0 and config.restarts >= 0 - - ritual.prepare(model) - - if training and config.warmup and not config.fn_load: - log("warming", "up") - - # use plain SGD in warmup to prevent (or possibly cause?) 
numeric issues - temp_optim = learner.optim - temp_loss = model.loss - learner.optim = MomentumClip(lr=0.01, mu=0) - ritual.loss = Absolute() # less likely to blow up; more general - - # NOTE: experiment: trying const batches and batch_size - bs = 256 - target = 1 * 1024 * 1024 - # 4 being sizeof(float) - batches = (target / 4 / np.prod(inputs.shape[1:])) // bs * bs - ins = [int(batches)] + list( inputs.shape[1:]) - outs = [int(batches)] + list(outputs.shape[1:]) - - for _ in range(4): - ritual.train_batched( - np.random.normal(size=ins), - np.random.normal(size=outs), - batch_size=bs) - ritual.reset() - - learner.optim = temp_optim - model.loss = temp_loss - - if training: - measure_error() - - while training and learner.next(): - avg_loss, losses = ritual.train_batched( - inputs, outputs, - config.batch_size, - return_losses=True) - batch_losses += losses - - if config.log10_loss: - fmt = "epoch {:4.0f}, rate {:10.8f}, log10-loss {:+6.3f}" - log("info", fmt.format(learner.epoch, learner.rate, np.log10(avg_loss)), - update=True) - else: - fmt = "epoch {:4.0f}, rate {:10.8f}, loss {:12.6e}" - log("info", fmt.format(learner.epoch, learner.rate, avg_loss), - update=True) - - measure_error() - - if training and config.fn_save is not None: - log('saving weights', config.fn_save) - model.save_weights(config.fn_save, overwrite=True) - - if training and config.log_fn is not None: - log('saving losses', config.log_fn) - np.savez_compressed(config.log_fn, - batch_losses=np.array(batch_losses, dtype=_f), - train_losses=np.array(train_losses, dtype=_f), - valid_losses=np.array(valid_losses, dtype=_f)) - - # Evaluation {{{2 - # TODO: write this portion again - - return 0 - -# run main program {{{1 - -if __name__ == '__main__': - sys.exit(run(sys.argv[0], sys.argv[1:])) diff --git a/onn/__init__.py b/onn/__init__.py new file mode 100644 index 0000000..19d6020 --- /dev/null +++ b/onn/__init__.py @@ -0,0 +1,28 @@ +# external packages required for full functionality: +# numpy scipy h5py sklearn dotmap + +# BIG TODO: ensure numpy isn't upcasting to float64 *anywhere*. +# this is gonna take some work. + +from .activation import * +from .floats import * +from .initialization import * +from .layer import * +from .learner import * +from .loss import * +from .math import * +from .model import * +from .nodes import * +from .optimizer import * +from .parametric import * +from .regularizer import * +from .ritual import * +from .util import * +from .weights import * + +# this is similar to default behaviour of having no __all__ variable at all, +# but ours ignores modules as well. this allows for `import sys` and such +# without clobbering `from our_module import *`. +__all__ = [ + o for o in locals() + if type(o) != 'module' and not o.startswith('_')] diff --git a/onn/activation.py b/onn/activation.py new file mode 100644 index 0000000..7bba491 --- /dev/null +++ b/onn/activation.py @@ -0,0 +1,185 @@ +import numpy as np + +# just for speed, not strictly essential: +from scipy.special import expit as sigmoid + +from .floats import * +from .layer_base import * + +class Identity(Layer): + def forward(self, X): + return X + + def backward(self, dY): + return dY + +class Sigmoid(Layer): # aka Logistic, Expit (inverse of Logit) + def forward(self, X): + self.sig = sigmoid(X) + return self.sig + + def backward(self, dY): + return dY * self.sig * (1 - self.sig) + +class Softplus(Layer): + # integral of Sigmoid. 
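+    # f(x) = log(1 + exp(x)); its derivative is sigmoid(x).
+    # note: np.exp overflows to inf for large X here; if that matters,
+    # np.logaddexp(0, X) is a numerically safer (if slower) equivalent.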
+ + def forward(self, X): + self.X = X + return np.log(1 + np.exp(X)) + + def backward(self, dY): + return dY * sigmoid(self.X) + +class Tanh(Layer): + def forward(self, X): + self.sig = np.tanh(X) + return self.sig + + def backward(self, dY): + return dY * (1 - self.sig * self.sig) + +class LeCunTanh(Layer): + # paper: http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf + # paper: http://yann.lecun.com/exdb/publis/pdf/lecun-89.pdf + # scaled such that f([-1, 1]) = [-1, 1]. + # helps preserve an input variance of 1. + # second derivative peaks around an input of ±1. + + def forward(self, X): + self.sig = np.tanh(2 / 3 * X) + return 1.7159 * self.sig + + def backward(self, dY): + return dY * (2 / 3 * 1.7159) * (1 - self.sig * self.sig) + +class Relu(Layer): + def forward(self, X): + self.cond = X >= 0 + return np.where(self.cond, X, 0) + + def backward(self, dY): + return np.where(self.cond, dY, 0) + +class Elu(Layer): + # paper: https://arxiv.org/abs/1511.07289 + + def __init__(self, alpha=1): + super().__init__() + self.alpha = _f(alpha) # FIXME: unused + + def forward(self, X): + self.cond = X >= 0 + self.neg = np.exp(X) - 1 + return np.where(self.cond, X, self.neg) + + def backward(self, dY): + return dY * np.where(self.cond, 1, self.neg + 1) + +class GeluApprox(Layer): + # paper: https://arxiv.org/abs/1606.08415 + # plot: https://www.desmos.com/calculator/ydzgtccsld + + def forward(self, X): + self.a = 1.704 * X + self.sig = sigmoid(self.a) + return X * self.sig + + def backward(self, dY): + return dY * self.sig * (1 + self.a * (1 - self.sig)) + +class Softmax(Layer): + def forward(self, X): + alpha = np.max(X, axis=-1, keepdims=True) + num = np.exp(X - alpha) + den = np.sum(num, axis=-1, keepdims=True) + self.sm = num / den + return self.sm + + def backward(self, dY): + return (dY - np.sum(dY * self.sm, axis=-1, keepdims=True)) * self.sm + +class LogSoftmax(Softmax): + def __init__(self, eps=1e-6): + super().__init__() + self.eps = _f(eps) + + def forward(self, X): + return np.log(super().forward(X) + self.eps) + + def backward(self, dY): + return dY - np.sum(dY, axis=-1, keepdims=True) * self.sm + +class Cos(Layer): + # performs well on MNIST for some strange reason. + + def forward(self, X): + self.X = X + return np.cos(X) + + def backward(self, dY): + return dY * -np.sin(self.X) + +class Selu(Layer): + # paper: https://arxiv.org/abs/1706.02515 + + def __init__(self, alpha=1.67326324, lamb=1.05070099): + super().__init__() + self.alpha = _f(alpha) + self.lamb = _f(lamb) + + def forward(self, X): + self.cond = X >= 0 + self.neg = self.alpha * np.exp(X) + return self.lamb * np.where(self.cond, X, self.neg - self.alpha) + + def backward(self, dY): + return dY * self.lamb * np.where(self.cond, 1, self.neg) + +# more + +class TanhTest(Layer): + def forward(self, X): + self.sig = np.tanh(1 / 2 * X) + return 2.4004 * self.sig + + def backward(self, dY): + return dY * (1 / 2 * 2.4004) * (1 - self.sig * self.sig) + +class ExpGB(Layer): + # an output layer for one-hot classification problems. + # use with MSE (SquaredHalved), not CategoricalCrossentropy! + # paper: https://arxiv.org/abs/1707.04199 + + def __init__(self, alpha=0.1, beta=0.0): + super().__init__() + self.alpha = _f(alpha) + self.beta = _f(beta) + + def forward(self, X): + return self.alpha * np.exp(X) + self.beta + + def backward(self, dY): + # this gradient is intentionally incorrect. + return dY + +class CubicGB(Layer): + # an output layer for one-hot classification problems. 
+ # use with MSE (SquaredHalved), not CategoricalCrossentropy! + # paper: https://arxiv.org/abs/1707.04199 + # note: in the paper, it's called pow3GB, which is ugly. + + def __init__(self, alpha=0.1, beta=0.0): + # note: the paper suggests defaults of 0.001 and 0.0, + # but these didn't seem to work as well in my limited testing. + super().__init__() + self.alpha = _f(alpha) + self.beta = _f(beta) + + def forward(self, X): + return self.alpha * X**3 + self.beta + + def backward(self, dY): + # this gradient is intentionally incorrect. + return dY + diff --git a/onn/floats.py b/onn/floats.py new file mode 100644 index 0000000..7d3b2ca --- /dev/null +++ b/onn/floats.py @@ -0,0 +1,18 @@ +import numpy as np + +_f = np.float32 + +def _check(a): + assert isinstance(a, np.ndarray) or type(a) == _f, type(a) + assert a.dtype == _f, a.dtype + return a + +_0 = _f(0) +_1 = _f(1) +_2 = _f(2) +_inv2 = _f(1/2) +_sqrt2 = _f(np.sqrt(2)) +_invsqrt2 = _f(1/np.sqrt(2)) +_pi = _f(np.pi) + +__all__ = [o for o in locals()] diff --git a/onn/initialization.py b/onn/initialization.py new file mode 100644 index 0000000..46916c5 --- /dev/null +++ b/onn/initialization.py @@ -0,0 +1,31 @@ +import numpy as np + +# note: these are currently only implemented for 2D shapes. + +def init_zeros(size, ins=None, outs=None): + return np.zeros(size) + +def init_ones(size, ins=None, outs=None): + return np.ones(size) + +def init_he_normal(size, ins, outs): + s = np.sqrt(2 / ins) + return np.random.normal(0, s, size=size) + +def init_he_uniform(size, ins, outs): + s = np.sqrt(6 / ins) + return np.random.uniform(-s, s, size=size) + +def init_glorot_normal(size, ins, outs): + s = np.sqrt(2 / (ins + outs)) + return np.random.normal(0, s, size=size) + +def init_glorot_uniform(size, ins, outs): + s = np.sqrt(6 / (ins + outs)) + return np.random.uniform(-s, s, size=size) + +# more + +def init_gaussian_unit(size, ins, outs): + s = np.sqrt(1 / ins) + return np.random.normal(0, s, size=size) diff --git a/onn/layer.py b/onn/layer.py new file mode 100644 index 0000000..e1c9342 --- /dev/null +++ b/onn/layer.py @@ -0,0 +1,200 @@ +from .layer_base import * +from .initialization import * +from .floats import * + +# Nonparametric Layers {{{1 + +class Input(Layer): + def __init__(self, shape): + assert shape is not None + super().__init__() + self.shape = tuple(shape) + self.input_shape = self.shape + self.output_shape = self.shape + + def forward(self, X): + return X + + def backward(self, dY): + #self.dY = dY + return np.zeros_like(dY) + +class Reshape(Layer): + def __init__(self, new_shape): + super().__init__() + self.shape = tuple(new_shape) + self.output_shape = self.shape + + def forward(self, X): + self.batch_size = X.shape[0] + return X.reshape(self.batch_size, *self.output_shape) + + def backward(self, dY): + assert dY.shape[0] == self.batch_size + return dY.reshape(self.batch_size, *self.input_shape) + +class Flatten(Layer): + def make_shape(self, parent): + shape = parent.output_shape + self.input_shape = shape + self.output_shape = (np.prod(shape),) + + def forward(self, X): + self.batch_size = X.shape[0] + return X.reshape(self.batch_size, *self.output_shape) + + def backward(self, dY): + assert dY.shape[0] == self.batch_size + return dY.reshape(self.batch_size, *self.input_shape) + +class ConstAffine(Layer): + def __init__(self, a=1, b=0): + super().__init__() + self.a = _f(a) + self.b = _f(b) + + def forward(self, X): + return self.a * X + self.b + + def backward(self, dY): + return dY * self.a + +class Sum(Layer): + def 
_propagate(self, edges, deterministic): + return np.sum(edges, axis=0) + + def _backpropagate(self, edges): + #assert len(edges) == 1, "unimplemented" + return edges[0] # TODO: does this always work? + +class ActivityRegularizer(Layer): + def __init__(self, reg): + super().__init__() + assert isinstance(reg, Regularizer), reg + self.reg = reg + + def forward(self, X): + self.X = X + self.loss = np.sum(self.reg.forward(X)) + return X + + def backward(self, dY): + return dY + self.reg.backward(self.X) + +class Dropout(Layer): + def __init__(self, dropout=0.0): + super().__init__() + self.p = _f(1 - dropout) + assert 0 <= self.p <= 1 + + def forward(self, X): + self.mask = (np.random.rand(*X.shape) < self.p) / self.p + return X * self.mask + + def forward_deterministic(self, X): + #self.mask = _1 + return X + + def backward(self, dY): + return dY * self.mask + +# more + +class AlphaDropout(Layer): + # to be used alongside Selu activations. + # paper: https://arxiv.org/abs/1706.02515 + + def __init__(self, dropout=0.0, alpha=1.67326324, lamb=1.05070099): + super().__init__() + self.alpha = _f(alpha) + self.lamb = _f(lamb) + self.saturated = -self.lamb * self.alpha + self.dropout = _f(dropout) + + @property + def dropout(self): + return self._dropout + + @dropout.setter + def dropout(self, x): + self._dropout = _f(x) + self.q = 1 - self._dropout + assert 0 <= self.q <= 1 + + sat = self.saturated + + self.a = 1 / np.sqrt(self.q + sat * sat * self.q * self._dropout) + self.b = -self.a * (self._dropout * sat) + + def forward(self, X): + self.mask = np.random.rand(*X.shape) < self.q + return self.a * np.where(self.mask, X, self.saturated) + self.b + + def forward_deterministic(self, X): + return X + + def backward(self, dY): + return dY * self.a * self.mask + +class Decimate(Layer): + # simple decimaton layer that drops every other sample from the last axis. + + def __init__(self, phase='even'): + super().__init__() + # phase is the set of samples we keep in the forward pass. + assert phase in ('even', 'odd'), phase + self.phase = phase + + def make_shape(self, parent): + shape = parent.output_shape + self.input_shape = shape + divy = (shape[-1] + 1) // 2 if self.phase == 'even' else shape[-1] // 2 + self.output_shape = tuple(list(shape[:-1]) + [divy]) + self.dX = np.zeros(self.input_shape, dtype=_f) + + def forward(self, X): + self.batch_size = X.shape[0] + if self.phase == 'even': + return X.ravel()[0::2].reshape(self.batch_size, *self.output_shape) + elif self.phase == 'odd': + return X.ravel()[1::2].reshape(self.batch_size, *self.output_shape) + + def backward(self, dY): + assert dY.shape[0] == self.batch_size + dX = np.zeros((self.batch_size, *self.input_shape), dtype=_f) + if self.phase == 'even': + dX.ravel()[0::2] = dY.ravel() + elif self.phase == 'odd': + dX.ravel()[1::2] = dY.ravel() + return dX + +class Undecimate(Layer): + # inverse operation of Decimate. not quite interpolation. + + def __init__(self, phase='even'): + super().__init__() + # phase is the set of samples we keep in the backward pass. 
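+        # the forward pass zero-stuffs: the input lands in the even (or odd)
+        # slots of an output twice as long, with zeros everywhere else.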
+ assert phase in ('even', 'odd'), phase + self.phase = phase + + def make_shape(self, parent): + shape = parent.output_shape + self.input_shape = shape + mult = shape[-1] * 2 + self.output_shape = tuple(list(shape[:-1]) + [mult]) + + def forward(self, X): + self.batch_size = X.shape[0] + Y = np.zeros((self.batch_size, *self.output_shape), dtype=_f) + if self.phase == 'even': + Y.ravel()[0::2] = X.ravel() + elif self.phase == 'odd': + Y.ravel()[1::2] = X.ravel() + return Y + + def backward(self, dY): + assert dY.shape[0] == self.batch_size + if self.phase == 'even': + return dY.ravel()[0::2].reshape(self.batch_size, *self.input_shape) + elif self.phase == 'odd': + return dY.ravel()[1::2].reshape(self.batch_size, *self.input_shape) diff --git a/onn/layer_base.py b/onn/layer_base.py new file mode 100644 index 0000000..a8cc253 --- /dev/null +++ b/onn/layer_base.py @@ -0,0 +1,143 @@ +import numpy as np + +from collections import defaultdict, OrderedDict + +from .weights import * + +# used for numbering layers like Keras: +_layer_counters = defaultdict(lambda: 0) + +class LayerIncompatibility(Exception): + pass + +class Layer: + def __init__(self): + self.parents = [] + self.children = [] + self.weights = OrderedDict() + self.loss = None # for activity regularizers + self.input_shape = None + self.output_shape = None + kind = self.__class__.__name__ + global _layer_counters + _layer_counters[kind] += 1 + self.name = "{}_{}".format(kind, _layer_counters[kind]) + self.unsafe = False # disables assertions for better performance + self.shared = False # as in weight sharing + + def __str__(self): + return self.name + + # methods we might want to override: + + def forward(self, X): + raise NotImplementedError("unimplemented", self) + + def forward_deterministic(self, X): + return self.forward(X) + + def backward(self, dY): + raise NotImplementedError("unimplemented", self) + + def make_shape(self, parent): + if self.input_shape == None: + self.input_shape = parent.output_shape + if self.output_shape == None: + self.output_shape = self.input_shape + + def do_feed(self, child): + self.children.append(child) + + def be_fed(self, parent): + self.parents.append(parent) + + # TODO: better names for these (still) + def _propagate(self, edges, deterministic): + if not self.unsafe: + assert len(edges) == 1, self + if deterministic: + return self.forward_deterministic(edges[0]) + else: + return self.forward(edges[0]) + + def _backpropagate(self, edges): + if len(edges) == 1: + return self.backward(edges[0]) + return sum((self.backward(dY) for dY in edges)) + + # general utility methods: + + def is_compatible(self, parent): + return np.all(self.input_shape == parent.output_shape) + + def feed(self, child): + assert self.output_shape is not None, self + child.make_shape(self) + if not child.is_compatible(self): + fmt = "{} is incompatible with {}: shape mismatch: {} vs. {}" + raise LayerIncompatibility(fmt.format(self, child, self.output_shape, child.input_shape)) + self.do_feed(child) + child.be_fed(self) + return child + + def validate_input(self, X): + assert X.shape[1:] == self.input_shape, (str(self), X.shape[1:], self.input_shape) + + def validate_output(self, Y): + assert Y.shape[1:] == self.output_shape, (str(self), Y.shape[1:], self.output_shape) + + def _new_weights(self, name, **kwargs): + w = Weights(**kwargs) + assert name not in self.weights, name + self.weights[name] = w + return w + + def share(self, node): + self.weights = node.weights # TODO: this should be all it takes. 
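+        # both layers end up referencing the same Weights objects,
+        # so their values and gradients are shared.
+        # e.g. dense_b.share(dense_a) makes dense_b reuse dense_a's weights.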
+ for k, v in self.weights.items(): + vs = getattr(node, k) # hack: key isn't necessarily attribute name! + setattr(self, k, vs) + self.shared = True + + def clear_grad(self): + for name, w in self.weights.items(): + w.g[:] = 0 + + @property + def size(self): + return sum((w.size for w in self.weights.values())) + + def init(self, allocator): + ins, outs = self.input_shape[0], self.output_shape[0] + for k, w in self.weights.items(): + w.allocate(ins, outs, allocator=allocator) + + def propagate(self, values, deterministic): + if not self.unsafe: + assert self.parents, self + edges = [] + for parent in self.parents: + if parent in values: + X = values[parent] + if not self.unsafe: + self.validate_input(X) + edges.append(X) + Y = self._propagate(edges, deterministic) + if not self.unsafe: + self.validate_output(Y) + return Y + + def backpropagate(self, values): + if not self.unsafe: + assert self.children, self + edges = [] + for child in self.children: + if child in values: + dY = values[child] + if not self.unsafe: + self.validate_output(dY) + edges.append(dY) + dX = self._backpropagate(edges) + if not self.unsafe: + self.validate_input(dX) + return dX diff --git a/onn/learner.py b/onn/learner.py new file mode 100644 index 0000000..f60dc99 --- /dev/null +++ b/onn/learner.py @@ -0,0 +1,180 @@ +from .floats import * +from .optimizer_base import * + +class Learner: + per_batch = False + + def __init__(self, optim, epochs=100, rate=None): + assert isinstance(optim, Optimizer) + self.optim = optim + self.start_rate = rate # None is okay; it'll use optim.lr instead. + self.epochs = int(epochs) + self.reset() + + def reset(self, optim=False): + self.started = False + self.epoch = 0 + if optim: + self.optim.reset() + + @property + def epoch(self): + return self._epoch + + @epoch.setter + def epoch(self, new_epoch): + self._epoch = int(new_epoch) + if 0 <= self.epoch <= self.epochs: + self.rate = self.rate_at(self._epoch) + + @property + def rate(self): + return self.optim.lr + + @rate.setter + def rate(self, new_rate): + self.optim.lr = new_rate + + def rate_at(self, epoch): + if self.start_rate is None: + return self.optim.lr + return self.start_rate + + def next(self): + # prepares the next epoch. returns whether or not to continue training. + if not self.started: + self.started = True + self.epoch += 1 + if self.epoch > self.epochs: + return False + return True + + def batch(self, progress): # TODO: rename + # interpolates rates between epochs. + # unlike epochs, we do not store batch number as a state. + # i.e. calling next() will not respect progress. + assert 0 <= progress <= 1 + self.rate = self.rate_at(self._epoch + progress) + + @property + def final_rate(self): + return self.rate_at(self.epochs - 1e-8) + +class AnnealingLearner(Learner): + def __init__(self, optim, epochs=100, rate=None, halve_every=10): + self.halve_every = _f(halve_every) + self.anneal = _f(0.5**(1/self.halve_every)) + super().__init__(optim, epochs, rate) + + def rate_at(self, epoch): + return super().rate_at(epoch) * self.anneal**epoch + +def cosmod(x): + # plot: https://www.desmos.com/calculator/hlgqmyswy2 + return (_1 + np.cos((x % _1) * _pi)) * _inv2 + +class SGDR(Learner): + # Stochastic Gradient Descent with Restarts + # paper: https://arxiv.org/abs/1608.03983 + # NOTE: this is missing a couple of the proposed features. 
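+    # roughly, the schedule this implements works out to
+    #   rate = base_rate * restart_decay**n * (1 + cos(pi * x)) / 2
+    # where n is the number of completed restarts and x is the fraction
+    # of the current restart period that has elapsed.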
+ + per_batch = True + + def __init__(self, optim, epochs=100, rate=None, + restarts=0, restart_decay=0.5, callback=None, + expando=0): + self.restart_epochs = int(epochs) + self.decay = _f(restart_decay) + self.restarts = int(restarts) + self.restart_callback = callback + + # TODO: rename expando to something not insane + self.expando = expando if expando is not None else lambda i: i + if type(self.expando) == int: + inc = self.expando + self.expando = lambda i: i * inc + + self.splits = [] + epochs = 0 + for i in range(0, self.restarts + 1): + split = epochs + self.restart_epochs + int(self.expando(i)) + self.splits.append(split) + epochs = split + super().__init__(optim, epochs, rate) + + def split_num(self, epoch): + previous = [0] + self.splits + for i, split in enumerate(self.splits): + if epoch - 1 < split: + sub_epoch = epoch - previous[i] + next_restart = split - previous[i] + return i, sub_epoch, next_restart + raise Exception('this should never happen.') + + def rate_at(self, epoch): + base_rate = self.start_rate if self.start_rate is not None else self.optim.lr + restart, sub_epoch, next_restart = self.split_num(max(1, epoch)) + x = _f(sub_epoch - 1) / _f(next_restart) + return base_rate * self.decay**_f(restart) * cosmod(x) + + def next(self): + if not super().next(): + return False + restart, sub_epoch, next_restart = self.split_num(self.epoch) + if restart > 0 and sub_epoch == 1: + if self.restart_callback is not None: + self.restart_callback(restart) + return True + +class TriangularCLR(Learner): + per_batch = True + + def __init__(self, optim, epochs=400, upper_rate=None, lower_rate=0, + frequency=100, callback=None): + # NOTE: start_rate is treated as upper_rate + self.frequency = int(frequency) + assert self.frequency > 0 + self.callback = callback + self.lower_rate = _f(lower_rate) + super().__init__(optim, epochs, upper_rate) + + def _t(self, epoch): + # NOTE: this could probably be simplified + offset = self.frequency / 2 + return np.abs(((epoch - 1 + offset) % self.frequency) - offset) / offset + + def rate_at(self, epoch): + upper_rate = self.start_rate if self.start_rate is not None else self.optim.lr + return self._t(epoch) * (upper_rate - self.lower_rate) + self.lower_rate + + def next(self): + if not super().next(): + return False + e = self.epoch - 1 + if e > 0 and e % self.frequency == 0: + if self.callback is not None: + self.callback(self.epoch // self.frequency) + return True + +class SineCLR(TriangularCLR): + def _t(self, epoch): + return np.sin(_pi * _inv2 * super()._t(epoch)) + +class WaveCLR(TriangularCLR): + def _t(self, epoch): + return _inv2 * (_1 - np.cos(_pi * super()._t(epoch))) + +# more + +class PolyLearner(Learner): + per_batch = True + + def __init__(self, optim, epochs=400, coeffs=(1,)): + self.coeffs = tuple(coeffs) + super().__init__(optim, epochs, np.polyval(self.coeffs, 0)) + + def rate_at(self, epoch): + progress = (epoch - 1) / (self.epochs) + ret = np.polyval(self.coeffs, progress) + return np.abs(ret) + diff --git a/onn/loss.py b/onn/loss.py new file mode 100644 index 0000000..94f8cb1 --- /dev/null +++ b/onn/loss.py @@ -0,0 +1,129 @@ +import numpy as np + +from .floats import * + +class Loss: + def forward(self, p, y): + raise NotImplementedError("unimplemented", self) + + def backward(self, p, y): + raise NotImplementedError("unimplemented", self) + +class NLL(Loss): # Negative Log Likelihood + def forward(self, p, y): + correct = p * y + return np.mean(-correct) + + def backward(self, p, y): + return -y / len(p) + +class 
CategoricalCrossentropy(Loss): + # lifted from theano + + def __init__(self, eps=1e-6): + self.eps = _f(eps) + + def forward(self, p, y): + p = np.clip(p, self.eps, 1 - self.eps) + f = np.sum(-y * np.log(p) - (1 - y) * np.log(1 - p), axis=-1) + return np.mean(f) + + def backward(self, p, y): + p = np.clip(p, self.eps, 1 - self.eps) + df = (p - y) / (p * (1 - p)) + return df / len(y) + +class Accuracy(Loss): + # returns percentage of categories correctly predicted. + # utilizes argmax(), so it cannot be used for gradient descent. + # use CategoricalCrossentropy or NLL for that instead. + + def forward(self, p, y): + correct = np.argmax(p, axis=-1) == np.argmax(y, axis=-1) + return np.mean(correct) + + def backward(self, p, y): + raise NotImplementedError("cannot take the gradient of Accuracy") + +class ResidualLoss(Loss): + def forward(self, p, y): + return np.mean(self.f(p - y)) + + def backward(self, p, y): + ret = self.df(p - y) / len(y) + return ret + +class SquaredHalved(ResidualLoss): + def f(self, r): + return np.square(r) / 2 + + def df(self, r): + return r + +class Squared(ResidualLoss): + def f(self, r): + return np.square(r) + + def df(self, r): + return 2 * r + +class Absolute(ResidualLoss): + def f(self, r): + return np.abs(r) + + def df(self, r): + return np.sign(r) + +class Huber(ResidualLoss): + def __init__(self, delta=1.0): + self.delta = _f(delta) + + def f(self, r): + return np.where(r <= self.delta, + np.square(r) / 2, + self.delta * (np.abs(r) - self.delta / 2)) + + def df(self, r): + return np.where(r <= self.delta, + r, + self.delta * np.sign(r)) + +# more + +class SomethingElse(ResidualLoss): + # generalizes Absolute and SquaredHalved. + # plot: https://www.desmos.com/calculator/fagjg9vuz7 + def __init__(self, a=4/3): + assert 1 <= a <= 2, "parameter out of range" + self.a = _f(a / 2) + self.b = _f(2 / a) + self.c = _f(2 / a - 1) + + def f(self, r): + return self.a * np.abs(r)**self.b + + def df(self, r): + return np.sign(r) * np.abs(r)**self.c + +class Confidence(Loss): + # this isn't "confidence" in any meaningful way; (e.g. Bayesian) + # it's just a metric of how large the value is of the predicted class. + # when using it for loss, it acts like a crappy regularizer. + # it really just measures how much of a hot-shot the network thinks it is. + + def forward(self, p, y=None): + categories = p.shape[-1] + confidence = (np.max(p, axis=-1) - 1/categories) / (1 - 1/categories) + # the exponent in softmax puts a maximum on confidence, + # but we don't compensate for that. if necessary, + # it'd be better to use an activation that doesn't have this limit. + return np.mean(confidence) + + def backward(self, p, y=None): + # in order to agree with the forward pass, + # using this backwards pass as-is will minimize confidence. + categories = p.shape[-1] + detc = p / categories / (1 - 1/categories) + dmax = p == np.max(p, axis=-1, keepdims=True) + return detc * dmax + diff --git a/onn/math.py b/onn/math.py new file mode 100644 index 0000000..794dfe6 --- /dev/null +++ b/onn/math.py @@ -0,0 +1,14 @@ +import numpy as np + +def rolling(a, window): + # http://stackoverflow.com/a/4924433 + shape = (a.size - window + 1, window) + strides = (a.itemsize, a.itemsize) + return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) + +def rolling_batch(a, window): + # same as rolling, but acts on each batch (axis 0). 
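+    # e.g. for a float32 input of shape (1, 5) and window=3, the result
+    # has shape (1, 3, 3), and row i of the output is input[0, i:i+3];
+    # no data is copied, since this is just a strided view.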
+ shape = (a.shape[0], a.shape[-1] - window + 1, window) + strides = (np.prod(a.shape[1:]) * a.itemsize, a.itemsize, a.itemsize) + return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) + diff --git a/onn/model.py b/onn/model.py new file mode 100644 index 0000000..bc5f68c --- /dev/null +++ b/onn/model.py @@ -0,0 +1,197 @@ +import sys + +from .floats import * +from .nodes import * +from .layer_base import * + +class Model: + def __init__(self, nodes_in, nodes_out, loss=None, mloss=None, unsafe=False): + self.loss = loss if loss is not None else SquaredHalved() + self.mloss = mloss if mloss is not None else loss + + nodes_in = [nodes_in] if isinstance(nodes_in, Layer) else nodes_in + nodes_out = [nodes_out] if isinstance(nodes_out, Layer) else nodes_out + assert type(nodes_in) == list, type(nodes_in) + assert type(nodes_out) == list, type(nodes_out) + self.nodes_in = nodes_in + self.nodes_out = nodes_out + + self.nodes = traverse_all(self.nodes_in, self.nodes_out) + self.make_weights() + for node in self.nodes: + node.unsafe = unsafe + # TODO: handle the same layer being in more than one node. + + @property + def ordered_nodes(self): + # deprecated? we don't guarantee an order like we did before. + return self.nodes + + def make_weights(self): + self.param_count = sum((node.size for node in self.nodes if not node.shared)) + self.W = np.zeros(self.param_count, dtype=_f) + self.dW = np.zeros(self.param_count, dtype=_f) + + offset = 0 + for node in self.nodes: + if node.size > 0 and not node.shared: + inner_offset = 0 + + def allocate(size): + nonlocal inner_offset + o = offset + inner_offset + ret = self.W[o:o+size], self.dW[o:o+size] + inner_offset += size + assert len(ret[0]) == len(ret[1]) + assert size == len(ret[0]), (size, len(ret[0])) + return ret + + node.init(allocate) + assert inner_offset <= node.size, "Layer {} allocated more weights than it said it would".format(node) + # i don't care if "less" is grammatically incorrect. + # you're mom is grammatically incorrect. + assert inner_offset >= node.size, "Layer {} allocated less weights than it said it would".format(node) + offset += node.size + + def evaluate(self, input_, deterministic=True): + assert len(self.nodes_in) == 1, "ambiguous input in multi-input network; use evaluate_multi() instead" + assert len(self.nodes_out) == 1, "ambiguous output in multi-output network; use evaluate_multi() instead" + node_in = self.nodes_in[0] + node_out = self.nodes_out[0] + outputs = self.evaluate_multi({node_in: input_}, deterministic) + return outputs[node_out] + + def apply(self, error): # TODO: better name? 
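+        # performs a full backward pass: the given loss gradient is
+        # backpropagated through every node, weight gradients accumulate
+        # into self.dW as a side-effect, and the gradient with respect
+        # to the network's input is returned.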
+ assert len(self.nodes_in) == 1, "ambiguous input in multi-input network; use apply_multi() instead" + assert len(self.nodes_out) == 1, "ambiguous output in multi-output network; use apply_multi() instead" + node_in = self.nodes_in[0] + node_out = self.nodes_out[0] + inputs = self.apply_multi({node_out: error}) + return inputs[node_in] + + def evaluate_multi(self, inputs, deterministic=True): + values = dict() + outputs = dict() + for node in self.nodes: + if node in self.nodes_in: + assert node in inputs, "missing input for node {}".format(node.name) + X = inputs[node] + values[node] = node._propagate(np.expand_dims(X, 0), deterministic) + else: + values[node] = node.propagate(values, deterministic) + if node in self.nodes_out: + outputs[node] = values[node] + return outputs + + def apply_multi(self, outputs): + values = dict() + inputs = dict() + for node in reversed(self.nodes): + if node in self.nodes_out: + assert node in outputs, "missing output for node {}".format(node.name) + X = outputs[node] + values[node] = node._backpropagate(np.expand_dims(X, 0)) + else: + values[node] = node.backpropagate(values) + if node in self.nodes_in: + inputs[node] = values[node] + return inputs + + def forward(self, inputs, outputs, measure=False, deterministic=False): + predicted = self.evaluate(inputs, deterministic=deterministic) + if measure: + error = self.mloss.forward(predicted, outputs) + else: + error = self.loss.forward(predicted, outputs) + return error, predicted + + def backward(self, predicted, outputs, measure=False): + if measure: + error = self.mloss.backward(predicted, outputs) + else: + error = self.loss.backward(predicted, outputs) + # input_delta is rarely useful; it's just to match the forward pass. + input_delta = self.apply(error) + return self.dW, input_delta + + def clear_grad(self): + for node in self.nodes: + node.clear_grad() + + def regulate_forward(self): + loss = _0 + for node in self.nodes: + if node.loss is not None: + loss += node.loss + for k, w in node.weights.items(): + loss += w.forward() + return loss + + def regulate(self): + for node in self.nodes: + for k, w in node.weights.items(): + w.update() + + def load_weights(self, fn): + # seemingly compatible with keras' Dense layers. + import h5py + open(fn) # just ensure the file exists (python's error is better) + f = h5py.File(fn, 'r') + weights = {} + def visitor(name, obj): + if isinstance(obj, h5py.Dataset): + weights[name.split('/')[-1]] = np.array(obj[:], dtype=_f) + f.visititems(visitor) + f.close() + + used = {} + for k in weights.keys(): + used[k] = False + + nodes = [node for node in self.nodes if node.size > 0] + # TODO: support shared weights. + for node in nodes: + full_name = str(node).lower() + for s_name, o_name in node.serialized.items(): + key = full_name + '_' + s_name + data = weights[key] + target = getattr(node, o_name) + target.f[:] = data + used[key] = True + + for k, v in used.items(): + if not v: + lament("WARNING: unused weight", k) + + def save_weights(self, fn, overwrite=False): + import h5py + from collections import defaultdict + + f = h5py.File(fn, 'w') + + counts = defaultdict(lambda: 0) + + nodes = [node for node in self.nodes if node.size > 0] + # TODO: support shared weights. 
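+        # datasets are matched by name: keys look like
+        # '<node name lowercased>_<serialized name>', e.g. 'dense_1_W'
+        # for the coefficients of a layer named Dense_1.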
+ for node in nodes: + full_name = str(node).lower() + grp = f.create_group(full_name) + for s_name, o_name in node.serialized.items(): + key = full_name + '_' + s_name + target = getattr(node, o_name) + data = grp.create_dataset(key, target.shape, dtype=_f) + data[:] = target.f + counts[key] += 1 + if counts[key] > 1: + lament("WARNING: rewrote weight", key) + + f.close() + + def print_graph(self, file=sys.stdout): + print('digraph G {', file=file) + for node in self.nodes: + children = [str(n) for n in node.children] + if children: + sep = '->' + print('\t' + str(node) + sep + (';\n\t' + str(node) + sep).join(children) + ';', file=file) + print('}', file=file) diff --git a/onn/nodes.py b/onn/nodes.py new file mode 100644 index 0000000..081b045 --- /dev/null +++ b/onn/nodes.py @@ -0,0 +1,43 @@ +class DummyNode: + name = "Dummy" + + def __init__(self, children=None, parents=None): + self.children = children if children is not None else [] + self.parents = parents if parents is not None else [] + +def traverse(node_in, node_out, nodes=None, dummy_mode=False): + # i have no idea if this is any algorithm in particular. + nodes = nodes if nodes is not None else [] + + seen_up = {} + q = [node_out] + while len(q) > 0: + node = q.pop(0) + seen_up[node] = True + for parent in node.parents: + q.append(parent) + + if dummy_mode: + seen_up[node_in] = True + + nodes = [] + q = [node_in] + while len(q) > 0: + node = q.pop(0) + if not seen_up[node]: + continue + parents_added = (parent in nodes for parent in node.parents) + if not node in nodes and all(parents_added): + nodes.append(node) + for child in node.children: + q.append(child) + + if dummy_mode: + nodes.remove(node_in) + + return nodes + +def traverse_all(nodes_in, nodes_out, nodes=None): + all_in = DummyNode(children=nodes_in) + all_out = DummyNode(parents=nodes_out) + return traverse(all_in, all_out, nodes, dummy_mode=True) diff --git a/onn/optimizer.py b/onn/optimizer.py new file mode 100644 index 0000000..5a0e91c --- /dev/null +++ b/onn/optimizer.py @@ -0,0 +1,531 @@ +import numpy as np + +from .floats import * +from .optimizer_base import * + +# some of the the following optimizers are blatantly lifted from tiny-dnn: +# https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/optimizers/optimizer.h + +class Momentum(Optimizer): + def __init__(self, lr=0.01, mu=0.9, nesterov=False): + self.mu = _f(mu) # momentum + self.nesterov = bool(nesterov) + + super().__init__(lr) + + def reset(self): + self.Vprev = None + + def compute(self, dW, W): + if self.Vprev is None: + self.Vprev = np.copy(dW) + + V = self.mu * self.Vprev - self.lr * dW + self.Vprev[:] = V + if self.nesterov: + return self.mu * V - self.lr * dW + + return V + +class Adagrad(Optimizer): + def __init__(self, lr=0.01, eps=1e-8): + self.eps = _f(eps) + + super().__init__(lr) + + def reset(self): + self.g = None + + def compute(self, dW, W): + if self.g is None: + self.g = np.zeros_like(dW) + + self.g += np.square(dW) + return -self.lr * dW / (np.sqrt(self.g) + self.eps) + +class RMSprop(Optimizer): + # RMSprop generalizes* Adagrad, etc. + + # * RMSprop == Adagrad when + # RMSprop.mu == 1 + + def __init__(self, lr=1e-4, mu=0.99, eps=1e-8): + self.mu = _f(mu) # decay term + self.eps = _f(eps) + + # one might consider the following equation when specifying mu: + # mu = e**(-1/t) + # default: t = -1/ln(0.99) = ~99.5 + # therefore the default of mu=0.99 means + # an input decays to 1/e its original amplitude over 99.5 batches. 
+ # (this is from DSP, so how relevant it is in SGD is debatable) + + super().__init__(lr) + + def reset(self): + self.g = None + + def compute(self, dW, W): + if self.g is None: + self.g = np.zeros_like(dW) + + # basically apply a first-order low-pass filter to delta squared + self.g += (1 - self.mu) * (np.square(dW) - self.g) + + # finally sqrt it to complete the running root-mean-square approximation + return -self.lr * dW / (np.sqrt(self.g) + self.eps) + +class RMSpropCentered(Optimizer): + # referenced TensorFlow/PyTorch. + # paper: https://arxiv.org/pdf/1308.0850v5.pdf + + def __init__(self, lr=1e-4, aleph=0.95, momentum=0.9, eps=1e-8): + self.aleph = _f(aleph) + self.momentum = _f(momentum) + self.eps = _f(eps) + + super().__init__(lr) + + def reset(self): + self.g = None + self.mt = None + self.vt = None + self.delta = None + + def compute(self, dW, W): + if self.g is None: + self.g = np.zeros_like(dW) + if self.mt is None: + self.mt = np.zeros_like(dW) + if self.vt is None: + self.vt = np.zeros_like(dW) + if self.delta is None: + self.delta = np.zeros_like(dW) + + self.mt += (1 - self.aleph) * (dW - self.mt) + self.vt += (1 - self.aleph) * (np.square(dW) - self.vt) + + # PyTorch has the epsilon outside of the sqrt, + # TensorFlow and the paper have it within. + # in onn, we generally do it outside, as this seems to work better. + temp = dW / (np.sqrt(self.vt - np.square(self.mt)) + self.eps) + + # TensorFlow does it this way. + self.delta[:] = self.momentum * self.delta + self.lr * temp + return -self.delta + # PyTorch does it this way. + #self.delta[:] = self.momentum * self.delta + temp + #return -self.lr * self.delta + # they are equivalent only when LR is constant, which it might not be. + +class Adam(Optimizer): + # paper: https://arxiv.org/abs/1412.6980 + # Adam generalizes* RMSprop, and + # adds a decay term to the regular (non-squared) delta, and performs + # debiasing to compensate for the filtered deltas starting from zero. + + # * Adam == RMSprop when + # Adam.b1 == 0 + # Adam.b2 == RMSprop.mu + + def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8): + self.b1 = _f(b1) # decay term + self.b2 = _f(b2) # decay term + self.b1_t_default = _f(b1) # decay term power t + self.b2_t_default = _f(b2) # decay term power t + self.eps = _f(eps) + + super().__init__(lr) + + def reset(self): + self.mt = None + self.vt = None + self.b1_t = self.b1_t_default + self.b2_t = self.b2_t_default + + def compute(self, dW, W): + if self.mt is None: + self.mt = np.zeros_like(dW) + if self.vt is None: + self.vt = np.zeros_like(dW) + + # decay gain + self.b1_t *= self.b1 + self.b2_t *= self.b2 + + # filter + self.mt += (1 - self.b1) * (dW - self.mt) + self.vt += (1 - self.b2) * (np.square(dW) - self.vt) + + return -self.lr * (self.mt / (1 - self.b1_t)) \ + / (np.sqrt(self.vt / (1 - self.b2_t)) + self.eps) + +class Nadam(Optimizer): + # paper: https://arxiv.org/abs/1412.6980 + # paper: http://cs229.stanford.edu/proj2015/054_report.pdf + # TODO: double-check this implementation. also read the damn paper. 
+ # lifted from https://github.com/fchollet/keras/blob/5d38b04/keras/optimizers.py#L530 + # lifted from https://github.com/jpilaul/IFT6266_project/blob/master/Models/Algo_Momentum.py + + def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8): + self.b1 = _f(b1) # decay term + self.b2 = _f(b2) # decay term + self.eps = _f(eps) + + super().__init__(lr) + + def reset(self): + self.mt = None + self.vt = None + self.t = 0 + self.sched = 1 + + def compute(self, dW, W): + self.t += 1 + + if self.mt is None: + self.mt = np.zeros_like(dW) + if self.vt is None: + self.vt = np.zeros_like(dW) + + ut0 = self.b1 * (1 - 0.5 * 0.96**(self.t + 0)) + ut1 = self.b1 * (1 - 0.5 * 0.96**(self.t + 1)) + + sched0 = self.sched * ut0 + sched1 = self.sched * ut0 * ut1 + self.sched = sched0 + + gp = dW / (1 - sched0) + + self.mt += (1 - self.b1) * (dW - self.mt) + self.vt += (1 - self.b2) * (np.square(dW) - self.vt) + + mtp = self.mt / (1 - sched1) + vtp = self.vt / (1 - self.b2**self.t) + + mt_bar = (1 - ut0) * gp + ut1 * mtp + + return -self.lr * mt_bar / (np.sqrt(vtp) + self.eps) + +# more + +class FTML(Optimizer): + # paper: http://www.cse.ust.hk/~szhengac/papers/icml17.pdf + # author's implementation: https://github.com/szhengac/optim/commit/923555e + + def __init__(self, lr=0.0025, b1=0.6, b2=0.999, eps=1e-8): + self.iterations = _0 + self.b1 = _f(b1) # decay term + self.b2 = _f(b2) # decay term + self.eps = _f(eps) + + super().__init__(lr) + + def reset(self): + self.dt1 = None + self.dt = None + self.vt = None + self.zt = None + self.b1_t = _1 + self.b2_t = _1 + + def compute(self, dW, W): + if self.dt1 is None: self.dt1 = np.zeros_like(dW) + if self.dt is None: self.dt = np.zeros_like(dW) + if self.vt is None: self.vt = np.zeros_like(dW) + if self.zt is None: self.zt = np.zeros_like(dW) + + # NOTE: we could probably rewrite these equations to avoid this copy. + self.dt1[:] = self.dt[:] + + self.b1_t *= self.b1 + self.b2_t *= self.b2 + + # hardly an elegant solution. + lr = max(self.lr, self.eps) + + # same as Adam's vt. + self.vt[:] = self.b2 * self.vt + (1 - self.b2) * dW * dW + + # you can factor "inner" out of Adam as well. + inner = np.sqrt(self.vt / (1 - self.b2_t)) + self.eps + self.dt[:] = (1 - self.b1_t) / lr * inner + + sigma_t = self.dt - self.b1 * self.dt1 + + # Adam's mt minus the sigma term. + self.zt[:] = self.b1 * self.zt + (1 - self.b1) * dW - sigma_t * W + + # subtract by weights to avoid having to override self.update. 
+ return -self.zt / self.dt - W + +class MomentumClip(Optimizer): + def __init__(self, lr=0.01, mu=0.9, nesterov=False, clip=1.0, debug=False): + self.mu = _f(mu) + self.clip = _f(clip) + self.nesterov = bool(nesterov) + self.debug = bool(debug) + + super().__init__(lr) + + def reset(self): + self.accum = None + + def compute(self, dW, W): + if self.accum is None: + self.accum = np.zeros_like(dW) + + total_norm = np.linalg.norm(dW) + clip_scale = self.clip / (total_norm + 1e-6) + if clip_scale < 1: + if self.debug: + lament("clipping gradients; norm: {:10.5f}".format(total_norm)) + dW *= clip_scale + + self.accum[:] = self.accum * self.mu + dW + if self.nesterov: + return -self.lr * (self.accum * self.mu + dW) + else: + return -self.lr * self.accum + +class YellowFin(Optimizer): + # paper: https://arxiv.org/abs/1706.03471 + # knowyourmeme: http://cs.stanford.edu/~zjian/project/YellowFin/ + # author's implementation: https://github.com/JianGoForIt/YellowFin/blob/master/tuner_utils/yellowfin.py + # code lifted: https://gist.github.com/botev/f8b32c00eafee222e47393f7f0747666 + + def __init__(self, lr=0.1, mu=0.0, beta=0.999, window_size=20, + debias=True, clip=1.0): + self.lr_default = _f(lr) + self.mu_default = _f(mu) + self.beta = _f(beta) + self.window_size = int(window_size) # curv_win_width + self.debias_enabled = bool(debias) + self.clip = _f(clip) + + self.mu = _f(mu) # momentum + super().__init__(lr) + + def reset(self): + self.accum = None + + self.lr = self.lr_default + self.mu = self.mu_default + + self.step = 0 + self.beta_t = self.beta + + self.curv_win = np.zeros([self.window_size,], dtype=np.float32) + + self.h_min = None + self.h_max = None + + self.g_lpf = 0 + #self.g_squared_lpf = 0 + self.g_norm_squared_lpf = 0 + self.g_norm_lpf = 0 + self.h_min_lpf = 0 + self.h_max_lpf = 0 + self.dist_lpf = 0 + self.lr_lpf = 0 + self.mu_lpf = 0 + + def get_lr_mu(self): + p = (np.square(self.dist_avg) * np.square(self.h_min)) / (2 * self.g_var) + w3 = p * (np.sqrt(0.25 + p / 27.0) - 0.5) + w = np.power(w3, 1/3) + y = w - p / (3 * w) + sqrt_mu1 = y + 1 + + sqrt_h_min = np.sqrt(self.h_min) + sqrt_h_max = np.sqrt(self.h_max) + sqrt_mu2 = (sqrt_h_max - sqrt_h_min) / (sqrt_h_max + sqrt_h_min) + + sqrt_mu = max(sqrt_mu1, sqrt_mu2) + if sqrt_mu2 > sqrt_mu1: + print('note: taking dr calculation. something may have exploded.') + + lr = np.square(1 - sqrt_mu) / self.h_min + mu = np.square(sqrt_mu) + return lr, mu + + def compute(self, dW, W): + if self.accum is None: + self.accum = np.zeros_like(dW) + + # TODO: prevent allocations everywhere by using [:]. + # assuming that really works. i haven't actually checked. 
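+        # the overall flow per step: clip the gradient, update running
+        # (debiased) averages of its norm and curvature, derive lr and mu
+        # from those averages, then take a classical momentum step.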
+ + total_norm = np.linalg.norm(dW) + clip_scale = self.clip / (total_norm + 1e-6) + if clip_scale < 1: + #print("clipping gradients; norm: {:10.5f}".format(total_norm)) + dW *= clip_scale + + #fmt = 'W std: {:10.7f}e-3, dWstd: {:10.7f}e-3, V std: {:10.7f}e-3' + #print(fmt.format(np.std(W), np.std(dW) * 100, np.std(V) * 100)) + + b = self.beta + m1b = 1 - self.beta + debias = 1 / (1 - self.beta_t) if self.debias_enabled else 1 + + g = dW + g_squared = np.square(g) + g_norm_squared = np.sum(g_squared) + g_norm = np.sqrt(g_norm_squared) + + self.curv_win[self.step % self.window_size] = g_norm_squared + valid_window = self.curv_win[:min(self.window_size, self.step + 1)] + h_min_t = np.min(valid_window) + h_max_t = np.max(valid_window) + + self.g_lpf = b * self.g_lpf + m1b * g + #self.g_squared_lpf = b * self.g_squared_lpf + m1b * g_squared + self.g_norm_squared_lpf = b * self.g_norm_squared_lpf + m1b * g_norm_squared + self.g_norm_lpf = b * self.g_norm_lpf + m1b * g_norm + self.h_min_lpf = b * self.h_min_lpf + m1b * h_min_t + self.h_max_lpf = b * self.h_max_lpf + m1b * h_max_t + + g_avg = debias * self.g_lpf + #g_squared_avg = debias * self.g_squared_lpf + g_norm_squared_avg = debias * self.g_norm_squared_lpf + g_norm_avg = debias * self.g_norm_lpf + self.h_min = debias * self.h_min_lpf + self.h_max = debias * self.h_max_lpf + assert self.h_max >= self.h_min + + dist = g_norm_avg / g_norm_squared_avg + + self.dist_lpf = b * self.dist_lpf + m1b * dist + + self.dist_avg = debias * self.dist_lpf + + self.g_var = g_norm_squared_avg - np.sum(np.square(g_avg)) + # equivalently: + #self.g_var = np.sum(np.abs(g_squared_avg - np.square(g_avg))) + + if self.step > 0: + lr_for_real, mu_for_real = self.get_lr_mu() + self.mu_lpf = b * self.mu_lpf + m1b * mu_for_real + self.lr_lpf = b * self.lr_lpf + m1b * lr_for_real + self.mu = debias * self.mu_lpf + self.lr = debias * self.lr_lpf + + self.accum[:] = self.accum * self.mu - self.lr * dW + V = self.accum + + self.step += 1 + self.beta_t *= self.beta + return V + +class AddSign(Optimizer): + # paper: https://arxiv.org/abs/1709.07417 + + def __init__(self, lr=0.01, mu=0.9, alpha=1): + self.mu = _f(mu) + self.alpha = _f(alpha) + + super().__init__(lr) + + def reset(self): + self.accum = None + + def compute(self, dW, W): + if self.accum is None: + self.accum = np.zeros_like(dW) + + self.accum[:] = self.accum * self.mu + dW + + signed = np.sign(dW) * np.sign(self.accum) + #signed *= decay + + return -self.lr * dW * (self.alpha + signed) + +class PowerSign(Optimizer): + # paper: https://arxiv.org/abs/1709.07417 + + def __init__(self, lr=0.01, mu=0.9, alpha=np.e): + self.mu = _f(mu) + self.alpha = _f(alpha) + self.use_exp = np.isclose(self.alpha, _f(np.e)) + + super().__init__(lr) + + def reset(self): + self.accum = None + + def compute(self, dW, W): + if self.accum is None: + self.accum = np.zeros_like(dW) + + self.accum[:] = self.accum * self.mu + dW + + signed = np.sign(dW) * np.sign(self.accum) + #signed *= decay + + if self.use_exp: + return -self.lr * dW * np.exp(signed) + else: + return -self.lr * dW * np.power(self.alpha, signed) + +class Neumann(Optimizer): + # paper: https://arxiv.org/abs/1712.03298 + # NOTE: this implementation is missing resetting as described in the paper. + # resetting is totally disabled for now. + # NOTE: this implementation does not use vanilla SGD for its first epochs. + # you should do this yourself if you need it. + # it seems like using a Learner like SineCLR makes this unnecessary. 
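+    # NOTE: unlike the other optimizers here, this one applies its update
+    # to W in-place via update() and does not implement compute(), since
+    # it also tracks a smoothed copy of the weights between steps.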
+ + def __init__(self, lr=0.01): + self.alpha = _f(1e-7) # cubic. + self.beta = _f(1e-5) # repulsive. NOTE: multiplied by len(dW) later. + self.gamma = _f(0.99) # EMA, or 1-pole low-pass parameter. same thing. + # momentum is ∝ (in the shape of) 1 - 1/(1 + t) + self.mu_min = _f(0.5) + self.mu_max = _f(0.9) + self.reset_period = 0 # TODO + + super().__init__(lr) + + def reset(self): + # NOTE: mt and vt are different than the pair in Adam-like optimizers. + self.mt = None # momentum accumulator. + self.vt = None # weight accumulator. + self.t = 0 + + def compute(self, dW, W): + raise Exception("compute() is not available for this Optimizer.") + + def update(self, dW, W): + self.t += 1 + + if self.mt is None: + self.mt = np.zeros_like(dW) + if self.vt is None: + self.vt = np.zeros_like(dW) + + if self.reset_period > 0 and (self.t - 1) % self.reset_period == 0: + self.mt = -self.lr * dW + return + + # momentum quantity: + mu = _1 - _1/_f(self.t) # the + 1 is implicit. + mu = (mu + self.mu_min) * (self.mu_max - self.mu_min) + + # smoothed change in weights: + delta = W - self.vt + delta_norm_squared = np.square(delta).sum() + delta_norm = np.sqrt(delta_norm_squared) + + # regularization terms: (push and pull) + cubic_reg = self.alpha * delta_norm_squared + repulsive_reg = self.beta * dW.size / delta_norm_squared + dt = dW + (cubic_reg - repulsive_reg) * (delta / delta_norm) + + # plain momentum: + self.mt = mu * self.mt - self.lr * dt + + # weights and accumulator: + W += mu * self.mt - self.lr * dt + self.vt = W + self.gamma * (self.vt - W) + diff --git a/onn/optimizer_base.py b/onn/optimizer_base.py new file mode 100644 index 0000000..d7251c0 --- /dev/null +++ b/onn/optimizer_base.py @@ -0,0 +1,19 @@ +import numpy as np + +from .floats import * + +class Optimizer: + def __init__(self, lr=0.1): + self.lr = _f(lr) # learning rate + self.reset() + + def reset(self): + pass + + def compute(self, dW, W): + return -self.lr * dW + + def update(self, dW, W): + W += self.compute(dW, W) + + diff --git a/onn/parametric.py b/onn/parametric.py new file mode 100644 index 0000000..b1e6560 --- /dev/null +++ b/onn/parametric.py @@ -0,0 +1,254 @@ +import numpy as np + +from .floats import * +from .layer_base import * +from .initialization import * + +class Bias(Layer): + # TODO: support axes other than -1 and shapes other than 1D. 
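+    # adds a learned offset along the last axis; the gradient is just
+    # the incoming delta summed over the batch.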
+ + serialized = { + 'b': 'biases', + } + + def __init__(self, init=init_zeros, reg_b=None): + super().__init__() + self.biases = self._new_weights('biases', init=init, regularizer=reg_b) + + def make_shape(self, parent): + shape = parent.output_shape + self.input_shape = shape + self.output_shape = shape + self.biases.shape = (shape[-1],) + + def forward(self, X): + return X + self.biases.f + + def backward(self, dY): + self.biases.g += dY.sum(0) + return dY + +class Dense(Layer): + serialized = { + 'W': 'coeffs', + 'b': 'biases', + } + + def __init__(self, dim, init=init_he_uniform, reg_w=None, reg_b=None): + super().__init__() + self.dim = int(dim) + self.output_shape = (dim,) + self.coeffs = self._new_weights('coeffs', init=init, regularizer=reg_w) + self.biases = self._new_weights('biases', init=init_zeros, regularizer=reg_b) + + def make_shape(self, parent): + shape = parent.output_shape + self.input_shape = shape + assert len(shape) == 1, shape + self.coeffs.shape = (shape[0], self.dim) + self.biases.shape = (1, self.dim) + + def forward(self, X): + self.X = X + return X @ self.coeffs.f + self.biases.f + + def backward(self, dY): + self.coeffs.g += self.X.T @ dY + self.biases.g += dY.sum(0, keepdims=True) + return dY @ self.coeffs.f.T + + +# more + +class Conv1Dper(Layer): + # periodic (circular) convolution. + # currently only supports one channel I/O. + # some notes: + # we could use FFTs for larger convolutions. + # i think storing the coefficients backwards would + # eliminate reversal in the critical code. + + serialize = { + 'W': 'coeffs', + } + + def __init__(self, kernel_size, pos=None, + init=init_glorot_uniform, reg_w=None): + super().__init__() + self.kernel_size = int(kernel_size) + self.coeffs = self._new_weights('coeffs', init=init, regularizer=reg_w) + if pos is None: + self.wrap0 = (self.kernel_size - 0) // 2 + self.wrap1 = (self.kernel_size - 1) // 2 + elif pos == 'alt': + self.wrap0 = (self.kernel_size - 1) // 2 + self.wrap1 = (self.kernel_size - 0) // 2 + elif pos == 'left': + self.wrap0 = 0 + self.wrap1 = self.kernel_size - 1 + elif pos == 'right': + self.wrap0 = self.kernel_size - 1 + self.wrap1 = 0 + else: + raise Exception("pos parameter not understood: {}".format(pos)) + + def make_shape(self, parent): + shape = parent.output_shape + self.input_shape = shape + assert len(shape) == 1, shape + self.output_shape = shape + self.coeffs.shape = (1, self.kernel_size) + + def forward(self, X): + if self.wrap0 == 0: + Xper = np.hstack((X,X[:,:self.wrap1])) + elif self.wrap1 == 0: + Xper = np.hstack((X[:,-self.wrap0:],X)) + else: + Xper = np.hstack((X[:,-self.wrap0:],X,X[:,:self.wrap1])) + self.cols = rolling_batch(Xper, self.kernel_size) + convolved = (self.cols * self.coeffs.f[:,::-1]).sum(2) + return convolved + + def backward(self, dY): + self.coeffs.g += (dY[:,:,None] * self.cols).sum(0)[:,::-1].sum(0, keepdims=True) + return (dY[:,:,None] * self.coeffs.f[:,::-1]).sum(2) + +class LayerNorm(Layer): + # paper: https://arxiv.org/abs/1607.06450 + # note: nonparametric when affine == False + + def __init__(self, eps=1e-5, affine=True): + super().__init__() + self.eps = _f(eps) + self.affine = bool(affine) + + if self.affine: + self.gamma = self._new_weights('gamma', init=init_ones) + self.beta = self._new_weights('beta', init=init_zeros) + self.serialized = { + 'gamma': 'gamma', + 'beta': 'beta', + } + + def make_shape(self, parent): + shape = parent.output_shape + self.input_shape = shape + self.output_shape = shape + assert len(shape) == 1, shape + if self.affine: + 
self.gamma.shape = (shape[0],) + self.beta.shape = (shape[0],) + + def forward(self, X): + self.mean = X.mean(0) + self.center = X - self.mean + self.var = self.center.var(0) + self.eps + self.std = np.sqrt(self.var) + + self.Xnorm = self.center / self.std + if self.affine: + return self.gamma.f * self.Xnorm + self.beta.f + return self.Xnorm + + def backward(self, dY): + length = dY.shape[0] + + if self.affine: + dXnorm = dY * self.gamma.f + self.gamma.g += (dY * self.Xnorm).sum(0) + self.beta.g += dY.sum(0) + else: + dXnorm = dY + + dstd = (dXnorm * self.center).sum(0) / -self.var + dcenter = dXnorm / self.std + dstd / self.std * self.center / length + dmean = -dcenter.sum(0) + dX = dcenter + dmean / length + + return dX + +class Denses(Layer): # TODO: rename? + # acts as a separate Dense for each row or column. only for 2D arrays. + + serialized = { + 'W': 'coeffs', + 'b': 'biases', + } + + def __init__(self, dim, init=init_he_uniform, reg_w=None, reg_b=None, axis=-1): + super().__init__() + self.dim = int(dim) + self.weight_init = init + self.axis = int(axis) + self.coeffs = self._new_weights('coeffs', init=init, regularizer=reg_w) + self.biases = self._new_weights('biases', init=init_zeros, regularizer=reg_b) + + def make_shape(self, parent): + shape = parent.output_shape + self.input_shape = shape + assert len(shape) == 2, shape + + assert -len(shape) <= self.axis < len(shape) + self.axis = self.axis % len(shape) + + self.output_shape = list(shape) + self.output_shape[self.axis] = self.dim + self.output_shape = tuple(self.output_shape) + + in_rows = self.input_shape[0] + in_cols = self.input_shape[1] + out_rows = self.output_shape[0] + out_cols = self.output_shape[1] + + self.coeffs.shape = (in_rows, in_cols, self.dim) + self.biases.shape = (1, out_rows, out_cols) + + def forward(self, X): + self.X = X + if self.axis == 0: + return np.einsum('ixj,xjk->ikj', X, self.coeffs.f) + self.biases.f + elif self.axis == 1: + return np.einsum('ijx,jxk->ijk', X, self.coeffs.f) + self.biases.f + + def backward(self, dY): + self.biases.g += dY.sum(0, keepdims=True) + if self.axis == 0: + self.coeffs.g += np.einsum('ixj,ikj->xjk', self.X, dY) + return np.einsum('ikj,xjk->ixj', dY, self.coeffs.f) + elif self.axis == 1: + self.coeffs.g += np.einsum('ijx,ijk->jxk', self.X, dY) + return np.einsum('ijk,jxk->ijx', dY, self.coeffs.f) + +class CosineDense(Dense): + # paper: https://arxiv.org/abs/1702.05870 + # another implementation: https://github.com/farizrahman4u/keras-contrib/pull/36 + # the paper doesn't mention bias, + # so we treat bias as an additional weight with a constant input of 1. + # this is correct in Dense layers, so i hope it's correct here too. 
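+    # in effect, each output is the cosine similarity between the
+    # augmented input [x, 1] and the corresponding augmented weight
+    # column [w, b], with a small epsilon inside the norms for stability.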
+ + eps = 1e-4 + + def forward(self, X): + self.X = X + self.X_norm = np.sqrt(np.square(X).sum(-1, keepdims=True) \ + + 1 + self.eps) + self.W_norm = np.sqrt(np.square(self.coeffs.f).sum(0, keepdims=True) \ + + np.square(self.biases.f) + self.eps) + self.dot = X @ self.coeffs.f + self.biases.f + Y = self.dot / (self.X_norm * self.W_norm) + return Y + + def backward(self, dY): + ddot = dY / self.X_norm / self.W_norm + dX_norm = -(dY * self.dot / self.W_norm).sum(-1, keepdims=True) / self.X_norm**2 + dW_norm = -(dY * self.dot / self.X_norm).sum( 0, keepdims=True) / self.W_norm**2 + + self.coeffs.g += self.X.T @ ddot \ + + dW_norm / self.W_norm * self.coeffs.f + self.biases.g += ddot.sum(0, keepdims=True) \ + + dW_norm / self.W_norm * self.biases.f + dX = ddot @ self.coeffs.f.T + dX_norm / self.X_norm * self.X + + return dX + diff --git a/onn/regularizer.py b/onn/regularizer.py new file mode 100644 index 0000000..b8b1ac7 --- /dev/null +++ b/onn/regularizer.py @@ -0,0 +1,45 @@ +import numpy as np + +from .floats import * + +class Regularizer: + pass + +class L1L2(Regularizer): + def __init__(self, l1=0.0, l2=0.0): + self.l1 = _f(l1) + self.l2 = _f(l2) + + def forward(self, X): + f = _0 + if self.l1: + f += np.sum(self.l1 * np.abs(X)) + if self.l2: + f += np.sum(self.l2 * np.square(X)) + return f + + def backward(self, X): + df = np.zeros_like(X) + if self.l1: + df += self.l1 * np.sign(X) + if self.l2: + df += self.l2 * 2 * X + return df + +# more + +class SaturateRelu(Regularizer): + # paper: https://arxiv.org/abs/1703.09202 + # TODO: test this (and ActivityRegularizer) more thoroughly. + # i've looked at the histogram of the resulting weights. + # it seems like only the layers after this are affected + # the way they should be. + + def __init__(self, lamb=0.0): + self.lamb = _f(lamb) + + def forward(self, X): + return self.lamb * np.where(X >= 0, X, 0) + + def backward(self, X): + return self.lamb * np.where(X >= 0, 1, 0) diff --git a/onn/ritual.py b/onn/ritual.py new file mode 100644 index 0000000..7499e6b --- /dev/null +++ b/onn/ritual.py @@ -0,0 +1,90 @@ +import numpy as np + +from .floats import * +from .initialization import * +from .ritual_base import * + +def stochastic_multiply(W, gamma=0.5, allow_negation=False): + # paper: https://arxiv.org/abs/1606.01981 + + assert W.ndim == 1, W.ndim + assert 0 < gamma < 1, gamma + size = len(W) + alpha = np.max(np.abs(W)) + # NOTE: numpy gives [low, high) but the paper advocates [low, high] + mult = np.random.uniform(gamma, 1/gamma, size=size) + if allow_negation: + # NOTE: i have yet to see this do anything but cause divergence. + # i've referenced the paper several times yet still don't understand + # what i'm doing wrong, so i'm disabling it by default in my code. + # maybe i just need *a lot* more weights to compensate. + prob = (W / alpha + 1) / 2 + samples = np.random.random_sample(size=size) + mult *= np.where(samples < prob, 1, -1) + np.multiply(W, mult, out=W) + +class StochMRitual(Ritual): + # paper: https://arxiv.org/abs/1606.01981 + # this probably doesn't make sense for regression problems, + # let alone small models, but here it is anyway! 
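+    # the idea: temporarily multiply each Dense layer's weights by random
+    # factors drawn from [gamma, 1/gamma] before computing gradients,
+    # then restore the unperturbed weights before the update is applied.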
+ + def __init__(self, learner=None, gamma=0.5): + super().__init__(learner) + self.gamma = _f(gamma) + + def prepare(self, model): + self.W = np.copy(model.W) + super().prepare(model) + + def learn(self, inputs, outputs): + # an experiment: + #assert self.learner.rate < 10, self.learner.rate + #self.gamma = 1 - 1/2**(1 - np.log10(self.learner.rate)) + + self.W[:] = self.model.W + for layer in self.model.ordered_nodes: + if isinstance(layer, Dense): + stochastic_multiply(layer.coeffs.ravel(), gamma=self.gamma) + residual = super().learn(inputs, outputs) + self.model.W[:] = self.W + return residual + + def update(self): + super().update() + f = 0.5 + for layer in self.model.ordered_nodes: + if isinstance(layer, Dense): + np.clip(layer.W, -layer.std * f, layer.std * f, out=layer.W) + # np.clip(layer.W, -1, 1, out=layer.W) + +class NoisyRitual(Ritual): + def __init__(self, learner=None, + input_noise=0, output_noise=0, gradient_noise=0): + self.input_noise = _f(input_noise) + self.output_noise = _f(output_noise) + self.gradient_noise = _f(gradient_noise) + super().__init__(learner) + + def learn(self, inputs, outputs): + # this is pretty crude + if self.input_noise > 0: + s = self.input_noise + inputs = inputs + np.random.normal(0, s, size=inputs.shape) + if self.output_noise > 0: + s = self.output_noise + outputs = outputs + np.random.normal(0, s, size=outputs.shape) + return super().learn(inputs, outputs) + + def update(self): + # gradient noise paper: https://arxiv.org/abs/1511.06807 + if self.gradient_noise > 0: + size = len(self.model.dW) + gamma = 0.55 + #s = self.gradient_noise / (1 + self.bn) ** gamma + # experiments: + s = self.gradient_noise * np.sqrt(self.learner.rate) + #s = np.square(self.learner.rate) + #s = self.learner.rate / self.en + self.model.dW += np.random.normal(0, max(s, 1e-8), size=size) + super().update() + diff --git a/onn/ritual_base.py b/onn/ritual_base.py new file mode 100644 index 0000000..5362a3a --- /dev/null +++ b/onn/ritual_base.py @@ -0,0 +1,132 @@ +import types +import numpy as np + +from .floats import * + +class Ritual: # i'm just making up names at this point. + def __init__(self, learner=None): + self.learner = learner if learner is not None else Learner(Optimizer()) + self.model = None + + def reset(self): + self.learner.reset(optim=True) + self.en = 0 + self.bn = 0 + + def learn(self, inputs, outputs): + error, predicted = self.model.forward(inputs, outputs) + self.model.backward(predicted, outputs) + self.model.regulate() + return error, predicted + + def update(self): + optim = self.learner.optim + optim.model = self.model + optim.update(self.model.dW, self.model.W) + + def prepare(self, model): + self.en = 0 + self.bn = 0 + self.model = model + + def _train_batch(self, batch_inputs, batch_outputs, b, batch_count, + test_only=False, loss_logging=False, mloss_logging=True): + if not test_only and self.learner.per_batch: + self.learner.batch(b / batch_count) + + if test_only: + predicted = self.model.evaluate(batch_inputs, deterministic=True) + else: + error, predicted = self.learn(batch_inputs, batch_outputs) + self.model.regulate_forward() + self.update() + + if loss_logging: + batch_loss = self.model.loss.forward(predicted, batch_outputs) + if np.isnan(batch_loss): + raise Exception("nan") + self.losses.append(batch_loss) + self.cumsum_loss += batch_loss + + if mloss_logging: + # NOTE: this can use the non-deterministic predictions. fixme? 
+ batch_mloss = self.model.mloss.forward(predicted, batch_outputs) + if np.isnan(batch_mloss): + raise Exception("nan") + self.mlosses.append(batch_mloss) + self.cumsum_mloss += batch_mloss + + def train_batched(self, inputs_or_generator, outputs_or_batch_count, + batch_size=None, + return_losses=False, test_only=False, shuffle=True, + clear_grad=True): + assert isinstance(return_losses, bool) or return_losses == 'both' + assert self.model is not None + + gen = isinstance(inputs_or_generator, types.GeneratorType) + if gen: + generator = inputs_or_generator + batch_count = outputs_or_batch_count + assert isinstance(batch_count, int), type(batch_count) + else: + inputs = inputs_or_generator + outputs = outputs_or_batch_count + + if not test_only: + self.en += 1 + + if shuffle: + if gen: + raise Exception("shuffling is incompatibile with using a generator.") + indices = np.arange(inputs.shape[0]) + np.random.shuffle(indices) + inputs = inputs[indices] + outputs = outputs[indices] + + self.cumsum_loss, self.cumsum_mloss = _0, _0 + self.losses, self.mlosses = [], [] + + if not gen: + batch_count = inputs.shape[0] // batch_size + # TODO: lift this restriction + assert inputs.shape[0] % batch_size == 0, \ + "inputs is not evenly divisible by batch_size" + + prev_batch_size = None + for b in range(batch_count): + if not test_only: + self.bn += 1 + + if gen: + batch_inputs, batch_outputs = next(generator) + batch_size = batch_inputs.shape[0] + # TODO: lift this restriction + assert batch_size == prev_batch_size or prev_batch_size is None, \ + "non-constant batch size (got {}, expected {})".format(batch_size, prev_batch_size) + else: + bi = b * batch_size + batch_inputs = inputs[ bi:bi+batch_size] + batch_outputs = outputs[bi:bi+batch_size] + + if clear_grad: + self.model.clear_grad() + self._train_batch(batch_inputs, batch_outputs, b, batch_count, + test_only, return_losses=='both', return_losses) + + prev_batch_size = batch_size + + avg_mloss = self.cumsum_mloss / _f(batch_count) + if return_losses == 'both': + avg_loss = self.cumsum_loss / _f(batch_count) + return avg_loss, avg_mloss, self.losses, self.mlosses + elif return_losses: + return avg_mloss, self.mlosses + return avg_mloss + + def test_batched(self, inputs, outputs, *args, **kwargs): + return self.train_batched(inputs, outputs, *args, + test_only=True, **kwargs) + + def train_batched_gen(self, generator, batch_count, *args, **kwargs): + return self.train_batched(generator, batch_count, *args, + shuffle=False, **kwargs) diff --git a/onn/util.py b/onn/util.py new file mode 100644 index 0000000..edd895c --- /dev/null +++ b/onn/util.py @@ -0,0 +1,37 @@ +import sys + +def lament(*args, **kwargs): + print(*args, file=sys.stderr, **kwargs) + +def lower_priority(): + """Set the priority of the process to below-normal.""" + # via https://stackoverflow.com/a/1023269 + if sys.platform == 'win32': + try: + import win32api, win32process, win32con + pid = win32api.GetCurrentProcessId() + handle = win32api.OpenProcess(win32con.PROCESS_ALL_ACCESS, True, pid) + win32process.SetPriorityClass(handle, win32process.BELOW_NORMAL_PRIORITY_CLASS) + except ImportError: + lament("you do not have pywin32 installed.") + lament("the process priority could not be lowered.") + lament("consider: python -m pip install pypiwin32") + lament("consider: conda install pywin32") + else: + import os + os.nice(1) + +# more + +_log_was_update = False +def log(left, right, update=False): + s = "\x1B[1m {:>20}:\x1B[0m {}".format(left, right) + global _log_was_update + if update 
and _log_was_update: + lament('\x1B[F' + s) + else: + lament(s) + _log_was_update = update + +class Dummy: + pass diff --git a/onn/weights.py b/onn/weights.py new file mode 100644 index 0000000..a531d64 --- /dev/null +++ b/onn/weights.py @@ -0,0 +1,58 @@ +import numpy as np + +class Weights: + # we may or may not contain weights -- or any information, for that matter. + + def __init__(self, **kwargs): + self.f = None # forward weights + self.g = None # backward weights (gradients) + self.shape = None + self.init = None + self.allocator = None + self.regularizer = None + self._allocated = False + + self.configure(**kwargs) + + def configure(self, **kwargs): + for k, v in kwargs.items(): + getattr(self, k) # ensures the key already exists + setattr(self, k, v) + + @property + def size(self): + assert self.shape is not None + return np.prod(self.shape) + + def allocate(self, *args, **kwargs): + if self._allocated: + raise Exception("attempted to allocate existing weights") + self.configure(**kwargs) + + # intentionally not using isinstance + assert type(self.shape) == tuple, self.shape + + f, g = self.allocator(self.size) + assert len(f) == self.size, "{} != {}".format(f.shape, self.size) + assert len(g) == self.size, "{} != {}".format(g.shape, self.size) + f[:] = self.init(self.size, *args) + g[:] = self.init(self.size, *args) + self.f = f.reshape(self.shape) + self.g = g.reshape(self.shape) + + self._allocated = True + + def forward(self): + if self.regularizer is None: + return 0.0 + return self.regularizer.forward(self.f) + + def backward(self): + if self.regularizer is None: + return 0.0 + return self.regularizer.backward(self.f) + + def update(self): + if self.regularizer is None: + return + self.g += self.regularizer.backward(self.f) diff --git a/onn_core.py b/onn_core.py deleted file mode 100644 index 62be569..0000000 --- a/onn_core.py +++ /dev/null @@ -1,1398 +0,0 @@ -import sys -import types - -def lament(*args, **kwargs): - print(*args, file=sys.stderr, **kwargs) - -def lower_priority(): - """Set the priority of the process to below-normal.""" - # via https://stackoverflow.com/a/1023269 - if sys.platform == 'win32': - try: - import win32api, win32process, win32con - pid = win32api.GetCurrentProcessId() - handle = win32api.OpenProcess(win32con.PROCESS_ALL_ACCESS, True, pid) - win32process.SetPriorityClass(handle, win32process.BELOW_NORMAL_PRIORITY_CLASS) - except ImportError: - lament("you do not have pywin32 installed.") - lament("the process priority could not be lowered.") - lament("consider: python -m pip install pypiwin32") - lament("consider: conda install pywin32") - else: - import os - os.nice(1) - -import numpy as np -_f = np.float32 - -# just for speed, not strictly essential: -from scipy.special import expit as sigmoid - -# used for numbering layers like Keras, and keeping initialization consistent: -from collections import defaultdict, OrderedDict -_layer_counters = defaultdict(lambda: 0) - -def _check(a): - assert isinstance(a, np.ndarray) or type(a) == _f, type(a) - assert a.dtype == _f, a.dtype - return a - -_0 = _f(0) -_1 = _f(1) -_2 = _f(2) -_inv2 = _f(1/2) -_sqrt2 = _f(np.sqrt(2)) -_invsqrt2 = _f(1/np.sqrt(2)) -_pi = _f(np.pi) - -class LayerIncompatibility(Exception): - pass - -# Node Traversal {{{1 - -class DummyNode: - name = "Dummy" - - def __init__(self, children=None, parents=None): - self.children = children if children is not None else [] - self.parents = parents if parents is not None else [] - -def traverse(node_in, node_out, nodes=None, dummy_mode=False): 
- # i have no idea if this is any algorithm in particular. - nodes = nodes if nodes is not None else [] - - seen_up = {} - q = [node_out] - while len(q) > 0: - node = q.pop(0) - seen_up[node] = True - for parent in node.parents: - q.append(parent) - - if dummy_mode: - seen_up[node_in] = True - - nodes = [] - q = [node_in] - while len(q) > 0: - node = q.pop(0) - if not seen_up[node]: - continue - parents_added = (parent in nodes for parent in node.parents) - if not node in nodes and all(parents_added): - nodes.append(node) - for child in node.children: - q.append(child) - - if dummy_mode: - nodes.remove(node_in) - - return nodes - -def traverse_all(nodes_in, nodes_out, nodes=None): - all_in = DummyNode(children=nodes_in) - all_out = DummyNode(parents=nodes_out) - return traverse(all_in, all_out, nodes, dummy_mode=True) - -# Initializations {{{1 - -# note: these are currently only implemented for 2D shapes. - -def init_zeros(size, ins=None, outs=None): - return np.zeros(size) - -def init_ones(size, ins=None, outs=None): - return np.ones(size) - -def init_he_normal(size, ins, outs): - s = np.sqrt(2 / ins) - return np.random.normal(0, s, size=size) - -def init_he_uniform(size, ins, outs): - s = np.sqrt(6 / ins) - return np.random.uniform(-s, s, size=size) - -def init_glorot_normal(size, ins, outs): - s = np.sqrt(2 / (ins + outs)) - return np.random.normal(0, s, size=size) - -def init_glorot_uniform(size, ins, outs): - s = np.sqrt(6 / (ins + outs)) - return np.random.uniform(-s, s, size=size) - -# Weight container {{{1 - -class Weights: - # we may or may not contain weights -- or any information, for that matter. - - def __init__(self, **kwargs): - self.f = None # forward weights - self.g = None # backward weights (gradients) - self.shape = None - self.init = None - self.allocator = None - self.regularizer = None - self._allocated = False - - self.configure(**kwargs) - - def configure(self, **kwargs): - for k, v in kwargs.items(): - getattr(self, k) # ensures the key already exists - setattr(self, k, v) - - @property - def size(self): - assert self.shape is not None - return np.prod(self.shape) - - def allocate(self, *args, **kwargs): - if self._allocated: - raise Exception("attempted to allocate existing weights") - self.configure(**kwargs) - - # intentionally not using isinstance - assert type(self.shape) == tuple, self.shape - - f, g = self.allocator(self.size) - assert len(f) == self.size, "{} != {}".format(f.shape, self.size) - assert len(g) == self.size, "{} != {}".format(g.shape, self.size) - f[:] = self.init(self.size, *args) - g[:] = self.init(self.size, *args) - self.f = f.reshape(self.shape) - self.g = g.reshape(self.shape) - - self._allocated = True - - def forward(self): - if self.regularizer is None: - return 0.0 - return self.regularizer.forward(self.f) - - def backward(self): - if self.regularizer is None: - return 0.0 - return self.regularizer.backward(self.f) - - def update(self): - if self.regularizer is None: - return - self.g += self.regularizer.backward(self.f) - -# Loss functions {{{1 - -class Loss: - pass - -class NLL(Loss): # Negative Log Likelihood - def forward(self, p, y): - correct = p * y - return np.mean(-correct) - - def backward(self, p, y): - return -y / len(p) - -class CategoricalCrossentropy(Loss): - # lifted from theano - - def __init__(self, eps=1e-6): - self.eps = _f(eps) - - def forward(self, p, y): - p = np.clip(p, self.eps, 1 - self.eps) - f = np.sum(-y * np.log(p) - (1 - y) * np.log(1 - p), axis=-1) - return np.mean(f) - - def backward(self, p, y): - p 
= np.clip(p, self.eps, 1 - self.eps) - df = (p - y) / (p * (1 - p)) - return df / len(y) - -class Accuracy(Loss): - # returns percentage of categories correctly predicted. - # utilizes argmax(), so it cannot be used for gradient descent. - # use CategoricalCrossentropy or NLL for that instead. - - def forward(self, p, y): - correct = np.argmax(p, axis=-1) == np.argmax(y, axis=-1) - return np.mean(correct) - - def backward(self, p, y): - raise NotImplementedError("cannot take the gradient of Accuracy") - -class ResidualLoss(Loss): - def forward(self, p, y): - return np.mean(self.f(p - y)) - - def backward(self, p, y): - ret = self.df(p - y) / len(y) - return ret - -class SquaredHalved(ResidualLoss): - def f(self, r): - return np.square(r) / 2 - - def df(self, r): - return r - -class Squared(ResidualLoss): - def f(self, r): - return np.square(r) - - def df(self, r): - return 2 * r - -class Absolute(ResidualLoss): - def f(self, r): - return np.abs(r) - - def df(self, r): - return np.sign(r) - -class Huber(ResidualLoss): - def __init__(self, delta=1.0): - self.delta = _f(delta) - - def f(self, r): - return np.where(r <= self.delta, - np.square(r) / 2, - self.delta * (np.abs(r) - self.delta / 2)) - - def df(self, r): - return np.where(r <= self.delta, - r, - self.delta * np.sign(r)) - -# Regularizers {{{1 - -class Regularizer: - pass - -class L1L2(Regularizer): - def __init__(self, l1=0.0, l2=0.0): - self.l1 = _f(l1) - self.l2 = _f(l2) - - def forward(self, X): - f = _0 - if self.l1: - f += np.sum(self.l1 * np.abs(X)) - if self.l2: - f += np.sum(self.l2 * np.square(X)) - return f - - def backward(self, X): - df = np.zeros_like(X) - if self.l1: - df += self.l1 * np.sign(X) - if self.l2: - df += self.l2 * 2 * X - return df - -# Optimizers {{{1 - -class Optimizer: - def __init__(self, lr=0.1): - self.lr = _f(lr) # learning rate - self.reset() - - def reset(self): - pass - - def compute(self, dW, W): - return -self.lr * dW - - def update(self, dW, W): - W += self.compute(dW, W) - -# some of the the following optimizers are blatantly lifted from tiny-dnn: -# https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/optimizers/optimizer.h - -class Momentum(Optimizer): - def __init__(self, lr=0.01, mu=0.9, nesterov=False): - self.mu = _f(mu) # momentum - self.nesterov = bool(nesterov) - - super().__init__(lr) - - def reset(self): - self.Vprev = None - - def compute(self, dW, W): - if self.Vprev is None: - self.Vprev = np.copy(dW) - - V = self.mu * self.Vprev - self.lr * dW - self.Vprev[:] = V - if self.nesterov: - return self.mu * V - self.lr * dW - - return V - -class Adagrad(Optimizer): - def __init__(self, lr=0.01, eps=1e-8): - self.eps = _f(eps) - - super().__init__(lr) - - def reset(self): - self.g = None - - def compute(self, dW, W): - if self.g is None: - self.g = np.zeros_like(dW) - - self.g += np.square(dW) - return -self.lr * dW / (np.sqrt(self.g) + self.eps) - -class RMSprop(Optimizer): - # RMSprop generalizes* Adagrad, etc. - - # * RMSprop == Adagrad when - # RMSprop.mu == 1 - - def __init__(self, lr=1e-4, mu=0.99, eps=1e-8): - self.mu = _f(mu) # decay term - self.eps = _f(eps) - - # one might consider the following equation when specifying mu: - # mu = e**(-1/t) - # default: t = -1/ln(0.99) = ~99.5 - # therefore the default of mu=0.99 means - # an input decays to 1/e its original amplitude over 99.5 batches. 
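- # for example, to average over roughly 200 batches instead,
- # one would pick mu = e**(-1/200) ~= 0.995.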
- # (this is from DSP, so how relevant it is in SGD is debatable) - - super().__init__(lr) - - def reset(self): - self.g = None - - def compute(self, dW, W): - if self.g is None: - self.g = np.zeros_like(dW) - - # basically apply a first-order low-pass filter to delta squared - self.g += (1 - self.mu) * (np.square(dW) - self.g) - - # finally sqrt it to complete the running root-mean-square approximation - return -self.lr * dW / (np.sqrt(self.g) + self.eps) - -class RMSpropCentered(Optimizer): - # referenced TensorFlow/PyTorch. - # paper: https://arxiv.org/pdf/1308.0850v5.pdf - - def __init__(self, lr=1e-4, aleph=0.95, momentum=0.9, eps=1e-8): - self.aleph = _f(aleph) - self.momentum = _f(momentum) - self.eps = _f(eps) - - super().__init__(lr) - - def reset(self): - self.g = None - self.mt = None - self.vt = None - self.delta = None - - def compute(self, dW, W): - if self.g is None: - self.g = np.zeros_like(dW) - if self.mt is None: - self.mt = np.zeros_like(dW) - if self.vt is None: - self.vt = np.zeros_like(dW) - if self.delta is None: - self.delta = np.zeros_like(dW) - - self.mt += (1 - self.aleph) * (dW - self.mt) - self.vt += (1 - self.aleph) * (np.square(dW) - self.vt) - - # PyTorch has the epsilon outside of the sqrt, - # TensorFlow and the paper have it within. - # in onn, we generally do it outside, as this seems to work better. - temp = dW / (np.sqrt(self.vt - np.square(self.mt)) + self.eps) - - # TensorFlow does it this way. - self.delta[:] = self.momentum * self.delta + self.lr * temp - return -self.delta - # PyTorch does it this way. - #self.delta[:] = self.momentum * self.delta + temp - #return -self.lr * self.delta - # they are equivalent only when LR is constant, which it might not be. - -class Adam(Optimizer): - # paper: https://arxiv.org/abs/1412.6980 - # Adam generalizes* RMSprop, and - # adds a decay term to the regular (non-squared) delta, and performs - # debiasing to compensate for the filtered deltas starting from zero. - - # * Adam == RMSprop when - # Adam.b1 == 0 - # Adam.b2 == RMSprop.mu - - def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8): - self.b1 = _f(b1) # decay term - self.b2 = _f(b2) # decay term - self.b1_t_default = _f(b1) # decay term power t - self.b2_t_default = _f(b2) # decay term power t - self.eps = _f(eps) - - super().__init__(lr) - - def reset(self): - self.mt = None - self.vt = None - self.b1_t = self.b1_t_default - self.b2_t = self.b2_t_default - - def compute(self, dW, W): - if self.mt is None: - self.mt = np.zeros_like(dW) - if self.vt is None: - self.vt = np.zeros_like(dW) - - # decay gain - self.b1_t *= self.b1 - self.b2_t *= self.b2 - - # filter - self.mt += (1 - self.b1) * (dW - self.mt) - self.vt += (1 - self.b2) * (np.square(dW) - self.vt) - - return -self.lr * (self.mt / (1 - self.b1_t)) \ - / (np.sqrt(self.vt / (1 - self.b2_t)) + self.eps) - -class Nadam(Optimizer): - # paper: https://arxiv.org/abs/1412.6980 - # paper: http://cs229.stanford.edu/proj2015/054_report.pdf - # TODO: double-check this implementation. also read the damn paper. 
- # lifted from https://github.com/fchollet/keras/blob/5d38b04/keras/optimizers.py#L530 - # lifted from https://github.com/jpilaul/IFT6266_project/blob/master/Models/Algo_Momentum.py - - def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8): - self.b1 = _f(b1) # decay term - self.b2 = _f(b2) # decay term - self.eps = _f(eps) - - super().__init__(lr) - - def reset(self): - self.mt = None - self.vt = None - self.t = 0 - self.sched = 1 - - def compute(self, dW, W): - self.t += 1 - - if self.mt is None: - self.mt = np.zeros_like(dW) - if self.vt is None: - self.vt = np.zeros_like(dW) - - ut0 = self.b1 * (1 - 0.5 * 0.96**(self.t + 0)) - ut1 = self.b1 * (1 - 0.5 * 0.96**(self.t + 1)) - - sched0 = self.sched * ut0 - sched1 = self.sched * ut0 * ut1 - self.sched = sched0 - - gp = dW / (1 - sched0) - - self.mt += (1 - self.b1) * (dW - self.mt) - self.vt += (1 - self.b2) * (np.square(dW) - self.vt) - - mtp = self.mt / (1 - sched1) - vtp = self.vt / (1 - self.b2**self.t) - - mt_bar = (1 - ut0) * gp + ut1 * mtp - - return -self.lr * mt_bar / (np.sqrt(vtp) + self.eps) - -# Abstract Layers {{{1 - -class Layer: - def __init__(self): - self.parents = [] - self.children = [] - self.weights = OrderedDict() - self.loss = None # for activity regularizers - self.input_shape = None - self.output_shape = None - kind = self.__class__.__name__ - global _layer_counters - _layer_counters[kind] += 1 - self.name = "{}_{}".format(kind, _layer_counters[kind]) - self.unsafe = False # disables assertions for better performance - self.shared = False # as in weight sharing - - def __str__(self): - return self.name - - # methods we might want to override: - - def forward(self, X): - raise NotImplementedError("unimplemented", self) - - def forward_deterministic(self, X): - return self.forward(X) - - def backward(self, dY): - raise NotImplementedError("unimplemented", self) - - def make_shape(self, parent): - if self.input_shape == None: - self.input_shape = parent.output_shape - if self.output_shape == None: - self.output_shape = self.input_shape - - def do_feed(self, child): - self.children.append(child) - - def be_fed(self, parent): - self.parents.append(parent) - - # TODO: better names for these (still) - def _propagate(self, edges, deterministic): - if not self.unsafe: - assert len(edges) == 1, self - if deterministic: - return self.forward_deterministic(edges[0]) - else: - return self.forward(edges[0]) - - def _backpropagate(self, edges): - if len(edges) == 1: - return self.backward(edges[0]) - return sum((self.backward(dY) for dY in edges)) - - # general utility methods: - - def is_compatible(self, parent): - return np.all(self.input_shape == parent.output_shape) - - def feed(self, child): - assert self.output_shape is not None, self - child.make_shape(self) - if not child.is_compatible(self): - fmt = "{} is incompatible with {}: shape mismatch: {} vs. {}" - raise LayerIncompatibility(fmt.format(self, child, self.output_shape, child.input_shape)) - self.do_feed(child) - child.be_fed(self) - return child - - def validate_input(self, X): - assert X.shape[1:] == self.input_shape, (str(self), X.shape[1:], self.input_shape) - - def validate_output(self, Y): - assert Y.shape[1:] == self.output_shape, (str(self), Y.shape[1:], self.output_shape) - - def _new_weights(self, name, **kwargs): - w = Weights(**kwargs) - assert name not in self.weights, name - self.weights[name] = w - return w - - def share(self, node): - self.weights = node.weights # TODO: this should be all it takes. 
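- # (it isn't quite, because layers also keep direct attributes such as
- # self.coeffs and self.biases pointing at their Weights objects;
- # the loop below re-binds those onto the shared node's.)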
- for k, v in self.weights.items(): - vs = getattr(node, k) # hack: key isn't necessarily attribute name! - setattr(self, k, vs) - self.shared = True - - def clear_grad(self): - for name, w in self.weights.items(): - w.g[:] = 0 - - @property - def size(self): - return sum((w.size for w in self.weights.values())) - - def init(self, allocator): - ins, outs = self.input_shape[0], self.output_shape[0] - for k, w in self.weights.items(): - w.allocate(ins, outs, allocator=allocator) - - def propagate(self, values, deterministic): - if not self.unsafe: - assert self.parents, self - edges = [] - for parent in self.parents: - if parent in values: - X = values[parent] - if not self.unsafe: - self.validate_input(X) - edges.append(X) - Y = self._propagate(edges, deterministic) - if not self.unsafe: - self.validate_output(Y) - return Y - - def backpropagate(self, values): - if not self.unsafe: - assert self.children, self - edges = [] - for child in self.children: - if child in values: - dY = values[child] - if not self.unsafe: - self.validate_output(dY) - edges.append(dY) - dX = self._backpropagate(edges) - if not self.unsafe: - self.validate_input(dX) - return dX - -# Nonparametric Layers {{{1 - -class Input(Layer): - def __init__(self, shape): - assert shape is not None - super().__init__() - self.shape = tuple(shape) - self.input_shape = self.shape - self.output_shape = self.shape - - def forward(self, X): - return X - - def backward(self, dY): - #self.dY = dY - return np.zeros_like(dY) - -class Reshape(Layer): - def __init__(self, new_shape): - super().__init__() - self.shape = tuple(new_shape) - self.output_shape = self.shape - - def forward(self, X): - self.batch_size = X.shape[0] - return X.reshape(self.batch_size, *self.output_shape) - - def backward(self, dY): - assert dY.shape[0] == self.batch_size - return dY.reshape(self.batch_size, *self.input_shape) - -class Flatten(Layer): - def make_shape(self, parent): - shape = parent.output_shape - self.input_shape = shape - self.output_shape = (np.prod(shape),) - - def forward(self, X): - self.batch_size = X.shape[0] - return X.reshape(self.batch_size, *self.output_shape) - - def backward(self, dY): - assert dY.shape[0] == self.batch_size - return dY.reshape(self.batch_size, *self.input_shape) - -class ConstAffine(Layer): - def __init__(self, a=1, b=0): - super().__init__() - self.a = _f(a) - self.b = _f(b) - - def forward(self, X): - return self.a * X + self.b - - def backward(self, dY): - return dY * self.a - -class Sum(Layer): - def _propagate(self, edges, deterministic): - return np.sum(edges, axis=0) - - def _backpropagate(self, edges): - #assert len(edges) == 1, "unimplemented" - return edges[0] # TODO: does this always work? 
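- # (the gradient of a sum with respect to each addend is just dY,
- # so passing the upstream edge through unchanged is correct for a
- # single child; with several children this would need to be sum(edges),
- # as the base class does.)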
- -class ActivityRegularizer(Layer): - def __init__(self, reg): - super().__init__() - assert isinstance(reg, Regularizer), reg - self.reg = reg - - def forward(self, X): - self.X = X - self.loss = np.sum(self.reg.forward(X)) - return X - - def backward(self, dY): - return dY + self.reg.backward(self.X) - -class Dropout(Layer): - def __init__(self, dropout=0.0): - super().__init__() - self.p = _f(1 - dropout) - assert 0 <= self.p <= 1 - - def forward(self, X): - self.mask = (np.random.rand(*X.shape) < self.p) / self.p - return X * self.mask - - def forward_deterministic(self, X): - #self.mask = _1 - return X - - def backward(self, dY): - return dY * self.mask - -# Activation Layers {{{2 - -class Identity(Layer): - def forward(self, X): - return X - - def backward(self, dY): - return dY - -class Sigmoid(Layer): # aka Logistic, Expit (inverse of Logit) - def forward(self, X): - self.sig = sigmoid(X) - return self.sig - - def backward(self, dY): - return dY * self.sig * (1 - self.sig) - -class Softplus(Layer): - # integral of Sigmoid. - - def forward(self, X): - self.X = X - return np.log(1 + np.exp(X)) - - def backward(self, dY): - return dY * sigmoid(self.X) - -class Tanh(Layer): - def forward(self, X): - self.sig = np.tanh(X) - return self.sig - - def backward(self, dY): - return dY * (1 - self.sig * self.sig) - -class LeCunTanh(Layer): - # paper: http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf - # paper: http://yann.lecun.com/exdb/publis/pdf/lecun-89.pdf - # scaled such that f([-1, 1]) = [-1, 1]. - # helps preserve an input variance of 1. - # second derivative peaks around an input of ±1. - - def forward(self, X): - self.sig = np.tanh(2 / 3 * X) - return 1.7159 * self.sig - - def backward(self, dY): - return dY * (2 / 3 * 1.7159) * (1 - self.sig * self.sig) - -class Relu(Layer): - def forward(self, X): - self.cond = X >= 0 - return np.where(self.cond, X, 0) - - def backward(self, dY): - return np.where(self.cond, dY, 0) - -class Elu(Layer): - # paper: https://arxiv.org/abs/1511.07289 - - def __init__(self, alpha=1): - super().__init__() - self.alpha = _f(alpha) # FIXME: unused - - def forward(self, X): - self.cond = X >= 0 - self.neg = np.exp(X) - 1 - return np.where(self.cond, X, self.neg) - - def backward(self, dY): - return dY * np.where(self.cond, 1, self.neg + 1) - -class GeluApprox(Layer): - # paper: https://arxiv.org/abs/1606.08415 - # plot: https://www.desmos.com/calculator/ydzgtccsld - - def forward(self, X): - self.a = 1.704 * X - self.sig = sigmoid(self.a) - return X * self.sig - - def backward(self, dY): - return dY * self.sig * (1 + self.a * (1 - self.sig)) - -class Softmax(Layer): - def forward(self, X): - alpha = np.max(X, axis=-1, keepdims=True) - num = np.exp(X - alpha) - den = np.sum(num, axis=-1, keepdims=True) - self.sm = num / den - return self.sm - - def backward(self, dY): - return (dY - np.sum(dY * self.sm, axis=-1, keepdims=True)) * self.sm - -class LogSoftmax(Softmax): - def __init__(self, eps=1e-6): - super().__init__() - self.eps = _f(eps) - - def forward(self, X): - return np.log(super().forward(X) + self.eps) - - def backward(self, dY): - return dY - np.sum(dY, axis=-1, keepdims=True) * self.sm - -class Cos(Layer): - # performs well on MNIST for some strange reason. - - def forward(self, X): - self.X = X - return np.cos(X) - - def backward(self, dY): - return dY * -np.sin(self.X) - -# Parametric Layers {{{1 - -class Bias(Layer): - # TODO: support axes other than -1 and shapes other than 1D. 
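- # (forward broadcasting already handles N-D inputs, but backward's
- # dY.sum(0) only reduces the batch axis, so anything beyond
- # (batch, features) would need e.g. dY.sum(axis=(0, 1, 2)) there.)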
- - serialized = { - 'b': 'biases', - } - - def __init__(self, init=init_zeros, reg_b=None): - super().__init__() - self.biases = self._new_weights('biases', init=init, regularizer=reg_b) - - def make_shape(self, parent): - shape = parent.output_shape - self.input_shape = shape - self.output_shape = shape - self.biases.shape = (shape[-1],) - - def forward(self, X): - return X + self.biases.f - - def backward(self, dY): - self.biases.g += dY.sum(0) - return dY - -class Dense(Layer): - serialized = { - 'W': 'coeffs', - 'b': 'biases', - } - - def __init__(self, dim, init=init_he_uniform, reg_w=None, reg_b=None): - super().__init__() - self.dim = int(dim) - self.output_shape = (dim,) - self.coeffs = self._new_weights('coeffs', init=init, regularizer=reg_w) - self.biases = self._new_weights('biases', init=init_zeros, regularizer=reg_b) - - def make_shape(self, parent): - shape = parent.output_shape - self.input_shape = shape - assert len(shape) == 1, shape - self.coeffs.shape = (shape[0], self.dim) - self.biases.shape = (1, self.dim) - - def forward(self, X): - self.X = X - return X @ self.coeffs.f + self.biases.f - - def backward(self, dY): - self.coeffs.g += self.X.T @ dY - self.biases.g += dY.sum(0, keepdims=True) - return dY @ self.coeffs.f.T - -# Models {{{1 - -class Model: - def __init__(self, nodes_in, nodes_out, loss=None, mloss=None, unsafe=False): - self.loss = loss if loss is not None else SquaredHalved() - self.mloss = mloss if mloss is not None else loss - - nodes_in = [nodes_in] if isinstance(nodes_in, Layer) else nodes_in - nodes_out = [nodes_out] if isinstance(nodes_out, Layer) else nodes_out - assert type(nodes_in) == list, type(nodes_in) - assert type(nodes_out) == list, type(nodes_out) - self.nodes_in = nodes_in - self.nodes_out = nodes_out - - self.nodes = traverse_all(self.nodes_in, self.nodes_out) - self.make_weights() - for node in self.nodes: - node.unsafe = unsafe - # TODO: handle the same layer being in more than one node. - - @property - def ordered_nodes(self): - # deprecated? we don't guarantee an order like we did before. - return self.nodes - - def make_weights(self): - self.param_count = sum((node.size for node in self.nodes if not node.shared)) - self.W = np.zeros(self.param_count, dtype=_f) - self.dW = np.zeros(self.param_count, dtype=_f) - - offset = 0 - for node in self.nodes: - if node.size > 0 and not node.shared: - inner_offset = 0 - - def allocate(size): - nonlocal inner_offset - o = offset + inner_offset - ret = self.W[o:o+size], self.dW[o:o+size] - inner_offset += size - assert len(ret[0]) == len(ret[1]) - assert size == len(ret[0]), (size, len(ret[0])) - return ret - - node.init(allocate) - assert inner_offset <= node.size, "Layer {} allocated more weights than it said it would".format(node) - # i don't care if "less" is grammatically incorrect. - # you're mom is grammatically incorrect. - assert inner_offset >= node.size, "Layer {} allocated less weights than it said it would".format(node) - offset += node.size - - def evaluate(self, input_, deterministic=True): - assert len(self.nodes_in) == 1, "ambiguous input in multi-input network; use evaluate_multi() instead" - assert len(self.nodes_out) == 1, "ambiguous output in multi-output network; use evaluate_multi() instead" - node_in = self.nodes_in[0] - node_out = self.nodes_out[0] - outputs = self.evaluate_multi({node_in: input_}, deterministic) - return outputs[node_out] - - def apply(self, error): # TODO: better name? 
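- # back-propagates an error signal (dloss/doutput) through the whole
- # graph and returns dloss/dinput; the single-input, single-output
- # convenience wrapper around apply_multi(), mirroring evaluate().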
- assert len(self.nodes_in) == 1, "ambiguous input in multi-input network; use apply_multi() instead" - assert len(self.nodes_out) == 1, "ambiguous output in multi-output network; use apply_multi() instead" - node_in = self.nodes_in[0] - node_out = self.nodes_out[0] - inputs = self.apply_multi({node_out: error}) - return inputs[node_in] - - def evaluate_multi(self, inputs, deterministic=True): - values = dict() - outputs = dict() - for node in self.nodes: - if node in self.nodes_in: - assert node in inputs, "missing input for node {}".format(node.name) - X = inputs[node] - values[node] = node._propagate(np.expand_dims(X, 0), deterministic) - else: - values[node] = node.propagate(values, deterministic) - if node in self.nodes_out: - outputs[node] = values[node] - return outputs - - def apply_multi(self, outputs): - values = dict() - inputs = dict() - for node in reversed(self.nodes): - if node in self.nodes_out: - assert node in outputs, "missing output for node {}".format(node.name) - X = outputs[node] - values[node] = node._backpropagate(np.expand_dims(X, 0)) - else: - values[node] = node.backpropagate(values) - if node in self.nodes_in: - inputs[node] = values[node] - return inputs - - def forward(self, inputs, outputs, measure=False, deterministic=False): - predicted = self.evaluate(inputs, deterministic=deterministic) - if measure: - error = self.mloss.forward(predicted, outputs) - else: - error = self.loss.forward(predicted, outputs) - return error, predicted - - def backward(self, predicted, outputs, measure=False): - if measure: - error = self.mloss.backward(predicted, outputs) - else: - error = self.loss.backward(predicted, outputs) - # input_delta is rarely useful; it's just to match the forward pass. - input_delta = self.apply(error) - return self.dW, input_delta - - def clear_grad(self): - for node in self.nodes: - node.clear_grad() - - def regulate_forward(self): - loss = _0 - for node in self.nodes: - if node.loss is not None: - loss += node.loss - for k, w in node.weights.items(): - loss += w.forward() - return loss - - def regulate(self): - for node in self.nodes: - for k, w in node.weights.items(): - w.update() - - def load_weights(self, fn): - # seemingly compatible with keras' Dense layers. - import h5py - open(fn) # just ensure the file exists (python's error is better) - f = h5py.File(fn, 'r') - weights = {} - def visitor(name, obj): - if isinstance(obj, h5py.Dataset): - weights[name.split('/')[-1]] = np.array(obj[:], dtype=_f) - f.visititems(visitor) - f.close() - - used = {} - for k in weights.keys(): - used[k] = False - - nodes = [node for node in self.nodes if node.size > 0] - # TODO: support shared weights. - for node in nodes: - full_name = str(node).lower() - for s_name, o_name in node.serialized.items(): - key = full_name + '_' + s_name - data = weights[key] - target = getattr(node, o_name) - target.f[:] = data - used[key] = True - - for k, v in used.items(): - if not v: - lament("WARNING: unused weight", k) - - def save_weights(self, fn, overwrite=False): - import h5py - f = h5py.File(fn, 'w') - - counts = defaultdict(lambda: 0) - - nodes = [node for node in self.nodes if node.size > 0] - # TODO: support shared weights. 
- for node in nodes: - full_name = str(node).lower() - grp = f.create_group(full_name) - for s_name, o_name in node.serialized.items(): - key = full_name + '_' + s_name - target = getattr(node, o_name) - data = grp.create_dataset(key, target.shape, dtype=_f) - data[:] = target.f - counts[key] += 1 - if counts[key] > 1: - lament("WARNING: rewrote weight", key) - - f.close() - - def print_graph(self, file=sys.stdout): - print('digraph G {', file=file) - for node in self.nodes: - children = [str(n) for n in node.children] - if children: - sep = '->' - print('\t' + str(node) + sep + (';\n\t' + str(node) + sep).join(children) + ';', file=file) - print('}', file=file) - -# Rituals {{{1 - -class Ritual: # i'm just making up names at this point. - def __init__(self, learner=None): - self.learner = learner if learner is not None else Learner(Optimizer()) - self.model = None - - def reset(self): - self.learner.reset(optim=True) - self.en = 0 - self.bn = 0 - - def learn(self, inputs, outputs): - error, predicted = self.model.forward(inputs, outputs) - self.model.backward(predicted, outputs) - self.model.regulate() - return error, predicted - - def update(self): - optim = self.learner.optim - optim.model = self.model - optim.update(self.model.dW, self.model.W) - - def prepare(self, model): - self.en = 0 - self.bn = 0 - self.model = model - - def _train_batch(self, batch_inputs, batch_outputs, b, batch_count, - test_only=False, loss_logging=False, mloss_logging=True): - if not test_only and self.learner.per_batch: - self.learner.batch(b / batch_count) - - if test_only: - predicted = self.model.evaluate(batch_inputs, deterministic=True) - else: - error, predicted = self.learn(batch_inputs, batch_outputs) - self.model.regulate_forward() - self.update() - - if loss_logging: - batch_loss = self.model.loss.forward(predicted, batch_outputs) - if np.isnan(batch_loss): - raise Exception("nan") - self.losses.append(batch_loss) - self.cumsum_loss += batch_loss - - if mloss_logging: - # NOTE: this can use the non-deterministic predictions. fixme? 
- batch_mloss = self.model.mloss.forward(predicted, batch_outputs) - if np.isnan(batch_mloss): - raise Exception("nan") - self.mlosses.append(batch_mloss) - self.cumsum_mloss += batch_mloss - - def train_batched(self, inputs_or_generator, outputs_or_batch_count, - batch_size=None, - return_losses=False, test_only=False, shuffle=True, - clear_grad=True): - assert isinstance(return_losses, bool) or return_losses == 'both' - assert self.model is not None - - gen = isinstance(inputs_or_generator, types.GeneratorType) - if gen: - generator = inputs_or_generator - batch_count = outputs_or_batch_count - assert isinstance(batch_count, int), type(batch_count) - else: - inputs = inputs_or_generator - outputs = outputs_or_batch_count - - if not test_only: - self.en += 1 - - if shuffle: - if gen: - raise Exception("shuffling is incompatibile with using a generator.") - indices = np.arange(inputs.shape[0]) - np.random.shuffle(indices) - inputs = inputs[indices] - outputs = outputs[indices] - - self.cumsum_loss, self.cumsum_mloss = _0, _0 - self.losses, self.mlosses = [], [] - - if not gen: - batch_count = inputs.shape[0] // batch_size - # TODO: lift this restriction - assert inputs.shape[0] % batch_size == 0, \ - "inputs is not evenly divisible by batch_size" - - prev_batch_size = None - for b in range(batch_count): - if not test_only: - self.bn += 1 - - if gen: - batch_inputs, batch_outputs = next(generator) - batch_size = batch_inputs.shape[0] - # TODO: lift this restriction - assert batch_size == prev_batch_size or prev_batch_size is None, \ - "non-constant batch size (got {}, expected {})".format(batch_size, prev_batch_size) - else: - bi = b * batch_size - batch_inputs = inputs[ bi:bi+batch_size] - batch_outputs = outputs[bi:bi+batch_size] - - if clear_grad: - self.model.clear_grad() - self._train_batch(batch_inputs, batch_outputs, b, batch_count, - test_only, return_losses=='both', return_losses) - - prev_batch_size = batch_size - - avg_mloss = self.cumsum_mloss / _f(batch_count) - if return_losses == 'both': - avg_loss = self.cumsum_loss / _f(batch_count) - return avg_loss, avg_mloss, self.losses, self.mlosses - elif return_losses: - return avg_mloss, self.mlosses - return avg_mloss - - def test_batched(self, inputs, outputs, *args, **kwargs): - return self.train_batched(inputs, outputs, *args, - test_only=True, **kwargs) - - def train_batched_gen(self, generator, batch_count, *args, **kwargs): - return self.train_batched(generator, batch_count, *args, - shuffle=False, **kwargs) - -# Learners {{{1 - -class Learner: - per_batch = False - - def __init__(self, optim, epochs=100, rate=None): - assert isinstance(optim, Optimizer) - self.optim = optim - self.start_rate = rate # None is okay; it'll use optim.lr instead. - self.epochs = int(epochs) - self.reset() - - def reset(self, optim=False): - self.started = False - self.epoch = 0 - if optim: - self.optim.reset() - - @property - def epoch(self): - return self._epoch - - @epoch.setter - def epoch(self, new_epoch): - self._epoch = int(new_epoch) - if 0 <= self.epoch <= self.epochs: - self.rate = self.rate_at(self._epoch) - - @property - def rate(self): - return self.optim.lr - - @rate.setter - def rate(self, new_rate): - self.optim.lr = new_rate - - def rate_at(self, epoch): - if self.start_rate is None: - return self.optim.lr - return self.start_rate - - def next(self): - # prepares the next epoch. returns whether or not to continue training. 
- if not self.started: - self.started = True - self.epoch += 1 - if self.epoch > self.epochs: - return False - return True - - def batch(self, progress): # TODO: rename - # interpolates rates between epochs. - # unlike epochs, we do not store batch number as a state. - # i.e. calling next() will not respect progress. - assert 0 <= progress <= 1 - self.rate = self.rate_at(self._epoch + progress) - - @property - def final_rate(self): - return self.rate_at(self.epochs - 1e-8) - -class AnnealingLearner(Learner): - def __init__(self, optim, epochs=100, rate=None, halve_every=10): - self.halve_every = _f(halve_every) - self.anneal = _f(0.5**(1/self.halve_every)) - super().__init__(optim, epochs, rate) - - def rate_at(self, epoch): - return super().rate_at(epoch) * self.anneal**epoch - -def cosmod(x): - # plot: https://www.desmos.com/calculator/hlgqmyswy2 - return (_1 + np.cos((x % _1) * _pi)) * _inv2 - -class SGDR(Learner): - # Stochastic Gradient Descent with Restarts - # paper: https://arxiv.org/abs/1608.03983 - # NOTE: this is missing a couple of the proposed features. - - per_batch = True - - def __init__(self, optim, epochs=100, rate=None, - restarts=0, restart_decay=0.5, callback=None, - expando=0): - self.restart_epochs = int(epochs) - self.decay = _f(restart_decay) - self.restarts = int(restarts) - self.restart_callback = callback - - # TODO: rename expando to something not insane - self.expando = expando if expando is not None else lambda i: i - if type(self.expando) == int: - inc = self.expando - self.expando = lambda i: i * inc - - self.splits = [] - epochs = 0 - for i in range(0, self.restarts + 1): - split = epochs + self.restart_epochs + int(self.expando(i)) - self.splits.append(split) - epochs = split - super().__init__(optim, epochs, rate) - - def split_num(self, epoch): - previous = [0] + self.splits - for i, split in enumerate(self.splits): - if epoch - 1 < split: - sub_epoch = epoch - previous[i] - next_restart = split - previous[i] - return i, sub_epoch, next_restart - raise Exception('this should never happen.') - - def rate_at(self, epoch): - base_rate = self.start_rate if self.start_rate is not None else self.optim.lr - restart, sub_epoch, next_restart = self.split_num(max(1, epoch)) - x = _f(sub_epoch - 1) / _f(next_restart) - return base_rate * self.decay**_f(restart) * cosmod(x) - - def next(self): - if not super().next(): - return False - restart, sub_epoch, next_restart = self.split_num(self.epoch) - if restart > 0 and sub_epoch == 1: - if self.restart_callback is not None: - self.restart_callback(restart) - return True - -class TriangularCLR(Learner): - per_batch = True - - def __init__(self, optim, epochs=400, upper_rate=None, lower_rate=0, - frequency=100, callback=None): - # NOTE: start_rate is treated as upper_rate - self.frequency = int(frequency) - assert self.frequency > 0 - self.callback = callback - self.lower_rate = _f(lower_rate) - super().__init__(optim, epochs, upper_rate) - - def _t(self, epoch): - # NOTE: this could probably be simplified - offset = self.frequency / 2 - return np.abs(((epoch - 1 + offset) % self.frequency) - offset) / offset - - def rate_at(self, epoch): - upper_rate = self.start_rate if self.start_rate is not None else self.optim.lr - return self._t(epoch) * (upper_rate - self.lower_rate) + self.lower_rate - - def next(self): - if not super().next(): - return False - e = self.epoch - 1 - if e > 0 and e % self.frequency == 0: - if self.callback is not None: - self.callback(self.epoch // self.frequency) - return True - -class 
SineCLR(TriangularCLR): - def _t(self, epoch): - return np.sin(_pi * _inv2 * super()._t(epoch)) - -class WaveCLR(TriangularCLR): - def _t(self, epoch): - return _inv2 * (_1 - np.cos(_pi * super()._t(epoch)))
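-
- # note on the three CLR variants: TriangularCLR's _t() is a triangle
- # wave in [0, 1] with a period of `frequency` epochs; SineCLR warps it
- # with sin(pi/2 * t) and WaveCLR with (1 - cos(pi * t)) / 2, so all
- # three map 0 -> 0 and 1 -> 1, but SineCLR flattens the curve near the
- # upper rate and WaveCLR flattens it at both extremes.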