diff --git a/optim_nn.py b/optim_nn.py index 08e4875..a57fe95 100644 --- a/optim_nn.py +++ b/optim_nn.py @@ -3,48 +3,15 @@ # external packages required for full functionality: # numpy scipy h5py sklearn dotmap -import numpy as np -# ugly shorthand: -nf = np.float32 -nfa = lambda x: np.array(x, dtype=nf) -ni = np.int -nia = lambda x: np.array(x, dtype=ni) +from optim_nn_core import * -# just for speed, not strictly essential: -from scipy.special import expit as sigmoid +import sys +lament = lambda *args, **kwargs: print(*args, file=sys.stderr, **kwargs) -# used for numbering layers like Keras: -from collections import defaultdict -_layer_counters = defaultdict(lambda: 0) +def log(left, right): + lament("{:>20}: {}".format(left, right)) -# Initializations - -# note: these are currently only implemented for 2D shapes. - -def init_he_normal(size, ins, outs): - s = np.sqrt(2 / ins) - return np.random.normal(0, s, size=size) - -def init_he_uniform(size, ins, outs): - s = np.sqrt(6 / ins) - return np.random.uniform(-s, s, size=size) - -# Loss functions - -class Loss: - def mean(self, r): - return np.average(self.f(r)) - - def dmean(self, r): - d = self.df(r) - return d / len(d) - -class Squared(Loss): - def f(self, r): - return np.square(r) - - def df(self, r): - return 2 * r +# Loss functions {{{1 class SquaredHalved(Loss): def f(self, r): @@ -53,13 +20,6 @@ class SquaredHalved(Loss): def df(self, r): return r -class Absolute(Loss): - def f(self, r): - return np.abs(r) - - def df(self, r): - return np.sign(r) - class SomethingElse(Loss): # generalizes Absolute and SquaredHalved (|dx| = 1) # plot: https://www.desmos.com/calculator/fagjg9vuz7 @@ -75,347 +35,7 @@ class SomethingElse(Loss): def df(self, r): return np.sign(r) * np.abs(r)**self.c -# Optimizers - -class Optimizer: - def __init__(self, alpha=0.1): - self.alpha = nf(alpha) - self.reset() - - def reset(self): - pass - - def compute(self, dW, W): - return -self.alpha * dW - - def update(self, dW, W): - W += self.compute(dW, W) - -# the following optimizers are blatantly lifted from tiny-dnn: -# https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/optimizers/optimizer.h - -class Momentum(Optimizer): - def __init__(self, alpha=0.01, lamb=0, mu=0.9, nesterov=False): - self.alpha = np.asfarray(alpha) # learning rate - self.lamb = np.asfarray(lamb) # weight decay - self.mu = np.asfarray(mu) # momentum - self.nesterov = bool(nesterov) - - self.reset() - - def reset(self): - self.dWprev = None - - def compute(self, dW, W): - if self.dWprev is None: - #self.dWprev = np.zeros_like(dW) - self.dWprev = np.copy(dW) - - V = self.mu * self.dWprev - self.alpha * (dW + W * self.lamb) - self.dWprev[:] = V - if self.nesterov: # TODO: is this correct? 
looks weird - return self.mu * V - self.alpha * (dW + W * self.lamb) - else: - return V - -class Adam(Optimizer): - def __init__(self, alpha=0.001, b1=0.9, b2=0.999, b1_t=0.9, b2_t=0.999, eps=1e-8): - self.alpha = nf(alpha) # learning rate - self.b1 = nf(b1) # decay term - self.b2 = nf(b2) # decay term - self.b1_t_default = nf(b1_t) # decay term power t - self.b2_t_default = nf(b2_t) # decay term power t - self.eps = nf(eps) - - self.reset() - - def reset(self): - self.mt = None - self.vt = None - self.b1_t = self.b1_t_default - self.b2_t = self.b2_t_default - - def compute(self, dW, W): - if self.mt is None: - self.mt = np.zeros_like(W) - if self.vt is None: - self.vt = np.zeros_like(W) - - # decay - self.b1_t *= self.b1 - self.b2_t *= self.b2 - - self.mt[:] = self.b1 * self.mt + (1 - self.b1) * dW - self.vt[:] = self.b2 * self.vt + (1 - self.b2) * dW * dW - - return -self.alpha * (self.mt / (1 - self.b1_t)) \ - / np.sqrt((self.vt / (1 - self.b2_t)) + self.eps) - -# Abstract Layers - -class Layer: - def __init__(self): - self.parents = [] - self.children = [] - self.input_shape = None - self.output_shape = None - kind = self.__class__.__name__ - global _layer_counters - _layer_counters[kind] += 1 - self.name = "{}_{}".format(kind, _layer_counters[kind]) - self.size = None # total weight count (if any) - self.unsafe = False # disables assertions for better performance - - def __str__(self): - return self.name - - # methods we might want to override: - - def F(self, X): - raise NotImplementedError("unimplemented", self) - - def dF(self, dY): - raise NotImplementedError("unimplemented", self) - - def do_feed(self, child): - self.children.append(child) - - def be_fed(self, parent): - self.parents.append(parent) - - def make_shape(self, shape): - if not self.unsafe: - assert shape is not None - if self.output_shape is None: - self.output_shape = shape - return shape - - # TODO: rename this multi and B crap to something actually relevant. - - def multi(self, B): - if not self.unsafe: - assert len(B) == 1, self - return self.F(B[0]) - - def dmulti(self, dB): - if len(dB) == 1: - return self.dF(dB[0]) - else: - dX = None - for dY in dB: - if dX is None: - dX = self.dF(dY) - else: - dX += self.dF(dY) - return dX - - # general utility methods: - - def compatible(self, parent): - if self.input_shape is None: - # inherit shape from output - shape = self.make_shape(parent.output_shape) - if shape is None: - return False - self.input_shape = shape - if np.all(self.input_shape == parent.output_shape): - return True - else: - return False - - def feed(self, child): - if not child.compatible(self): - fmt = "{} is incompatible with {}: shape mismatch: {} vs. 
{}" - raise Exception(fmt.format(self, child, self.output_shape, child.input_shape)) - self.do_feed(child) - child.be_fed(self) - return child - - def validate_input(self, X): - assert X.shape[1:] == self.input_shape, (str(self), X.shape[1:], self.input_shape) - - def validate_output(self, Y): - assert Y.shape[1:] == self.output_shape, (str(self), Y.shape[1:], self.output_shape) - - def forward(self, lut): - if not self.unsafe: - assert len(self.parents) > 0, self - B = [] - for parent in self.parents: - # TODO: skip over irrelevant nodes (if any) - X = lut[parent] - if not self.unsafe: - self.validate_input(X) - B.append(X) - Y = self.multi(B) - if not self.unsafe: - self.validate_output(Y) - return Y - - def backward(self, lut): - if not self.unsafe: - assert len(self.children) > 0, self - dB = [] - for child in self.children: - # TODO: skip over irrelevant nodes (if any) - dY = lut[child] - if not self.unsafe: - self.validate_output(dY) - dB.append(dY) - dX = self.dmulti(dB) - if not self.unsafe: - self.validate_input(dX) - return dX - -# Final Layers - -class Sum(Layer): - def multi(self, B): - return np.sum(B, axis=0) - - def dmulti(self, dB): - #assert len(dB) == 1, "unimplemented" - return dB[0] # TODO: does this always work? - -class Input(Layer): - def __init__(self, shape): - assert shape is not None - super().__init__() - self.shape = tuple(shape) - self.input_shape = self.shape - self.output_shape = self.shape - - def F(self, X): - return X - - def dF(self, dY): - #self.dY = dY - return np.zeros_like(dY) - -class Affine(Layer): - def __init__(self, a=1, b=0): - super().__init__() - self.a = nf(a) - self.b = nf(b) - - def F(self, X): - return self.a * X + self.b - - def dF(self, dY): - return dY * self.a - -class Sigmoid(Layer): # aka Logistic - def F(self, X): - self.sig = sigmoid(X) - return X * self.sig - - def dF(self, dY): - return dY * self.sig * (1 - self.sig) - -class Tanh(Layer): - def F(self, X): - self.sig = np.tanh(X) - return X * self.sig - - def dF(self, dY): - return dY * (1 - self.sig * self.sig) - -class Relu(Layer): - def F(self, X): - self.cond = X >= 0 - return np.where(self.cond, X, 0) - - def dF(self, dY): - return np.where(self.cond, dY, 0) - -class Elu(Layer): - # paper: https://arxiv.org/abs/1511.07289 - - def __init__(self, alpha=1): - super().__init__() - self.alpha = nf(alpha) - - def F(self, X): - self.cond = X >= 0 - self.neg = np.exp(X) - 1 - return np.where(self.cond, X, self.neg) - - def dF(self, dY): - return dY * np.where(self.cond, 1, self.neg + 1) - -class GeluApprox(Layer): - # paper: https://arxiv.org/abs/1606.08415 - # plot: https://www.desmos.com/calculator/ydzgtccsld - - def F(self, X): - self.a = 1.704 * X - self.sig = sigmoid(self.a) - return X * self.sig - - def dF(self, dY): - return dY * self.sig * (1 + self.a * (1 - self.sig)) - -class Dense(Layer): - def __init__(self, dim, init=init_he_uniform): - super().__init__() - self.dim = ni(dim) - self.output_shape = (dim,) - self.weight_init = init - self.size = None - - def make_shape(self, shape): - super().make_shape(shape) - if len(shape) != 1: - return False - self.nW = self.dim * shape[0] - self.nb = self.dim - self.size = self.nW + self.nb - return shape - - def init(self, W, dW): - ins, outs = self.input_shape[0], self.output_shape[0] - - self.W = W - self.dW = dW - self.coeffs = self.W[:self.nW].reshape(ins, outs) - self.biases = self.W[self.nW:].reshape(1, outs) - self.dcoeffs = self.dW[:self.nW].reshape(ins, outs) - self.dbiases = self.dW[self.nW:].reshape(1, outs) - - 
self.coeffs.flat = self.weight_init(self.nW, ins, outs) - self.biases.flat = 0 - - self.std = np.std(self.W) - - def F(self, X): - self.X = X - Y = X.dot(self.coeffs) + self.biases - return Y - - def dF(self, dY): - dX = dY.dot(self.coeffs.T) - self.dcoeffs[:] = self.X.T.dot(dY) - self.dbiases[:] = dY.sum(0, keepdims=True) - return dX - -class DenseOneLess(Dense): - def init(self, W, dW): - super().init(W, dW) - ins, outs = self.input_shape[0], self.output_shape[0] - assert ins == outs, (ins, outs) - - def F(self, X): - np.fill_diagonal(self.coeffs, 0) - self.X = X - Y = X.dot(self.coeffs) + self.biases - return Y - - def dF(self, dY): - dX = dY.dot(self.coeffs.T) - self.dcoeffs[:] = self.X.T.dot(dY) - self.dbiases[:] = dY.sum(0, keepdims=True) - np.fill_diagonal(self.dcoeffs, 0) - return dX +# Parametric Layers {{{1 class LayerNorm(Layer): # paper: https://arxiv.org/abs/1607.06450 @@ -446,161 +66,7 @@ class LayerNorm(Layer): return dX -# Model - -class Model: - def __init__(self, x, y, unsafe=False): - assert isinstance(x, Layer), x - assert isinstance(y, Layer), y - self.x = x - self.y = y - self.ordered_nodes = self.traverse([], self.y) - self.make_weights() - for node in self.ordered_nodes: - node.unsafe = unsafe - - def make_weights(self): - self.param_count = 0 - for node in self.ordered_nodes: - if node.size is not None: - self.param_count += node.size - self.W = np.zeros(self.param_count, dtype=nf) - self.dW = np.zeros(self.param_count, dtype=nf) - - offset = 0 - for node in self.ordered_nodes: - if node.size is not None: - end = offset + node.size - node.init(self.W[offset:end], self.dW[offset:end]) - offset += node.size - - def traverse(self, nodes, node): - if node == self.x: - return [node] - for parent in node.parents: - if parent not in nodes: - new_nodes = self.traverse(nodes, parent) - for new_node in new_nodes: - if new_node not in nodes: - nodes.append(new_node) - if nodes: - nodes.append(node) - return nodes - - def forward(self, X): - lut = dict() - input_node = self.ordered_nodes[0] - output_node = self.ordered_nodes[-1] - lut[input_node] = input_node.multi(np.expand_dims(X, 0)) - for node in self.ordered_nodes[1:]: - lut[node] = node.forward(lut) - return lut[output_node] - - def backward(self, error): - lut = dict() - input_node = self.ordered_nodes[0] - output_node = self.ordered_nodes[-1] - lut[output_node] = output_node.dmulti(np.expand_dims(error, 0)) - for node in reversed(self.ordered_nodes[:-1]): - lut[node] = node.backward(lut) - #return lut[input_node] # meaningless value - return self.dW - - def load_weights(self, fn): - # seemingly compatible with keras' Dense layers. - # ignores any non-Dense layer types. 
- # TODO: assert file actually exists - import h5py - f = h5py.File(fn) - weights = {} - def visitor(name, obj): - if isinstance(obj, h5py.Dataset): - weights[name.split('/')[-1]] = nfa(obj[:]) - f.visititems(visitor) - f.close() - - denses = [node for node in self.ordered_nodes if isinstance(node, Dense)] - for i in range(len(denses)): - a, b = i, i + 1 - b_name = "dense_{}".format(b) - # TODO: write a Dense method instead of assigning directly - denses[a].coeffs[:] = weights[b_name+'_W'] - denses[a].biases[:] = np.expand_dims(weights[b_name+'_b'], 0) - - def save_weights(self, fn, overwrite=False): - import h5py - f = h5py.File(fn, 'w') - - denses = [node for node in self.ordered_nodes if isinstance(node, Dense)] - for i in range(len(denses)): - a, b = i, i + 1 - b_name = "dense_{}".format(b) - # TODO: write a Dense method instead of assigning directly - grp = f.create_group(b_name) - data = grp.create_dataset(b_name+'_W', denses[a].coeffs.shape, dtype=nf) - data[:] = denses[a].coeffs - data = grp.create_dataset(b_name+'_b', denses[a].biases.shape, dtype=nf) - data[:] = denses[a].biases - - f.close() - -class Ritual: # i'm just making up names at this point - def __init__(self, learner=None, loss=None, mloss=None): - self.learner = learner if learner is not None else Learner(Optimizer()) - self.loss = loss if loss is not None else Squared() - self.mloss = mloss if mloss is not None else loss - - def reset(self): - self.learner.reset(optim=True) - - def measure(self, residual): - return self.mloss.mean(residual) - - def derive(self, residual): - return self.loss.dmean(residual) - - def learn(self, inputs, outputs): - predicted = self.model.forward(inputs) - residual = predicted - outputs - self.model.backward(self.derive(residual)) - return residual - - def update(self): - self.learner.optim.update(self.model.dW, self.model.W) - - def prepare(self, model): - self.en = 0 - self.bn = 0 - self.model = model - - def train_batched(self, inputs, outputs, batch_size, return_losses=False): - self.en += 1 - cumsum_loss = 0 - batch_count = inputs.shape[0] // batch_size - losses = [] - for b in range(batch_count): - self.bn += 1 - bi = b * batch_size - batch_inputs = inputs[ bi:bi+batch_size] - batch_outputs = outputs[bi:bi+batch_size] - - if self.learner.per_batch: - self.learner.batch(b / batch_count) - - residual = self.learn(batch_inputs, batch_outputs) - self.update() - - batch_loss = self.measure(residual) - if np.isnan(batch_loss): - raise Exception("nan") - cumsum_loss += batch_loss - if return_losses: - losses.append(batch_loss) - avg_loss = cumsum_loss / batch_count - if return_losses: - return avg_loss, losses - else: - return avg_loss +# Rituals {{{1 def stochastic_multiply(W, gamma=0.5, allow_negation=True): # paper: https://arxiv.org/abs/1606.01981 @@ -681,72 +147,7 @@ class NoisyRitual(Ritual): self.model.dW += np.random.normal(0, s, size=size) super().update() -class Learner: - per_batch = False - - def __init__(self, optim, epochs=100, rate=None): - assert isinstance(optim, Optimizer) - self.optim = optim - self.start_rate = optim.alpha if rate is None else float(rate) - self.epochs = int(epochs) - self.reset() - - def reset(self, optim=False): - self.started = False - self.epoch = 0 - if optim: - self.optim.reset() - - @property - def epoch(self): - return self._epoch - - @epoch.setter - def epoch(self, new_epoch): - self._epoch = int(new_epoch) - self.rate = self.rate_at(self._epoch) - - @property - def rate(self): - return self.optim.alpha - - @rate.setter - def rate(self, 
new_rate): - self.optim.alpha = new_rate - - def rate_at(self, epoch): - return self.start_rate - - def next(self): - # prepares the next epoch. returns whether or not to continue training. - if self.epoch + 1 >= self.epochs: - return False - if self.started: - self.epoch += 1 - else: - self.started = True - self.epoch = self.epoch # poke property setter just in case - return True - - def batch(self, progress): # TODO: rename - # interpolates rates between epochs. - # unlike epochs, we do not store batch number as a state. - # i.e. calling next() will not respect progress. - assert 0 <= progress <= 1 - self.rate = self.rate_at(self._epoch + progress) - - @property - def final_rate(self): - return self.rate_at(self.epochs - 1) - -class AnnealingLearner(Learner): - def __init__(self, optim, epochs=100, rate=None, halve_every=10): - self.halve_every = float(halve_every) - self.anneal = 0.5**(1/self.halve_every) - super().__init__(optim, epochs, rate) - - def rate_at(self, epoch): - return self.start_rate * self.anneal**epoch +# Learners {{{1 class DumbLearner(AnnealingLearner): # this is my own awful contraption. it's not really "SGD with restarts". @@ -773,58 +174,6 @@ class DumbLearner(AnnealingLearner): self.restart_callback(restart) return True -def cosmod(x): - # plot: https://www.desmos.com/calculator/hlgqmyswy2 - return (1 + np.cos((x % 1) * np.pi)) / 2 - -class SGDR(Learner): - # Stochastic Gradient Descent with Restarts - # paper: https://arxiv.org/abs/1608.03983 - # NOTE: this is missing a couple features. - - per_batch = True - - def __init__(self, optim, epochs=100, rate=None, - restarts=0, restart_decay=0.5, callback=None, - expando=None): - self.restart_epochs = int(epochs) - self.decay = float(restart_decay) - self.restarts = int(restarts) - self.restart_callback = callback - # TODO: rename expando to something not insane - self.expando = expando if expando is not None else lambda i: 1 - - self.splits = [] - epochs = 0 - for i in range(0, self.restarts + 1): - split = epochs + int(self.restart_epochs * self.expando(i)) - self.splits.append(split) - epochs = split - super().__init__(optim, epochs, rate) - - def split_num(self, epoch): - shit = [0] + self.splits # hack - for i in range(0, len(self.splits)): - if epoch < self.splits[i]: - sub_epoch = epoch - shit[i] - next_restart = self.splits[i] - shit[i] - return i, sub_epoch, next_restart - raise Exception('this should never happen.') - - def rate_at(self, epoch): - restart, sub_epoch, next_restart = self.split_num(epoch) - x = sub_epoch / next_restart - return self.start_rate * self.decay**restart * cosmod(x) - - def next(self): - if not super().next(): - return False - restart, sub_epoch, next_restart = self.split_num(self.epoch) - if restart > 0 and sub_epoch == 0: - if self.restart_callback is not None: - self.restart_callback(restart) - return True - def multiresnet(x, width, depth, block=2, multi=1, activation=Relu, style='batchless', init=init_he_normal): @@ -882,6 +231,8 @@ def multiresnet(x, width, depth, block=2, multi=1, return y +# etc. 
{{{1 + inits = dict(he_normal=init_he_normal, he_uniform=init_he_uniform) activations = dict(sigmoid=Sigmoid, tanh=Tanh, relu=Relu, elu=Elu, gelu=GeluApprox) @@ -953,74 +304,7 @@ def toy_data(train_samples, valid_samples, problem=2): return (inputs, outputs), (valid_inputs, valid_outputs) -def run(program, args=[]): - import sys - lament = lambda *args, **kwargs: print(*args, file=sys.stderr, **kwargs) - def log(left, right): - lament("{:>20}: {}".format(left, right)) - - # Config - - from dotmap import DotMap - config = DotMap( - fn_load = None, - fn_save = 'optim_nn.h5', - log_fn = 'losses.npz', - - # multi-residual network parameters - res_width = 28, - res_depth = 2, - res_block = 3, # normally 2 for plain resnet - res_multi = 2, # normally 1 for plain resnet - - # style of resnet (order of layers, which layers, etc.) - parallel_style = 'onelesssum', - activation = 'gelu', - - optim = 'adam', - nesterov = False, # only used with SGD or Adam - momentum = 0.50, # only used with SGD - batch_size = 64, - - # learning parameters - learner = 'sgdr', - learn = 1e-2, - learn_halve_every = 16, # unused with SGDR - learn_restart_advance = 16, # unused with SGDR - epochs = 24, - restarts = 2, - restart_decay = 0.25, # only used with SGDR - expando = lambda i: i + 1, - - # misc - init = 'he_normal', - loss = 'mse', - mloss = 'mse', - ritual = 'default', - restart_optim = False, # restarts also reset internal state of optimizer - - problem = 3, - # best results for ~10,000 parameters - # (keep these paired; update both at the same time!) - train_compare = 1.854613e-05, - valid_compare = 1.623881e-05, - - unsafe = True, # aka gotta go fast mode - ) - - for k in ['parallel_style', 'activation', 'optim', 'learner', 'init', 'loss', 'mloss', 'ritual']: - config[k] = config[k].lower() - - config.pprint() - - # toy data - # (our model is probably complete overkill for this, so TODO: better data) - - (inputs, outputs), (valid_inputs, valid_outputs) = \ - toy_data(2**14, 2**11, problem=config.problem) - input_features = inputs.shape[-1] - output_features = outputs.shape[-1] - +def model_from_config(config, input_features, output_features): # Our Test Model init = inits[config.init] @@ -1038,17 +322,6 @@ def run(program, args=[]): model = Model(x, y, unsafe=config.unsafe) - if 0: - node_names = ' '.join([str(node) for node in model.ordered_nodes]) - log('{} nodes'.format(len(model.ordered_nodes)), node_names) - else: - for node in model.ordered_nodes: - children = [str(n) for n in node.children] - if len(children) > 0: - sep = '->' - print(str(node)+sep+('\n'+str(node)+sep).join(children)) - log('parameters', model.param_count) - # training = config.epochs > 0 and config.restarts >= 0 @@ -1129,6 +402,92 @@ def run(program, args=[]): else: raise Exception('unknown ritual', config.ritual) + # + + return model, learner, ritual, (loss, mloss) + +# main {{{1 + +def run(program, args=[]): + + # Config + + from dotmap import DotMap + config = DotMap( + fn_load = None, + fn_save = 'optim_nn.h5', + log_fn = 'losses.npz', + + # multi-residual network parameters + res_width = 28, + res_depth = 2, + res_block = 3, # normally 2 for plain resnet + res_multi = 2, # normally 1 for plain resnet + + # style of resnet (order of layers, which layers, etc.) 
+ parallel_style = 'onelesssum', + activation = 'gelu', + + optim = 'adam', + nesterov = False, # only used with SGD or Adam + momentum = 0.50, # only used with SGD + batch_size = 64, + + # learning parameters + learner = 'sgdr', + learn = 1e-2, + learn_halve_every = 16, # unused with SGDR + learn_restart_advance = 16, # unused with SGDR + epochs = 24, + restarts = 2, + restart_decay = 0.25, # only used with SGDR + expando = lambda i: i + 1, + + # misc + init = 'he_normal', + loss = 'mse', + mloss = 'mse', + ritual = 'default', + restart_optim = False, # restarts also reset internal state of optimizer + + problem = 3, + # best results for ~10,000 parameters + # (keep these paired; update both at the same time!) + train_compare = 1.854613e-05, + valid_compare = 1.623881e-05, + + unsafe = True, # aka gotta go fast mode + ) + + for k in ['parallel_style', 'activation', 'optim', 'learner', 'init', 'loss', 'mloss', 'ritual']: + config[k] = config[k].lower() + + config.pprint() + + # toy data + # (our model is probably complete overkill for this, so TODO: better data) + + (inputs, outputs), (valid_inputs, valid_outputs) = \ + toy_data(2**14, 2**11, problem=config.problem) + input_features = inputs.shape[-1] + output_features = outputs.shape[-1] + + model, learner, ritual, (loss, mloss) = \ + model_from_config(config, input_features, output_features) + + # Model Information + + if 0: + node_names = ' '.join([str(node) for node in model.ordered_nodes]) + log('{} nodes'.format(len(model.ordered_nodes)), node_names) + else: + for node in model.ordered_nodes: + children = [str(n) for n in node.children] + if len(children) > 0: + sep = '->' + print(str(node)+sep+('\n'+str(node)+sep).join(children)) + log('parameters', model.param_count) + # Training batch_losses = [] @@ -1188,6 +547,7 @@ def run(program, args=[]): # TODO: write this portion again if config.log_fn is not None: + log('saving losses', config.log_fn) np.savez_compressed(config.log_fn, batch_losses=nfa(batch_losses), train_losses=nfa(train_losses), diff --git a/optim_nn_core.py b/optim_nn_core.py new file mode 100644 index 0000000..27c61c1 --- /dev/null +++ b/optim_nn_core.py @@ -0,0 +1,672 @@ +import numpy as np +# ugly shorthand: +nf = np.float32 +nfa = lambda x: np.array(x, dtype=nf) +ni = np.int +nia = lambda x: np.array(x, dtype=ni) + +# just for speed, not strictly essential: +from scipy.special import expit as sigmoid + +# used for numbering layers like Keras: +from collections import defaultdict +_layer_counters = defaultdict(lambda: 0) + +# Initializations {{{1 + +# note: these are currently only implemented for 2D shapes. 
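# Usage sketch for the two initializers defined just below (illustration
# only, not a hunk of this patch; assumes optim_nn_core is importable):
# Dense passes a flat element count as `size` and reshapes the result, so
# only the spread of the samples matters here.
import numpy as np
from optim_nn_core import init_he_normal, init_he_uniform

ins, outs = 64, 32
W = init_he_normal(ins * outs, ins, outs).reshape(ins, outs)
print(W.std(), np.sqrt(2 / ins))   # sample std should land near sqrt(2/ins)

U = init_he_uniform(ins * outs, ins, outs)
print(U.min(), U.max())            # bounded by +/- sqrt(6/ins)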
+ +def init_he_normal(size, ins, outs): + s = np.sqrt(2 / ins) + return np.random.normal(0, s, size=size) + +def init_he_uniform(size, ins, outs): + s = np.sqrt(6 / ins) + return np.random.uniform(-s, s, size=size) + +# Loss functions {{{1 + +class Loss: + def mean(self, r): + return np.average(self.f(r)) + + def dmean(self, r): + d = self.df(r) + return d / len(d) + +class Squared(Loss): + def f(self, r): + return np.square(r) + + def df(self, r): + return 2 * r + +class Absolute(Loss): + def f(self, r): + return np.abs(r) + + def df(self, r): + return np.sign(r) + +# Optimizers {{{1 + +class Optimizer: + def __init__(self, alpha=0.1): + self.alpha = nf(alpha) + self.reset() + + def reset(self): + pass + + def compute(self, dW, W): + return -self.alpha * dW + + def update(self, dW, W): + W += self.compute(dW, W) + +# the following optimizers are blatantly lifted from tiny-dnn: +# https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/optimizers/optimizer.h + +class Momentum(Optimizer): + def __init__(self, alpha=0.01, lamb=0, mu=0.9, nesterov=False): + self.alpha = np.asfarray(alpha) # learning rate + self.lamb = np.asfarray(lamb) # weight decay + self.mu = np.asfarray(mu) # momentum + self.nesterov = bool(nesterov) + + self.reset() + + def reset(self): + self.dWprev = None + + def compute(self, dW, W): + if self.dWprev is None: + #self.dWprev = np.zeros_like(dW) + self.dWprev = np.copy(dW) + + V = self.mu * self.dWprev - self.alpha * (dW + W * self.lamb) + self.dWprev[:] = V + if self.nesterov: # TODO: is this correct? looks weird + return self.mu * V - self.alpha * (dW + W * self.lamb) + else: + return V + +class Adam(Optimizer): + def __init__(self, alpha=0.001, b1=0.9, b2=0.999, b1_t=0.9, b2_t=0.999, eps=1e-8): + self.alpha = nf(alpha) # learning rate + self.b1 = nf(b1) # decay term + self.b2 = nf(b2) # decay term + self.b1_t_default = nf(b1_t) # decay term power t + self.b2_t_default = nf(b2_t) # decay term power t + self.eps = nf(eps) + + self.reset() + + def reset(self): + self.mt = None + self.vt = None + self.b1_t = self.b1_t_default + self.b2_t = self.b2_t_default + + def compute(self, dW, W): + if self.mt is None: + self.mt = np.zeros_like(W) + if self.vt is None: + self.vt = np.zeros_like(W) + + # decay + self.b1_t *= self.b1 + self.b2_t *= self.b2 + + self.mt[:] = self.b1 * self.mt + (1 - self.b1) * dW + self.vt[:] = self.b2 * self.vt + (1 - self.b2) * dW * dW + + return -self.alpha * (self.mt / (1 - self.b1_t)) \ + / np.sqrt((self.vt / (1 - self.b2_t)) + self.eps) + +# Abstract Layers {{{1 + +class Layer: + def __init__(self): + self.parents = [] + self.children = [] + self.input_shape = None + self.output_shape = None + kind = self.__class__.__name__ + global _layer_counters + _layer_counters[kind] += 1 + self.name = "{}_{}".format(kind, _layer_counters[kind]) + self.size = None # total weight count (if any) + self.unsafe = False # disables assertions for better performance + + def __str__(self): + return self.name + + # methods we might want to override: + + def F(self, X): + raise NotImplementedError("unimplemented", self) + + def dF(self, dY): + raise NotImplementedError("unimplemented", self) + + def do_feed(self, child): + self.children.append(child) + + def be_fed(self, parent): + self.parents.append(parent) + + def make_shape(self, shape): + if not self.unsafe: + assert shape is not None + if self.output_shape is None: + self.output_shape = shape + return shape + + # TODO: rename this multi and B crap to something actually relevant. 
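# (clarifying note, not part of the patch) `B` is the list of forward
# outputs of this layer's parents, in parent order; `multi` folds that list
# into this layer's single output. `dmulti` mirrors it for the backward
# pass: it receives one gradient per child and combines them into the
# gradient with respect to this layer's input.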
+ + def multi(self, B): + if not self.unsafe: + assert len(B) == 1, self + return self.F(B[0]) + + def dmulti(self, dB): + if len(dB) == 1: + return self.dF(dB[0]) + else: + dX = None + for dY in dB: + if dX is None: + dX = self.dF(dY) + else: + dX += self.dF(dY) + return dX + + # general utility methods: + + def compatible(self, parent): + if self.input_shape is None: + # inherit shape from output + shape = self.make_shape(parent.output_shape) + if shape is None: + return False + self.input_shape = shape + if np.all(self.input_shape == parent.output_shape): + return True + else: + return False + + def feed(self, child): + if not child.compatible(self): + fmt = "{} is incompatible with {}: shape mismatch: {} vs. {}" + raise Exception(fmt.format(self, child, self.output_shape, child.input_shape)) + self.do_feed(child) + child.be_fed(self) + return child + + def validate_input(self, X): + assert X.shape[1:] == self.input_shape, (str(self), X.shape[1:], self.input_shape) + + def validate_output(self, Y): + assert Y.shape[1:] == self.output_shape, (str(self), Y.shape[1:], self.output_shape) + + def forward(self, lut): + if not self.unsafe: + assert len(self.parents) > 0, self + B = [] + for parent in self.parents: + # TODO: skip over irrelevant nodes (if any) + X = lut[parent] + if not self.unsafe: + self.validate_input(X) + B.append(X) + Y = self.multi(B) + if not self.unsafe: + self.validate_output(Y) + return Y + + def backward(self, lut): + if not self.unsafe: + assert len(self.children) > 0, self + dB = [] + for child in self.children: + # TODO: skip over irrelevant nodes (if any) + dY = lut[child] + if not self.unsafe: + self.validate_output(dY) + dB.append(dY) + dX = self.dmulti(dB) + if not self.unsafe: + self.validate_input(dX) + return dX + +# Nonparametric Layers {{{1 + +class Sum(Layer): + def multi(self, B): + return np.sum(B, axis=0) + + def dmulti(self, dB): + #assert len(dB) == 1, "unimplemented" + return dB[0] # TODO: does this always work? 
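# A tiny wiring sketch for the graph API above (sizes and values made up;
# assumes optim_nn_core is importable -- commentary, not a hunk of the
# patch). feed() returns the child, so graphs read as chains; Sum adds its
# parents' outputs element-wise. On the way back, d(X1 + X2)/dXi is the
# identity, so every parent can be handed the same upstream gradient; the
# dB[0] above is only the full story when the Sum node has a single child,
# otherwise the child gradients would need to be added together first.
import numpy as np
from optim_nn_core import Input, Dense, Sum, Model, nf

x = Input(shape=(8,))
a = x.feed(Dense(8))
b = x.feed(Dense(8))
y = Sum()
a.feed(y)
b.feed(y)
model = Model(x, y)

out = model.forward(np.random.uniform(-1, 1, (4, 8)).astype(nf))
print(out.shape)  # (4, 8): the two Dense branches were summed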
+ +class Input(Layer): + def __init__(self, shape): + assert shape is not None + super().__init__() + self.shape = tuple(shape) + self.input_shape = self.shape + self.output_shape = self.shape + + def F(self, X): + return X + + def dF(self, dY): + #self.dY = dY + return np.zeros_like(dY) + +class Affine(Layer): + def __init__(self, a=1, b=0): + super().__init__() + self.a = nf(a) + self.b = nf(b) + + def F(self, X): + return self.a * X + self.b + + def dF(self, dY): + return dY * self.a + +class Sigmoid(Layer): # aka Logistic + def F(self, X): + self.sig = sigmoid(X) + return X * self.sig + + def dF(self, dY): + return dY * self.sig * (1 - self.sig) + +class Tanh(Layer): + def F(self, X): + self.sig = np.tanh(X) + return X * self.sig + + def dF(self, dY): + return dY * (1 - self.sig * self.sig) + +class Relu(Layer): + def F(self, X): + self.cond = X >= 0 + return np.where(self.cond, X, 0) + + def dF(self, dY): + return np.where(self.cond, dY, 0) + +class Elu(Layer): + # paper: https://arxiv.org/abs/1511.07289 + + def __init__(self, alpha=1): + super().__init__() + self.alpha = nf(alpha) + + def F(self, X): + self.cond = X >= 0 + self.neg = np.exp(X) - 1 + return np.where(self.cond, X, self.neg) + + def dF(self, dY): + return dY * np.where(self.cond, 1, self.neg + 1) + +class GeluApprox(Layer): + # paper: https://arxiv.org/abs/1606.08415 + # plot: https://www.desmos.com/calculator/ydzgtccsld + + def F(self, X): + self.a = 1.704 * X + self.sig = sigmoid(self.a) + return X * self.sig + + def dF(self, dY): + return dY * self.sig * (1 + self.a * (1 - self.sig)) + +# Parametric Layers {{{1 + +class Dense(Layer): + def __init__(self, dim, init=init_he_uniform): + super().__init__() + self.dim = ni(dim) + self.output_shape = (dim,) + self.weight_init = init + self.size = None + + def make_shape(self, shape): + super().make_shape(shape) + if len(shape) != 1: + return False + self.nW = self.dim * shape[0] + self.nb = self.dim + self.size = self.nW + self.nb + return shape + + def init(self, W, dW): + ins, outs = self.input_shape[0], self.output_shape[0] + + self.W = W + self.dW = dW + self.coeffs = self.W[:self.nW].reshape(ins, outs) + self.biases = self.W[self.nW:].reshape(1, outs) + self.dcoeffs = self.dW[:self.nW].reshape(ins, outs) + self.dbiases = self.dW[self.nW:].reshape(1, outs) + + self.coeffs.flat = self.weight_init(self.nW, ins, outs) + self.biases.flat = 0 + + self.std = np.std(self.W) + + def F(self, X): + self.X = X + Y = X.dot(self.coeffs) + self.biases + return Y + + def dF(self, dY): + dX = dY.dot(self.coeffs.T) + self.dcoeffs[:] = self.X.T.dot(dY) + self.dbiases[:] = dY.sum(0, keepdims=True) + return dX + +class DenseOneLess(Dense): + def init(self, W, dW): + super().init(W, dW) + ins, outs = self.input_shape[0], self.output_shape[0] + assert ins == outs, (ins, outs) + + def F(self, X): + np.fill_diagonal(self.coeffs, 0) + self.X = X + Y = X.dot(self.coeffs) + self.biases + return Y + + def dF(self, dY): + dX = dY.dot(self.coeffs.T) + self.dcoeffs[:] = self.X.T.dot(dY) + self.dbiases[:] = dY.sum(0, keepdims=True) + np.fill_diagonal(self.dcoeffs, 0) + return dX + +# Models {{{1 + +class Model: + def __init__(self, x, y, unsafe=False): + assert isinstance(x, Layer), x + assert isinstance(y, Layer), y + self.x = x + self.y = y + self.ordered_nodes = self.traverse([], self.y) + self.make_weights() + for node in self.ordered_nodes: + node.unsafe = unsafe + + def make_weights(self): + self.param_count = 0 + for node in self.ordered_nodes: + if node.size is not None: + self.param_count += 
node.size + self.W = np.zeros(self.param_count, dtype=nf) + self.dW = np.zeros(self.param_count, dtype=nf) + + offset = 0 + for node in self.ordered_nodes: + if node.size is not None: + end = offset + node.size + node.init(self.W[offset:end], self.dW[offset:end]) + offset += node.size + + def traverse(self, nodes, node): + if node == self.x: + return [node] + for parent in node.parents: + if parent not in nodes: + new_nodes = self.traverse(nodes, parent) + for new_node in new_nodes: + if new_node not in nodes: + nodes.append(new_node) + if nodes: + nodes.append(node) + return nodes + + def forward(self, X): + lut = dict() + input_node = self.ordered_nodes[0] + output_node = self.ordered_nodes[-1] + lut[input_node] = input_node.multi(np.expand_dims(X, 0)) + for node in self.ordered_nodes[1:]: + lut[node] = node.forward(lut) + return lut[output_node] + + def backward(self, error): + lut = dict() + input_node = self.ordered_nodes[0] + output_node = self.ordered_nodes[-1] + lut[output_node] = output_node.dmulti(np.expand_dims(error, 0)) + for node in reversed(self.ordered_nodes[:-1]): + lut[node] = node.backward(lut) + #return lut[input_node] # meaningless value + return self.dW + + def load_weights(self, fn): + # seemingly compatible with keras' Dense layers. + # ignores any non-Dense layer types. + # TODO: assert file actually exists + import h5py + f = h5py.File(fn) + weights = {} + def visitor(name, obj): + if isinstance(obj, h5py.Dataset): + weights[name.split('/')[-1]] = nfa(obj[:]) + f.visititems(visitor) + f.close() + + denses = [node for node in self.ordered_nodes if isinstance(node, Dense)] + for i in range(len(denses)): + a, b = i, i + 1 + b_name = "dense_{}".format(b) + # TODO: write a Dense method instead of assigning directly + denses[a].coeffs[:] = weights[b_name+'_W'] + denses[a].biases[:] = np.expand_dims(weights[b_name+'_b'], 0) + + def save_weights(self, fn, overwrite=False): + import h5py + f = h5py.File(fn, 'w') + + denses = [node for node in self.ordered_nodes if isinstance(node, Dense)] + for i in range(len(denses)): + a, b = i, i + 1 + b_name = "dense_{}".format(b) + # TODO: write a Dense method instead of assigning directly + grp = f.create_group(b_name) + data = grp.create_dataset(b_name+'_W', denses[a].coeffs.shape, dtype=nf) + data[:] = denses[a].coeffs + data = grp.create_dataset(b_name+'_b', denses[a].biases.shape, dtype=nf) + data[:] = denses[a].biases + + f.close() + +# Rituals {{{1 + +class Ritual: # i'm just making up names at this point + def __init__(self, learner=None, loss=None, mloss=None): + self.learner = learner if learner is not None else Learner(Optimizer()) + self.loss = loss if loss is not None else Squared() + self.mloss = mloss if mloss is not None else loss + + def reset(self): + self.learner.reset(optim=True) + + def measure(self, residual): + return self.mloss.mean(residual) + + def derive(self, residual): + return self.loss.dmean(residual) + + def learn(self, inputs, outputs): + predicted = self.model.forward(inputs) + residual = predicted - outputs + self.model.backward(self.derive(residual)) + return residual + + def update(self): + self.learner.optim.update(self.model.dW, self.model.W) + + def prepare(self, model): + self.en = 0 + self.bn = 0 + self.model = model + + def train_batched(self, inputs, outputs, batch_size, return_losses=False): + self.en += 1 + cumsum_loss = 0 + batch_count = inputs.shape[0] // batch_size + losses = [] + for b in range(batch_count): + self.bn += 1 + bi = b * batch_size + batch_inputs = inputs[ 
bi:bi+batch_size] + batch_outputs = outputs[bi:bi+batch_size] + + if self.learner.per_batch: + self.learner.batch(b / batch_count) + + residual = self.learn(batch_inputs, batch_outputs) + self.update() + + batch_loss = self.measure(residual) + if np.isnan(batch_loss): + raise Exception("nan") + cumsum_loss += batch_loss + if return_losses: + losses.append(batch_loss) + avg_loss = cumsum_loss / batch_count + if return_losses: + return avg_loss, losses + else: + return avg_loss + +# Learners {{{1 + +class Learner: + per_batch = False + + def __init__(self, optim, epochs=100, rate=None): + assert isinstance(optim, Optimizer) + self.optim = optim + self.start_rate = optim.alpha if rate is None else float(rate) + self.epochs = int(epochs) + self.reset() + + def reset(self, optim=False): + self.started = False + self.epoch = 0 + if optim: + self.optim.reset() + + @property + def epoch(self): + return self._epoch + + @epoch.setter + def epoch(self, new_epoch): + self._epoch = int(new_epoch) + self.rate = self.rate_at(self._epoch) + + @property + def rate(self): + return self.optim.alpha + + @rate.setter + def rate(self, new_rate): + self.optim.alpha = new_rate + + def rate_at(self, epoch): + return self.start_rate + + def next(self): + # prepares the next epoch. returns whether or not to continue training. + if self.epoch + 1 >= self.epochs: + return False + if self.started: + self.epoch += 1 + else: + self.started = True + self.epoch = self.epoch # poke property setter just in case + return True + + def batch(self, progress): # TODO: rename + # interpolates rates between epochs. + # unlike epochs, we do not store batch number as a state. + # i.e. calling next() will not respect progress. + assert 0 <= progress <= 1 + self.rate = self.rate_at(self._epoch + progress) + + @property + def final_rate(self): + return self.rate_at(self.epochs - 1) + +class AnnealingLearner(Learner): + def __init__(self, optim, epochs=100, rate=None, halve_every=10): + self.halve_every = float(halve_every) + self.anneal = 0.5**(1/self.halve_every) + super().__init__(optim, epochs, rate) + + def rate_at(self, epoch): + return self.start_rate * self.anneal**epoch + +def cosmod(x): + # plot: https://www.desmos.com/calculator/hlgqmyswy2 + return (1 + np.cos((x % 1) * np.pi)) / 2 + +class SGDR(Learner): + # Stochastic Gradient Descent with Restarts + # paper: https://arxiv.org/abs/1608.03983 + # NOTE: this is missing a couple features. 
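# (clarifying note, not part of the patch) rate_at below implements SGDR's
# cosine annealing: with t epochs elapsed since the last restart and T_i the
# length of the current restart period,
#     rate = start_rate * restart_decay**i * (1 + cos(pi * t / T_i)) / 2
# per_batch = True lets Learner.batch() feed in fractional epochs, so the
# rate also anneals smoothly within an epoch; expando(i) stretches the i-th
# period, and restart_decay shrinks the peak rate after each restart.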
+ + per_batch = True + + def __init__(self, optim, epochs=100, rate=None, + restarts=0, restart_decay=0.5, callback=None, + expando=None): + self.restart_epochs = int(epochs) + self.decay = float(restart_decay) + self.restarts = int(restarts) + self.restart_callback = callback + # TODO: rename expando to something not insane + self.expando = expando if expando is not None else lambda i: 1 + + self.splits = [] + epochs = 0 + for i in range(0, self.restarts + 1): + split = epochs + int(self.restart_epochs * self.expando(i)) + self.splits.append(split) + epochs = split + super().__init__(optim, epochs, rate) + + def split_num(self, epoch): + shit = [0] + self.splits # hack + for i in range(0, len(self.splits)): + if epoch < self.splits[i]: + sub_epoch = epoch - shit[i] + next_restart = self.splits[i] - shit[i] + return i, sub_epoch, next_restart + raise Exception('this should never happen.') + + def rate_at(self, epoch): + restart, sub_epoch, next_restart = self.split_num(epoch) + x = sub_epoch / next_restart + return self.start_rate * self.decay**restart * cosmod(x) + + def next(self): + if not super().next(): + return False + restart, sub_epoch, next_restart = self.split_num(self.epoch) + if restart > 0 and sub_epoch == 0: + if self.restart_callback is not None: + self.restart_callback(restart) + return True
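# A minimal end-to-end sketch of how the pieces in this patch fit together
# (hyperparameters and the synthetic data are made up, and it assumes
# optim_nn_core is importable; this is an illustration, not part of the
# diff itself):
import numpy as np
from optim_nn_core import (Input, Dense, Relu, Model,
                           Adam, SGDR, Ritual, Squared, nf)

# toy regression data
X = np.random.uniform(-1, 1, size=(1024, 4)).astype(nf)
T = np.sin(X.sum(1, keepdims=True)).astype(nf)

x = Input(shape=(4,))
y = x.feed(Dense(32))
y = y.feed(Relu())
y = y.feed(Dense(1))
model = Model(x, y)

learner = SGDR(Adam(alpha=1e-2), epochs=8, restarts=1, restart_decay=0.5)
ritual = Ritual(learner=learner, loss=Squared(), mloss=Squared())
ritual.prepare(model)

while learner.next():
    avg_loss = ritual.train_batched(X, T, batch_size=64)
    print("epoch {:2d}  rate {:.5f}  loss {:.6f}".format(
        learner.epoch + 1, float(learner.rate), avg_loss))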