diff --git a/.dummy b/.dummy
deleted file mode 100644
index 945c9b4..0000000
--- a/.dummy
+++ /dev/null
@@ -1 +0,0 @@
-.
\ No newline at end of file
diff --git a/optim_nn.py b/optim_nn.py
new file mode 100644
index 0000000..6da6966
--- /dev/null
+++ b/optim_nn.py
@@ -0,0 +1,538 @@
#!/usr/bin/env python3

# imports

import numpy as np
nf = np.float32
nfa = lambda x: np.array(x, dtype=nf)
ni = np.int64  # np.int was just an alias for the builtin int and is gone in newer numpy
nia = lambda x: np.array(x, dtype=ni)

from collections import defaultdict

# Loss functions

class Loss:
    def mean(self, r):
        return np.average(self.f(r))

    def dmean(self, r):
        d = self.df(r)
        return d / len(d)

class SquaredHalved(Loss):
    def f(self, r):
        return np.square(r) / 2

    def df(self, r):
        return r

# Optimizers

class Optimizer:
    def __init__(self, alpha=0.1):
        self.alpha = nfa(alpha)
        self.reset()

    def reset(self):
        pass

    def compute(self, dW, W):
        return -self.alpha * dW

    def update(self, dW, W):
        W += self.compute(dW, W)

# the following optimizers are blatantly ripped from tiny-dnn:
# https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/optimizers/optimizer.h

class Adam(Optimizer):
    def __init__(self, alpha=0.001, b1=0.9, b2=0.999, b1_t=0.9, b2_t=0.999, eps=1e-8):
        self.alpha = nf(alpha) # learning rate
        self.b1 = nf(b1) # decay term
        self.b2 = nf(b2) # decay term
        self.b1_t_default = nf(b1_t) # decay term power t
        self.b2_t_default = nf(b2_t) # decay term power t
        self.eps = nf(eps)

        self.reset()

    def reset(self):
        self.mt = None
        self.vt = None
        self.b1_t = self.b1_t_default
        self.b2_t = self.b2_t_default

    def compute(self, dW, W):
        if self.mt is None:
            self.mt = np.zeros_like(W)
        if self.vt is None:
            self.vt = np.zeros_like(W)

        # decay
        self.b1_t *= self.b1
        self.b2_t *= self.b2

        self.mt = self.b1 * self.mt + (1 - self.b1) * dW
        self.vt = self.b2 * self.vt + (1 - self.b2) * dW * dW

        return -self.alpha * (self.mt / (1 - self.b1_t)) \
               / np.sqrt((self.vt / (1 - self.b2_t)) + self.eps)
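# The config further down mentions plain SGD with momentum/nesterov, but only Adam
# is implemented and the script asserts optim == 'adam'.  The class below is a
# hypothetical sketch (its name and defaults are mine, not part of the original
# file) of what a momentum optimizer with the same interface could look like:
# classical momentum, with the usual optional Nesterov look-ahead variant.

class Momentum(Optimizer):
    def __init__(self, alpha=0.01, mu=0.9, nesterov=False):
        self.alpha = nf(alpha)   # learning rate
        self.mu = nf(mu)         # momentum coefficient
        self.nesterov = bool(nesterov)
        self.reset()

    def reset(self):
        self.v = None            # velocity (running average of past updates)

    def compute(self, dW, W):
        if self.v is None:
            self.v = np.zeros_like(W)
        self.v = self.mu * self.v - self.alpha * dW
        if self.nesterov:
            # look-ahead: apply the momentum term again on top of the gradient step
            return self.mu * self.v - self.alpha * dW
        return self.v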
# Abstract Layers

_layer_counters = defaultdict(lambda: 0)

class Layer:
    def __init__(self):
        self.parents = []
        self.children = []
        self.input_shape = None
        self.output_shape = None
        kind = self.__class__.__name__
        global _layer_counters
        _layer_counters[kind] += 1
        self.name = "{}_{}".format(kind, _layer_counters[kind])
        self.size = None # total weight count (if any)

    def __str__(self):
        return self.name

    # methods we might want to override:

    def F(self, X):
        raise NotImplementedError("unimplemented", self)

    def dF(self, dY):
        raise NotImplementedError("unimplemented", self)

    def do_feed(self, child):
        pass

    def be_fed(self, parent):
        self.parents.append(parent)

    def make_shape(self, shape):
        assert shape is not None
        if self.output_shape is None:
            self.output_shape = shape
        return shape

    # TODO: rename this multi and B crap to something actually relevant.

    def multi(self, B):
        assert len(B) == 1, self
        return self.F(B[0].T).T

    def dmulti(self, dB):
        if len(dB) == 1:
            return self.dF(dB[0].T).T
        else:
            dX = None
            for dY in dB:
                if dX is None:
                    dX = self.dF(dY.T).T
                else:
                    dX += self.dF(dY.T).T
            return dX

    # general utility methods:

    def compatible(self, parent):
        if self.input_shape is None:
            # inherit shape from output
            shape = self.make_shape(parent.output_shape)
            if shape is None:
                return False
            self.input_shape = shape
        if np.all(self.input_shape == parent.output_shape):
            return True
        else:
            return False

    def feed(self, child):
        if not child.compatible(self):
            fmt = "{} is incompatible with {}: shape mismatch: {} vs. {}"
            raise Exception(fmt.format(self, child, self.output_shape, child.input_shape))
        self.children.append(child)
        self.do_feed(child)
        child.be_fed(self)
        return child

    def validate_input(self, X):
        assert X.shape[1:] == self.input_shape, (self, X.shape[1:], self.input_shape)

    def validate_output(self, Y):
        assert Y.shape[1:] == self.output_shape, (self, Y.shape[1:], self.output_shape)

    def forward(self, lut):
        assert len(self.parents) > 0, self
        #print(" forwarding", self)
        B = []
        for parent in self.parents:
            # TODO: skip over irrelevant nodes (if any)
            X = lut[parent]
            #print("collected parent", parent)
            self.validate_input(X)
            B.append(X)
        Y = self.multi(B)
        self.validate_output(Y)
        return Y

    def backward(self, lut):
        assert len(self.children) > 0, self
        #print(" backwarding", self)
        dB = []
        for child in self.children:
            # TODO: skip over irrelevant nodes (if any)
            dY = lut[child]
            #print(" collected child", child)
            self.validate_output(dY)
            dB.append(dY)
        dX = self.dmulti(dB)
        self.validate_input(dX)
        return dX

# Final Layers

class Sum(Layer):
    def multi(self, B):
        return np.sum(B, axis=0)

    def dmulti(self, dB):
        # the gradient w.r.t. every parent is the sum of the gradients coming
        # in from the children; with a single child this reduces to dB[0].
        return np.sum(dB, axis=0)

class Input(Layer):
    def __init__(self, shape):
        assert shape is not None
        super().__init__()
        self.shape = tuple(shape)
        self.input_shape = self.shape
        self.output_shape = self.shape

    def F(self, X):
        return X

    def dF(self, dY):
        #self.delta = dY
        return np.zeros_like(dY)

class Affine(Layer):
    def __init__(self, a=1, b=0):
        super().__init__()
        self.a = nf(a)
        self.b = nf(b)

    def F(self, X):
        return self.a * X + self.b

    def dF(self, dY):
        return dY * self.a

class Relu(Layer):
    def F(self, X):
        self.cond = X >= 0
        return np.where(self.cond, X, 0)

    def dF(self, dY):
        return np.where(self.cond, dY, 0)
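# A hypothetical example (not in the original file) of how another activation
# slots into this scheme: a layer only has to implement F for the forward pass
# and dF for the backward pass, caching in F whatever dF will need.  Tanh is
# shown purely as an illustration alongside Relu.

class Tanh(Layer):
    def F(self, X):
        self.Y = np.tanh(X)   # cache the activation for the backward pass
        return self.Y

    def dF(self, dY):
        return dY * (1 - self.Y * self.Y)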
class Dense(Layer):
    def __init__(self, dim):
        super().__init__()
        self.dim = ni(dim)
        self.output_shape = (dim,)
        self.size = None

    def init(self, W, dW):
        ins, outs = self.input_shape[0], self.output_shape[0]

        self.W = W
        self.dW = dW
        # coefficients and biases are views into the model's flat weight arrays
        self.coeffs = self.W[:self.nW].reshape(outs, ins)
        self.biases = self.W[self.nW:].reshape(outs, 1)
        self.dcoeffs = self.dW[:self.nW].reshape(outs, ins)
        self.dbiases = self.dW[self.nW:].reshape(outs)

        # he_normal
        s = np.sqrt(2 / ins)
        self.coeffs.flat = np.random.normal(0, s, size=self.nW)
        self.biases.flat = 0

    def make_shape(self, shape):
        super().make_shape(shape)
        if len(shape) != 1:
            return None # compatible() only checks for None, so don't return False here
        self.nW = self.dim * shape[0]
        self.nb = self.dim
        self.size = self.nW + self.nb
        return shape

    def F(self, X):
        self.X = X
        Y = self.coeffs.dot(X) \
          + self.biases
        return Y

    def dF(self, dY):
        # http://cs231n.github.io/optimization-2/#gradients-for-vectorized-operations
        # note: because we only call dF once (we only have a dF/dY method),
        # we have to compute the dF/dW gradients here too.
        dX = self.coeffs.T.dot(dY)
        self.dcoeffs[:] = dY.dot(self.X.T)
        self.dbiases[:] = np.sum(dY, axis=1)
        return dX
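# Hypothetical helper (not part of the original file): a finite-difference check
# of the F/dF contract for layers without weights (Relu, Affine, ...).  dF is a
# vector-Jacobian product, so for random probe vectors u and v, u . dF(v) should
# match the central difference of v . F(X) along u.
# e.g. check_layer_gradient(Relu(), np.random.randn(4, 8)) should be close to zero.

def check_layer_gradient(layer, X, eps=1e-3):
    u = np.random.randn(*X.shape)            # input-side probe direction
    v = np.random.randn(*layer.F(X).shape)   # output-side probe direction
    layer.F(X)                               # refresh any cached state (e.g. Relu.cond)
    analytic = np.sum(u * layer.dF(v))
    numerical = np.sum(v * (layer.F(X + eps * u) - layer.F(X - eps * u))) / (2 * eps)
    return abs(analytic - numerical)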
# Model

class Model:
    def __init__(self, x, y):
        assert isinstance(x, Layer), x
        assert isinstance(y, Layer), y
        self.x = x
        self.y = y

        self.ordered_nodes = self.traverse([], self.y)
        print([str(node) for node in self.ordered_nodes])
        #print(len(self.ordered_nodes))

        self.make_weights()

    def make_weights(self):
        self.param_count = 0
        for node in self.ordered_nodes:
            if node.size is not None:
                self.param_count += node.size
        print(self.param_count)
        self.W = np.zeros(self.param_count, dtype=nf)
        self.dW = np.zeros(self.param_count, dtype=nf)

        offset = 0
        for node in self.ordered_nodes:
            if node.size is not None:
                end = offset + node.size
                node.init(self.W[offset:end], self.dW[offset:end])
                offset += node.size

        #print(self.W, self.dW)

    def traverse(self, nodes, node):
        if node == self.x:
            return [node]
        for parent in node.parents:
            if parent not in nodes:
                new_nodes = self.traverse(nodes, parent)
                for new_node in new_nodes:
                    if new_node not in nodes:
                        nodes.append(new_node)
        if nodes:
            nodes.append(node)
        return nodes

    def forward(self, X):
        lut = dict()
        input_node = self.ordered_nodes[0]
        output_node = self.ordered_nodes[-1]
        #lut[input_node] = input_node.F(X)
        lut[input_node] = input_node.multi(np.expand_dims(X, 0))
        for node in self.ordered_nodes[1:]:
            lut[node] = node.forward(lut)
        return lut[output_node]

    def backward(self, error):
        lut = dict()
        input_node = self.ordered_nodes[0]
        output_node = self.ordered_nodes[-1]
        #lut[output_node] = output_node.dF(error)
        lut[output_node] = output_node.dmulti(np.expand_dims(error, 0))
        #for node in self.ordered_nodes[-2:0:-1]:
        for node in reversed(self.ordered_nodes[:-1]):
            lut[node] = node.backward(lut)
        #return lut[input_node] # meaningless value

        return self.dW

    def load_model(self, fn):
        # seemingly compatible with keras models at the moment
        import h5py
        f = h5py.File(fn, 'r')
        loadweights = {}
        def visitor(name, obj):
            if isinstance(obj, h5py.Dataset):
                loadweights[name.split('/')[-1]] = nfa(obj[:])
        f.visititems(visitor)
        f.close()

        denses = [node for node in self.ordered_nodes if isinstance(node, Dense)]
        for i in range(len(denses)):
            a, b = i, i + 1
            b_name = "dense_{}".format(b)
            denses[a].coeffs = loadweights[b_name+'_W'].T
            denses[a].biases = np.expand_dims(loadweights[b_name+'_b'], -1)

    def save_model(self, fn, overwrite=False):
        raise NotImplementedError("unimplemented", self)
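# save_model above is still a stub.  The function below is a hypothetical sketch
# (not part of the original file) of the inverse of load_model: it writes one
# dataset per Dense layer using the same "dense_N_W" / "dense_N_b" names that
# load_model reads.  No claim is made that the result is a complete Keras file;
# it only round-trips with load_model above.

def save_dense_weights(model, fn):
    import h5py
    f = h5py.File(fn, 'w')
    denses = [node for node in model.ordered_nodes if isinstance(node, Dense)]
    for i, dense in enumerate(denses):
        name = "dense_{}".format(i + 1)
        # load_model transposes W and expands b, so undo that here
        f.create_dataset(name + '_W', data=dense.coeffs.T)
        f.create_dataset(name + '_b', data=np.squeeze(dense.biases, -1))
    f.close()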
if __name__ == '__main__':
    # Config

    from dotmap import DotMap
    config = DotMap(
        fn = 'ml/cie_mlp_min.h5',

        batch_size = 64,

        res_width = 12,
        res_depth = 3,
        res_block = 2, # normally 2
        res_multi = 4, # normally 1
        activation = 'relu',

        optim = 'adam',
        nesterov = False, # only used with SGD or Adam
        momentum = 0.33, # only used with SGD
        epochs = 6, # 6
        LR = 1e-2,
        restarts = 3, # 3
        LR_halve_every = 2,
        LR_restart_advance = 3,
        init = 'he_normal',
        loss = 'mse',

        parallel_style = 'batchless',
    )

    # toy CIE-2000 data
    from ml.cie_mlp_data import rgbcompare, input_samples, output_samples, x_scale, y_scale

    def read_data(fn):
        data = np.load(fn)
        try:
            inputs, outputs = data['inputs'], data['outputs']
        except KeyError:
            # fall back to the default names np.savez assigns when no keywords are given.
            inputs, outputs = data['arr_0'], data['arr_1']
        return inputs, outputs

    inputs, outputs = read_data("ml/cie_mlp_data.npz")
    valid_data = read_data("ml/cie_mlp_vdata.npz")
    valid_inputs, valid_outputs = valid_data

    # Our Test Model

    x = Input(shape=(input_samples,))
    y = x
    last_size = input_samples

    for blah in range(config.res_depth):
        size = config.res_width

        if last_size != size:
            y = y.feed(Dense(size))

        assert config.parallel_style == 'batchless'
        skip = y
        merger = Sum()
        skip.feed(merger)
        z_start = skip.feed(Relu())
        for i in range(config.res_multi):
            z = z_start
            for j in range(config.res_block):
                if j > 0:
                    z = z.feed(Relu())
                z = z.feed(Dense(size))
            z.feed(merger)
        y = merger

        last_size = size

    if last_size != output_samples:
        y = y.feed(Dense(output_samples))

    model = Model(x, y)

    training = config.epochs > 0 and config.restarts >= 0

    if not training:
        model.load_model(config.fn)

    assert config.optim == 'adam'
    if config.nesterov:
        assert False, "unimplemented"
    else:
        optim = Adam()

    assert config.loss == 'mse'
    loss = SquaredHalved()

    LR = config.LR
    LRprod = 0.5**(1/config.LR_halve_every)

    # Training

    def measure_loss():
        predicted = model.forward(inputs / x_scale)
        residual = predicted - outputs / y_scale
        err = loss.mean(residual)
        print("train loss: {:10.6f}".format(err))

        predicted = model.forward(valid_inputs / x_scale)
        residual = predicted - valid_outputs / y_scale
        err = loss.mean(residual)
        print("valid loss: {:10.6f}".format(err))

    for i in range(config.restarts + 1):
        measure_loss()

        if i > 0:
            print("restarting")

        assert inputs.shape[0] % config.batch_size == 0, \
               "inputs is not evenly divisible by batch_size" # TODO: lift this restriction
        batch_count = inputs.shape[0] // config.batch_size
        for e in range(config.epochs):
            indices = np.arange(len(inputs))
            np.random.shuffle(indices)
            shuffled_inputs = inputs[indices] / x_scale
            shuffled_outputs = outputs[indices] / y_scale

            cumsum_loss = 0
            for b in range(batch_count):
                bi = b * config.batch_size
                batch_inputs  = shuffled_inputs[ bi:bi+config.batch_size]
                batch_outputs = shuffled_outputs[bi:bi+config.batch_size]

                predicted = model.forward(batch_inputs)
                dW = model.backward(np.ones_like(predicted))

                residual = predicted - batch_outputs

                # TODO: try something like this instead?
                #err_dW = np.dot(loss.dmean(residual), np.expand_dims(dW, 0))
                err_dW = loss.df(residual) * dW / len(residual)
                err_dW = np.sum(err_dW, axis=0)
                optim.alpha = LR * LRprod**e
                optim.update(err_dW, model.W)

                # note: we don't actually need this for training, only monitoring.
                cumsum_loss += loss.mean(residual)
            print("avg loss: {:10.6f}".format(cumsum_loss / batch_count))

        LR *= LRprod**config.LR_restart_advance

    measure_loss()

    #if training:
    #    model.save_model(config.fn, overwrite=True)

    # Evaluation

    a = (192, 128, 64)
    b = (64, 128, 192)
    X = np.expand_dims(np.hstack((a, b)), 0) / x_scale
    P = model.forward(X) * y_scale
    print("truth:", rgbcompare(a, b))
    print("network:", np.squeeze(P))
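    # hypothetical extra report (not in the original file): the absolute error
    # between the network's prediction and the CIEDE2000 reference printed above,
    # for this one colour pair.
    print("abs error:", np.abs(np.squeeze(P) - np.squeeze(nfa(rgbcompare(a, b)))))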