basic PEP 8 compliance

rip readability
Connor Olding 2018-01-22 19:40:36 +00:00
parent c81ce0afbb
commit 169303813d
19 changed files with 282 additions and 150 deletions

View file

@@ -1,5 +1,5 @@
# external packages required for full functionality:
-# numpy scipy h5py sklearn dotmap
+# numpy scipy h5py sklearn
# BIG TODO: ensure numpy isn't upcasting to float64 *anywhere*.
# this is gonna take some work.

View file

@@ -6,6 +6,7 @@ from scipy.special import expit as sigmoid
from .float import *
from .layer_base import *

class Identity(Layer):
    def forward(self, X):
        return X

@@ -13,7 +14,8 @@ class Identity(Layer):
    def backward(self, dY):
        return dY

-class Sigmoid(Layer): # aka Logistic, Expit (inverse of Logit)
+class Sigmoid(Layer):  # aka Logistic, Expit (inverse of Logit)
    def forward(self, X):
        self.sig = sigmoid(X)
        return self.sig

@@ -21,6 +23,7 @@ class Sigmoid(Layer): # aka Logistic, Expit (inverse of Logit)
    def backward(self, dY):
        return dY * self.sig * (1 - self.sig)

class Softplus(Layer):
    # integral of Sigmoid.

@@ -31,6 +34,7 @@ class Softplus(Layer):
    def backward(self, dY):
        return dY * sigmoid(self.X)

class Tanh(Layer):
    def forward(self, X):
        self.sig = np.tanh(X)

@@ -39,6 +43,7 @@ class Tanh(Layer):
    def backward(self, dY):
        return dY * (1 - self.sig * self.sig)

class LeCunTanh(Layer):
    # paper: http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf
    # paper: http://yann.lecun.com/exdb/publis/pdf/lecun-89.pdf

@@ -53,6 +58,7 @@ class LeCunTanh(Layer):
    def backward(self, dY):
        return dY * (2 / 3 * 1.7159) * (1 - self.sig * self.sig)

class Relu(Layer):
    def forward(self, X):
        self.cond = X >= 0

@@ -61,12 +67,13 @@ class Relu(Layer):
    def backward(self, dY):
        return np.where(self.cond, dY, 0)

class Elu(Layer):
    # paper: https://arxiv.org/abs/1511.07289

    def __init__(self, alpha=1):
        super().__init__()
        self.alpha = _f(alpha)  # FIXME: unused

    def forward(self, X):
        self.cond = X >= 0

@@ -76,6 +83,7 @@ class Elu(Layer):
    def backward(self, dY):
        return dY * np.where(self.cond, 1, self.neg + 1)

class GeluApprox(Layer):
    # paper: https://arxiv.org/abs/1606.08415
    # plot: https://www.desmos.com/calculator/ydzgtccsld

@@ -88,6 +96,7 @@ class GeluApprox(Layer):
    def backward(self, dY):
        return dY * self.sig * (1 + self.a * (1 - self.sig))

class Softmax(Layer):
    def forward(self, X):
        alpha = np.max(X, axis=-1, keepdims=True)

@@ -99,6 +108,7 @@ class Softmax(Layer):
    def backward(self, dY):
        return (dY - np.sum(dY * self.sm, axis=-1, keepdims=True)) * self.sm

class LogSoftmax(Softmax):
    def __init__(self, eps=1e-6):
        super().__init__()

@@ -110,6 +120,7 @@ class LogSoftmax(Softmax):
    def backward(self, dY):
        return dY - np.sum(dY, axis=-1, keepdims=True) * self.sm

class Cos(Layer):
    # performs well on MNIST for some strange reason.

@@ -120,6 +131,7 @@ class Cos(Layer):
    def backward(self, dY):
        return dY * -np.sin(self.X)

class Selu(Layer):
    # paper: https://arxiv.org/abs/1706.02515

@@ -136,6 +148,7 @@ class Selu(Layer):
    def backward(self, dY):
        return dY * self.lamb * np.where(self.cond, 1, self.neg)

# more

class TanhTest(Layer):

@@ -146,6 +159,7 @@ class TanhTest(Layer):
    def backward(self, dY):
        return dY * (1 / 2 * 2.4004) * (1 - self.sig * self.sig)

class ExpGB(Layer):
    # an output layer for one-hot classification problems.
    # use with MSE (SquaredHalved), not CategoricalCrossentropy!

@@ -163,6 +177,7 @@ class ExpGB(Layer):
        # this gradient is intentionally incorrect.
        return dY

class CubicGB(Layer):
    # an output layer for one-hot classification problems.
    # use with MSE (SquaredHalved), not CategoricalCrossentropy!

@@ -182,4 +197,3 @@ class CubicGB(Layer):
    def backward(self, dY):
        # this gradient is intentionally incorrect.
        return dY
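
Aside (not part of the commit): the activation pairs above all follow the same contract; forward() caches what backward() needs, and backward() multiplies dY by the local derivative. A minimal standalone sketch, numpy/scipy only, checking the Sigmoid derivative shown above against a finite difference:

import numpy as np
from scipy.special import expit as sigmoid

def sigmoid_backward(X, dY):
    sig = sigmoid(X)  # the same quantity Sigmoid.forward caches as self.sig
    return dY * sig * (1 - sig)

X = np.random.randn(5)
eps = 1e-5
numeric = (sigmoid(X + eps) - sigmoid(X - eps)) / (2 * eps)
analytic = sigmoid_backward(X, np.ones_like(X))
assert np.allclose(numeric, analytic, atol=1e-6)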

View file

@@ -2,11 +2,13 @@ import numpy as np
_f = np.float32

def _check(a):
    assert isinstance(a, np.ndarray) or type(a) == _f, type(a)
    assert a.dtype == _f, a.dtype
    return a

_0 = _f(0)
_1 = _f(1)
_2 = _f(2)
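
Aside (not part of the commit): _check above is the tool for the "no float64 upcasting" TODO in the first hunk; any operation that quietly promotes a float32 array trips the dtype assertion. A small sketch of the kind of promotion it catches:

import numpy as np
_f = np.float32

x = np.ones(3, dtype=_f)
y = x * np.ones(3)     # np.ones defaults to float64, so the product upcasts
assert x.dtype == _f
assert y.dtype != _f   # this is what _check's dtype assertion would flag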

View file

@@ -2,28 +2,35 @@ import numpy as np
# note: these are currently only implemented for 2D shapes.

def init_zeros(size, ins=None, outs=None):
    return np.zeros(size)

def init_ones(size, ins=None, outs=None):
    return np.ones(size)

def init_he_normal(size, ins, outs):
    s = np.sqrt(2 / ins)
    return np.random.normal(0, s, size=size)

def init_he_uniform(size, ins, outs):
    s = np.sqrt(6 / ins)
    return np.random.uniform(-s, s, size=size)

def init_glorot_normal(size, ins, outs):
    s = np.sqrt(2 / (ins + outs))
    return np.random.normal(0, s, size=size)

def init_glorot_uniform(size, ins, outs):
    s = np.sqrt(6 / (ins + outs))
    return np.random.uniform(-s, s, size=size)

# more

def init_gaussian_unit(size, ins, outs):
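
Aside (not part of the commit): a quick numerical check of init_he_uniform above. uniform(-s, s) has variance s**2 / 3, so s = sqrt(6 / ins) yields the intended He variance of 2 / ins:

import numpy as np

def init_he_uniform(size, ins, outs):
    s = np.sqrt(6 / ins)
    return np.random.uniform(-s, s, size=size)

W = init_he_uniform((256, 256), ins=256, outs=256)
print(W.var())  # should be close to 2 / 256 = 0.0078125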

View file

@@ -2,6 +2,7 @@ from .layer_base import *
from .initialization import *
from .float import *

# Nonparametric Layers {{{1

class Input(Layer):

@@ -16,9 +17,10 @@ class Input(Layer):
        return X

    def backward(self, dY):
-        #self.dY = dY
+        # self.dY = dY
        return np.zeros_like(dY)

class Reshape(Layer):
    def __init__(self, new_shape):
        super().__init__()

@@ -33,6 +35,7 @@ class Reshape(Layer):
        assert dY.shape[0] == self.batch_size
        return dY.reshape(self.batch_size, *self.input_shape)

class Flatten(Layer):
    def make_shape(self, parent):
        shape = parent.output_shape

@@ -47,6 +50,7 @@ class Flatten(Layer):
        assert dY.shape[0] == self.batch_size
        return dY.reshape(self.batch_size, *self.input_shape)

class ConstAffine(Layer):
    def __init__(self, a=1, b=0):
        super().__init__()

@@ -59,13 +63,15 @@ class ConstAffine(Layer):
    def backward(self, dY):
        return dY * self.a

class Sum(Layer):
    def _propagate(self, edges, deterministic):
        return np.sum(edges, axis=0)

    def _backpropagate(self, edges):
-        #assert len(edges) == 1, "unimplemented"
+        # assert len(edges) == 1, "unimplemented"
        return edges[0]  # TODO: does this always work?

class ActivityRegularizer(Layer):
    def __init__(self, reg):

@@ -81,6 +87,7 @@ class ActivityRegularizer(Layer):
    def backward(self, dY):
        return dY + self.reg.backward(self.X)

class Dropout(Layer):
    def __init__(self, dropout=0.0):
        super().__init__()

@@ -92,12 +99,13 @@ class Dropout(Layer):
        return X * self.mask

    def forward_deterministic(self, X):
-        #self.mask = _1
+        # self.mask = _1
        return X

    def backward(self, dY):
        return dY * self.mask

# more

class AlphaDropout(Layer):

@@ -136,6 +144,7 @@ class AlphaDropout(Layer):
    def backward(self, dY):
        return dY * self.a * self.mask

class Decimate(Layer):
    # simple decimaton layer that drops every other sample from the last axis.

@@ -168,6 +177,7 @@ class Decimate(Layer):
        dX.ravel()[1::2] = dY.ravel()
        return dX

class Undecimate(Layer):
    # inverse operation of Decimate. not quite interpolation.
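
Aside (not part of the commit): Dropout.forward_deterministic above returns X unscaled, which implies the training-time mask carries the 1/(1-p) correction ("inverted dropout"). The hunk does not show how the library builds its mask, so this is only a hedged sketch of that idea:

import numpy as np

def dropout_mask(shape, dropout=0.5):
    keep = 1.0 - dropout
    # zero units with probability `dropout`; scale survivors by 1/keep so the
    # expected activation matches the deterministic pass.
    return (np.random.uniform(size=shape) < keep) / keep

X = np.random.randn(4, 8)
Y_train = X * dropout_mask(X.shape)   # stochastic pass
Y_test = X                            # deterministic pass: identity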

View file

@@ -4,26 +4,29 @@ from collections import defaultdict, OrderedDict
from .weight import *

# used for numbering layers like Keras:
_layer_counters = defaultdict(lambda: 0)

class LayerIncompatibility(Exception):
    pass

class Layer:
    def __init__(self):
        self.parents = []
        self.children = []
        self.weights = OrderedDict()
        self.loss = None  # for activity regularizers
        self.input_shape = None
        self.output_shape = None
        kind = self.__class__.__name__
        global _layer_counters
        _layer_counters[kind] += 1
        self.name = "{}_{}".format(kind, _layer_counters[kind])
        self.unsafe = False  # disables assertions for better performance
        self.shared = False  # as in weight sharing

    def __str__(self):
        return self.name

@@ -40,9 +43,9 @@ class Layer:
        raise NotImplementedError("unimplemented", self)

    def make_shape(self, parent):
-        if self.input_shape == None:
+        if self.input_shape is None:
            self.input_shape = parent.output_shape
-        if self.output_shape == None:
+        if self.output_shape is None:
            self.output_shape = self.input_shape

    def do_feed(self, child):

@@ -75,16 +78,19 @@ class Layer:
        child.make_shape(self)
        if not child.is_compatible(self):
            fmt = "{} is incompatible with {}: shape mismatch: {} vs. {}"
-            raise LayerIncompatibility(fmt.format(self, child, self.output_shape, child.input_shape))
+            raise LayerIncompatibility(fmt.format(
+                self, child, self.output_shape, child.input_shape))
        self.do_feed(child)
        child.be_fed(self)
        return child

    def validate_input(self, X):
-        assert X.shape[1:] == self.input_shape, (str(self), X.shape[1:], self.input_shape)
+        assert X.shape[1:] == self.input_shape, \
+            (str(self), X.shape[1:], self.input_shape)

    def validate_output(self, Y):
-        assert Y.shape[1:] == self.output_shape, (str(self), Y.shape[1:], self.output_shape)
+        assert Y.shape[1:] == self.output_shape, \
+            (str(self), Y.shape[1:], self.output_shape)

    def _new_weights(self, name, **kwargs):
        w = Weights(**kwargs)

@@ -93,9 +99,10 @@ class Layer:
        return w

    def share(self, node):
        self.weights = node.weights  # TODO: this should be all it takes.
        for k, v in self.weights.items():
-            vs = getattr(node, k) # hack: key isn't necessarily attribute name!
+            # hack: key isn't necessarily attribute name!
+            vs = getattr(node, k)
            setattr(self, k, vs)
        self.shared = True
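
Aside (not part of the commit): the defaultdict counter in Layer.__init__ above is what produces the Keras-style names; each class gets its own sequence. In isolation:

from collections import defaultdict

_layer_counters = defaultdict(lambda: 0)

def fresh_name(kind):
    _layer_counters[kind] += 1
    return "{}_{}".format(kind, _layer_counters[kind])

print(fresh_name("Dense"))    # Dense_1
print(fresh_name("Dense"))    # Dense_2
print(fresh_name("Sigmoid"))  # Sigmoid_1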

View file

@@ -1,13 +1,14 @@
from .float import *
from .optimizer_base import *

class Learner:
    per_batch = False

    def __init__(self, optim, epochs=100, rate=None):
        assert isinstance(optim, Optimizer)
        self.optim = optim
        self.start_rate = rate  # None is okay; it'll use optim.lr instead.
        self.epochs = int(epochs)
        self.reset()

@@ -49,7 +50,7 @@ class Learner:
            return False
        return True

    def batch(self, progress):  # TODO: rename
        # interpolates rates between epochs.
        # unlike epochs, we do not store batch number as a state.
        # i.e. calling next() will not respect progress.

@@ -60,6 +61,7 @@ class Learner:
    def final_rate(self):
        return self.rate_at(self.epochs - 1e-8)

class AnnealingLearner(Learner):
    def __init__(self, optim, epochs=100, rate=None, halve_every=10):
        self.halve_every = _f(halve_every)

@@ -69,10 +71,12 @@ class AnnealingLearner(Learner):
    def rate_at(self, epoch):
        return super().rate_at(epoch) * self.anneal**epoch

def cosmod(x):
    # plot: https://www.desmos.com/calculator/hlgqmyswy2
    return (_1 + np.cos((x % _1) * _pi)) * _inv2

class SGDR(Learner):
    # Stochastic Gradient Descent with Restarts
    # paper: https://arxiv.org/abs/1608.03983

@@ -112,7 +116,8 @@ class SGDR(Learner):
            raise Exception('this should never happen.')

    def rate_at(self, epoch):
-        base_rate = self.start_rate if self.start_rate is not None else self.optim.lr
+        sr = self.start_rate
+        base_rate = sr if sr is not None else self.optim.lr
        restart, sub_epoch, next_restart = self.split_num(max(1, epoch))
        x = _f(sub_epoch - 1) / _f(next_restart)
        return base_rate * self.decay**_f(restart) * cosmod(x)

@@ -126,6 +131,7 @@ class SGDR(Learner):
            self.restart_callback(restart)
        return True

class TriangularCLR(Learner):
    per_batch = True

@@ -141,11 +147,14 @@ class TriangularCLR(Learner):
    def _t(self, epoch):
        # NOTE: this could probably be simplified
        offset = self.frequency / 2
-        return np.abs(((epoch - 1 + offset) % self.frequency) - offset) / offset
+        return np.abs(((epoch - 1 + offset) % self.frequency) - offset) \
+            / offset

    def rate_at(self, epoch):
-        upper_rate = self.start_rate if self.start_rate is not None else self.optim.lr
-        return self._t(epoch) * (upper_rate - self.lower_rate) + self.lower_rate
+        sr = self.start_rate
+        lr = self.lower_rate
+        upper_rate = sr if sr is not None else self.optim.lr
+        return self._t(epoch) * (upper_rate - lr) + lr

    def next(self):
        if not super().next():

@@ -156,14 +165,17 @@ class TriangularCLR(Learner):
            self.callback(self.epoch // self.frequency)
        return True

class SineCLR(TriangularCLR):
    def _t(self, epoch):
        return np.sin(_pi * _inv2 * super()._t(epoch))

class WaveCLR(TriangularCLR):
    def _t(self, epoch):
        return _inv2 * (_1 - np.cos(_pi * super()._t(epoch)))

# more

class PolyLearner(Learner):

@@ -177,4 +189,3 @@ class PolyLearner(Learner):
        progress = (epoch - 1) / (self.epochs)
        ret = np.polyval(self.coeffs, progress)
        return np.abs(ret)
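
Aside (not part of the commit): cosmod() above is the half-cosine window that SGDR.rate_at multiplies into the base rate, and each restart shrinks the whole curve by decay**restart. A standalone sketch of the resulting schedule; base_rate and decay stand in for the SGDR fields:

import numpy as np

def cosmod(x):
    return (1 + np.cos((x % 1) * np.pi)) / 2

base_rate, decay = 0.1, 0.5
for restart in range(3):
    for x in (0.0, 0.5, 0.99):  # progress within the current restart period
        print(restart, x, base_rate * decay**restart * cosmod(x))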

View file

@@ -2,6 +2,7 @@ import numpy as np
from .float import *

class Loss:
    def forward(self, p, y):
        raise NotImplementedError("unimplemented", self)

@@ -9,7 +10,8 @@ class Loss:
    def backward(self, p, y):
        raise NotImplementedError("unimplemented", self)

-class NLL(Loss): # Negative Log Likelihood
+class NLL(Loss):  # Negative Log Likelihood
    def forward(self, p, y):
        correct = p * y
        return np.mean(-correct)

@@ -17,6 +19,7 @@ class NLL(Loss): # Negative Log Likelihood
    def backward(self, p, y):
        return -y / len(p)

class CategoricalCrossentropy(Loss):
    # lifted from theano

@@ -33,6 +36,7 @@ class CategoricalCrossentropy(Loss):
        df = (p - y) / (p * (1 - p))
        return df / len(y)

class Accuracy(Loss):
    # returns percentage of categories correctly predicted.
    # utilizes argmax(), so it cannot be used for gradient descent.

@@ -45,6 +49,7 @@ class Accuracy(Loss):
    def backward(self, p, y):
        raise NotImplementedError("cannot take the gradient of Accuracy")

class ResidualLoss(Loss):
    def forward(self, p, y):
        return np.mean(self.f(p - y))

@@ -53,6 +58,7 @@ class ResidualLoss(Loss):
        ret = self.df(p - y) / len(y)
        return ret

class SquaredHalved(ResidualLoss):
    def f(self, r):
        return np.square(r) / 2

@@ -60,6 +66,7 @@ class SquaredHalved(ResidualLoss):
    def df(self, r):
        return r

class Squared(ResidualLoss):
    def f(self, r):
        return np.square(r)

@@ -67,6 +74,7 @@ class Squared(ResidualLoss):
    def df(self, r):
        return 2 * r

class Absolute(ResidualLoss):
    def f(self, r):
        return np.abs(r)

@@ -74,6 +82,7 @@ class Absolute(ResidualLoss):
    def df(self, r):
        return np.sign(r)

class Huber(ResidualLoss):
    def __init__(self, delta=1.0):
        self.delta = _f(delta)

@@ -88,6 +97,7 @@ class Huber(ResidualLoss):
                        r,
                        self.delta * np.sign(r))

# more

class SomethingElse(ResidualLoss):

@@ -105,6 +115,7 @@ class SomethingElse(ResidualLoss):
    def df(self, r):
        return np.sign(r) * np.abs(r)**self.c

class Confidence(Loss):
    # this isn't "confidence" in any meaningful way; (e.g. Bayesian)
    # it's just a metric of how large the value is of the predicted class.

@@ -126,4 +137,3 @@ class Confidence(Loss):
        detc = p / categories / (1 - 1/categories)
        dmax = p == np.max(p, axis=-1, keepdims=True)
        return detc * dmax
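
Aside (not part of the commit): the Huber df() tail visible above is the derivative of the usual piecewise loss, quadratic inside +/- delta and linear outside. A standalone pair for reference:

import numpy as np

def huber_f(r, delta=1.0):
    return np.where(np.abs(r) <= delta,
                    np.square(r) / 2,
                    delta * (np.abs(r) - delta / 2))

def huber_df(r, delta=1.0):
    # matches the np.where(...) tail shown in the hunk above.
    return np.where(np.abs(r) <= delta, r, delta * np.sign(r))

r = np.array([-3.0, -0.5, 0.0, 0.5, 3.0])
print(huber_f(r))
print(huber_df(r))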

View file

@@ -1,14 +1,15 @@
import numpy as np

def rolling(a, window):
    # http://stackoverflow.com/a/4924433
    shape = (a.size - window + 1, window)
    strides = (a.itemsize, a.itemsize)
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)

def rolling_batch(a, window):
    # same as rolling, but acts on each batch (axis 0).
    shape = (a.shape[0], a.shape[-1] - window + 1, window)
    strides = (np.prod(a.shape[1:]) * a.itemsize, a.itemsize, a.itemsize)
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
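
Aside (not part of the commit): what the strided views above produce; each row of rolling() is a sliding window over the input, with no copying.

import numpy as np

def rolling(a, window):
    shape = (a.size - window + 1, window)
    strides = (a.itemsize, a.itemsize)
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)

print(rolling(np.arange(6), 3))
# [[0 1 2]
#  [1 2 3]
#  [2 3 4]
#  [3 4 5]]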

View file

@@ -5,14 +5,16 @@ from .nodal import *
from .layer_base import *
from .utility import *

class Model:
-    def __init__(self, nodes_in, nodes_out, loss=None, mloss=None, unsafe=False):
+    def __init__(self, nodes_in, nodes_out,
+                 loss=None, mloss=None, unsafe=False):
        self.loss = loss if loss is not None else SquaredHalved()
        self.mloss = mloss if mloss is not None else loss
        nodes_in = [nodes_in] if isinstance(nodes_in, Layer) else nodes_in
        nodes_out = [nodes_out] if isinstance(nodes_out, Layer) else nodes_out
        assert type(nodes_in) == list, type(nodes_in)
        assert type(nodes_out) == list, type(nodes_out)
        self.nodes_in = nodes_in
        self.nodes_out = nodes_out

@@ -29,8 +31,9 @@ class Model:
        return self.nodes

    def make_weights(self):
-        self.param_count = sum((node.size for node in self.nodes if not node.shared))
+        self.param_count = sum((node.size for node in self.nodes
+                                if not node.shared))
        self.W = np.zeros(self.param_count, dtype=_f)
        self.dW = np.zeros(self.param_count, dtype=_f)
        offset = 0

@@ -47,37 +50,42 @@ class Model:
            assert size == len(ret[0]), (size, len(ret[0]))
            return ret

+        fmt = "Layer {} allocated {} weights than it said it would"
        node.init(allocate)
-        assert inner_offset <= node.size, "Layer {} allocated more weights than it said it would".format(node)
+        assert inner_offset <= node.size, fmt.format("more", node)
        # i don't care if "less" is grammatically incorrect.
        # you're mom is grammatically incorrect.
-        assert inner_offset >= node.size, "Layer {} allocated less weights than it said it would".format(node)
+        assert inner_offset >= node.size, fmt.format("less", node)
        offset += node.size

    def evaluate(self, input_, deterministic=True):
-        assert len(self.nodes_in) == 1, "ambiguous input in multi-input network; use evaluate_multi() instead"
-        assert len(self.nodes_out) == 1, "ambiguous output in multi-output network; use evaluate_multi() instead"
+        fmt = "ambiguous input in multi-{} network; use {}() instead"
+        assert len(self.nodes_in) == 1, fmt.format("input", "evaluate_multi")
+        assert len(self.nodes_out) == 1, fmt.format("output", "evaluate_multi")
        node_in = self.nodes_in[0]
        node_out = self.nodes_out[0]
        outputs = self.evaluate_multi({node_in: input_}, deterministic)
        return outputs[node_out]

    def apply(self, error):  # TODO: better name?
-        assert len(self.nodes_in) == 1, "ambiguous input in multi-input network; use apply_multi() instead"
-        assert len(self.nodes_out) == 1, "ambiguous output in multi-output network; use apply_multi() instead"
+        fmt = "ambiguous input in multi-{} network; use {}() instead"
+        assert len(self.nodes_in) == 1, fmt.format("input", "apply_multi")
+        assert len(self.nodes_out) == 1, fmt.format("output", "apply_multi")
        node_in = self.nodes_in[0]
        node_out = self.nodes_out[0]
        inputs = self.apply_multi({node_out: error})
        return inputs[node_in]

    def evaluate_multi(self, inputs, deterministic=True):
+        fmt = "missing {} for node {}"
        values = dict()
        outputs = dict()
        for node in self.nodes:
            if node in self.nodes_in:
-                assert node in inputs, "missing input for node {}".format(node.name)
+                assert node in inputs, fmt.format("input", node.name)
                X = inputs[node]
-                values[node] = node._propagate(np.expand_dims(X, 0), deterministic)
+                values[node] = node._propagate(np.expand_dims(X, 0),
+                                               deterministic)
            else:
                values[node] = node.propagate(values, deterministic)
            if node in self.nodes_out:

@@ -85,11 +93,12 @@ class Model:
        return outputs

    def apply_multi(self, outputs):
+        fmt = "missing {} for node {}"
        values = dict()
        inputs = dict()
        for node in reversed(self.nodes):
            if node in self.nodes_out:
-                assert node in outputs, "missing output for node {}".format(node.name)
+                assert node in outputs, fmt.format("output", node.name)
                X = outputs[node]
                values[node] = node._backpropagate(np.expand_dims(X, 0))
            else:

@@ -135,13 +144,17 @@ class Model:
    def load_weights(self, fn):
        # seemingly compatible with keras' Dense layers.
-        import h5py
-        open(fn)  # just ensure the file exists (python's error is better)
-        f = h5py.File(fn, 'r')
        weights = {}
+        import h5py
+        open(fn)  # just ensure the file exists (python's error is better)
+        f = h5py.File(fn, 'r')

        def visitor(name, obj):
            if isinstance(obj, h5py.Dataset):
                weights[name.split('/')[-1]] = np.array(obj[:], dtype=_f)

        f.visititems(visitor)
        f.close()

@@ -194,5 +207,7 @@ class Model:
            children = [str(n) for n in node.children]
            if children:
                sep = '->'
-                print('\t' + str(node) + sep + (';\n\t' + str(node) + sep).join(children) + ';', file=file)
+                print('\t' + str(node) + sep +
+                      (';\n\t' + str(node) + sep).join(children) + ';',
+                      file=file)
        print('}', file=file)
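
Aside (not part of the commit): the idea behind make_weights() above is one flat parameter vector (and one flat gradient vector), with each layer holding views into it at increasing offsets; writing through a view updates the shared buffer the optimizer sees. A hedged sketch of just that mechanism, not the library's actual allocate():

import numpy as np

sizes = [6, 4]                               # e.g. two layers' parameter counts
W = np.zeros(sum(sizes), dtype=np.float32)   # plays the role of model.W
dW = np.zeros(sum(sizes), dtype=np.float32)  # plays the role of model.dW

views, offset = [], 0
for size in sizes:
    views.append((W[offset:offset + size], dW[offset:offset + size]))
    offset += size

views[0][0][:] = 1.0   # a "layer" writing its weights through its view...
print(W)               # ...shows up in the flat vector: [1. 1. 1. 1. 1. 1. 0. 0. 0. 0.]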

View file

@@ -3,7 +3,8 @@ class DummyNode:
    def __init__(self, children=None, parents=None):
        self.children = children if children is not None else []
        self.parents = parents if parents is not None else []

def traverse(node_in, node_out, nodes=None, dummy_mode=False):
    # i have no idea if this is any algorithm in particular.

@@ -27,7 +28,7 @@ def traverse(node_in, node_out, nodes=None, dummy_mode=False):
        if not seen_up[node]:
            continue
        parents_added = (parent in nodes for parent in node.parents)
-        if not node in nodes and all(parents_added):
+        if node not in nodes and all(parents_added):
            nodes.append(node)
        for child in node.children:
            q.append(child)

@@ -37,6 +38,7 @@ def traverse(node_in, node_out, nodes=None, dummy_mode=False):
    return nodes

def traverse_all(nodes_in, nodes_out, nodes=None):
    all_in = DummyNode(children=nodes_in)
    all_out = DummyNode(parents=nodes_out)

View file

@@ -7,9 +7,10 @@ from .utility import *
# some of the the following optimizers are blatantly lifted from tiny-dnn:
# https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/optimizers/optimizer.h

class Momentum(Optimizer):
    def __init__(self, lr=0.01, mu=0.9, nesterov=False):
        self.mu = _f(mu)  # momentum
        self.nesterov = bool(nesterov)
        super().__init__(lr)

@@ -28,6 +29,7 @@ class Momentum(Optimizer):
        return V

class Adagrad(Optimizer):
    def __init__(self, lr=0.01, eps=1e-8):
        self.eps = _f(eps)

@@ -44,6 +46,7 @@ class Adagrad(Optimizer):
        self.g += np.square(dW)
        return -self.lr * dW / (np.sqrt(self.g) + self.eps)

class RMSprop(Optimizer):
    # RMSprop generalizes* Adagrad, etc.

@@ -51,7 +54,7 @@ class RMSprop(Optimizer):
    # RMSprop.mu == 1
    def __init__(self, lr=1e-4, mu=0.99, eps=1e-8):
        self.mu = _f(mu)  # decay term
        self.eps = _f(eps)

        # one might consider the following equation when specifying mu:

@@ -70,12 +73,13 @@ class RMSprop(Optimizer):
        if self.g is None:
            self.g = np.zeros_like(dW)

-        # basically apply a first-order low-pass filter to delta squared
+        # basically apply a first-order low-pass filter to delta squared,
        self.g += (1 - self.mu) * (np.square(dW) - self.g)
-        # finally sqrt it to complete the running root-mean-square approximation
+        # and sqrt it to complete the running root-mean-square approximation.
        return -self.lr * dW / (np.sqrt(self.g) + self.eps)

class RMSpropCentered(Optimizer):
    # referenced TensorFlow/PyTorch.
    # paper: https://arxiv.org/pdf/1308.0850v5.pdf

@@ -115,10 +119,11 @@ class RMSpropCentered(Optimizer):
        self.delta[:] = self.momentum * self.delta + self.lr * temp
        return -self.delta
        # PyTorch does it this way.
-        #self.delta[:] = self.momentum * self.delta + temp
-        #return -self.lr * self.delta
+        # self.delta[:] = self.momentum * self.delta + temp
+        # return -self.lr * self.delta
        # they are equivalent only when LR is constant, which it might not be.

class Adam(Optimizer):
    # paper: https://arxiv.org/abs/1412.6980
    # Adam generalizes* RMSprop, and

@@ -130,10 +135,10 @@ class Adam(Optimizer):
    # Adam.b2 == RMSprop.mu
    def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8):
        self.b1 = _f(b1)  # decay term
        self.b2 = _f(b2)  # decay term
        self.b1_t_default = _f(b1)  # decay term power t
        self.b2_t_default = _f(b2)  # decay term power t
        self.eps = _f(eps)
        super().__init__(lr)

@@ -159,18 +164,20 @@ class Adam(Optimizer):
        self.vt += (1 - self.b2) * (np.square(dW) - self.vt)
        return -self.lr * (self.mt / (1 - self.b1_t)) \
            / (np.sqrt(self.vt / (1 - self.b2_t)) + self.eps)

class Nadam(Optimizer):
    # paper: https://arxiv.org/abs/1412.6980
    # paper: http://cs229.stanford.edu/proj2015/054_report.pdf
    # TODO: double-check this implementation. also read the damn paper.
-    # lifted from https://github.com/fchollet/keras/blob/5d38b04/keras/optimizers.py#L530
-    # lifted from https://github.com/jpilaul/IFT6266_project/blob/master/Models/Algo_Momentum.py
+    # lifted from:
+    # https://github.com/fchollet/keras/blob/5d38b04/keras/optimizers.py#L530
+    # https://github.com/jpilaul/IFT6266_project/blob/master/Models/Algo_Momentum.py
    def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8):
        self.b1 = _f(b1)  # decay term
        self.b2 = _f(b2)  # decay term
        self.eps = _f(eps)
        super().__init__(lr)

@@ -208,6 +215,7 @@ class Nadam(Optimizer):
        return -self.lr * mt_bar / (np.sqrt(vtp) + self.eps)

# more

class FTML(Optimizer):

@@ -216,8 +224,8 @@ class FTML(Optimizer):
    def __init__(self, lr=0.0025, b1=0.6, b2=0.999, eps=1e-8):
        self.iterations = _0
        self.b1 = _f(b1)  # decay term
        self.b2 = _f(b2)  # decay term
        self.eps = _f(eps)
        super().__init__(lr)

@@ -231,10 +239,14 @@ class FTML(Optimizer):
        self.b2_t = _1

    def compute(self, dW, W):
-        if self.dt1 is None: self.dt1 = np.zeros_like(dW)
-        if self.dt is None: self.dt = np.zeros_like(dW)
-        if self.vt is None: self.vt = np.zeros_like(dW)
-        if self.zt is None: self.zt = np.zeros_like(dW)
+        if self.dt1 is None:
+            self.dt1 = np.zeros_like(dW)
+        if self.dt is None:
+            self.dt = np.zeros_like(dW)
+        if self.vt is None:
+            self.vt = np.zeros_like(dW)
+        if self.zt is None:
+            self.zt = np.zeros_like(dW)

        # NOTE: we could probably rewrite these equations to avoid this copy.
        self.dt1[:] = self.dt[:]

@@ -260,6 +272,7 @@ class FTML(Optimizer):
        # subtract by weights to avoid having to override self.update.
        return -self.zt / self.dt - W

class MomentumClip(Optimizer):
    def __init__(self, lr=0.01, mu=0.9, nesterov=False, clip=1.0, debug=False):
        self.mu = _f(mu)

@@ -289,22 +302,25 @@ class MomentumClip(Optimizer):
        else:
            return -self.lr * self.accum

class YellowFin(Optimizer):
    # paper: https://arxiv.org/abs/1706.03471
    # knowyourmeme: http://cs.stanford.edu/~zjian/project/YellowFin/
-    # author's implementation: https://github.com/JianGoForIt/YellowFin/blob/master/tuner_utils/yellowfin.py
-    # code lifted: https://gist.github.com/botev/f8b32c00eafee222e47393f7f0747666
+    # author's implementation:
+    # https://github.com/JianGoForIt/YellowFin/blob/master/tuner_utils/yellowfin.py
+    # code lifted:
+    # https://gist.github.com/botev/f8b32c00eafee222e47393f7f0747666

    def __init__(self, lr=0.1, mu=0.0, beta=0.999, window_size=20,
                 debias=True, clip=1.0):
        self.lr_default = _f(lr)
        self.mu_default = _f(mu)
        self.beta = _f(beta)
        self.window_size = int(window_size)  # curv_win_width
        self.debias_enabled = bool(debias)
        self.clip = _f(clip)
        self.mu = _f(mu)  # momentum
        super().__init__(lr)

    def reset(self):

@@ -316,13 +332,13 @@ class YellowFin(Optimizer):
        self.step = 0
        self.beta_t = self.beta
-        self.curv_win = np.zeros([self.window_size,], dtype=np.float32)
+        self.curv_win = np.zeros([self.window_size, ], dtype=np.float32)
        self.h_min = None
        self.h_max = None
        self.g_lpf = 0
-        #self.g_squared_lpf = 0
+        # self.g_squared_lpf = 0
        self.g_norm_squared_lpf = 0
        self.g_norm_lpf = 0
        self.h_min_lpf = 0

@@ -332,7 +348,8 @@ class YellowFin(Optimizer):
        self.mu_lpf = 0

    def get_lr_mu(self):
-        p = (np.square(self.dist_avg) * np.square(self.h_min)) / (2 * self.g_var)
+        p = (np.square(self.dist_avg) * np.square(self.h_min)) \
+            / (2 * self.g_var)
        w3 = p * (np.sqrt(0.25 + p / 27.0) - 0.5)
        w = np.power(w3, 1/3)
        y = w - p / (3 * w)

@@ -360,11 +377,11 @@ class YellowFin(Optimizer):
        total_norm = np.linalg.norm(dW)
        clip_scale = self.clip / (total_norm + 1e-6)
        if clip_scale < 1:
-            #print("clipping gradients; norm: {:10.5f}".format(total_norm))
+            # print("clipping gradients; norm: {:10.5f}".format(total_norm))
            dW *= clip_scale
-        #fmt = 'W std: {:10.7f}e-3, dWstd: {:10.7f}e-3, V std: {:10.7f}e-3'
-        #print(fmt.format(np.std(W), np.std(dW) * 100, np.std(V) * 100))
+        # fmt = 'W std: {:10.7f}e-3, dWstd: {:10.7f}e-3, V std: {:10.7f}e-3'
+        # print(fmt.format(np.std(W), np.std(dW) * 100, np.std(V) * 100))
        b = self.beta
        m1b = 1 - self.beta

@@ -380,30 +397,31 @@ class YellowFin(Optimizer):
        h_min_t = np.min(valid_window)
        h_max_t = np.max(valid_window)
        self.g_lpf = b * self.g_lpf + m1b * g
-        #self.g_squared_lpf = b * self.g_squared_lpf + m1b * g_squared
-        self.g_norm_squared_lpf = b * self.g_norm_squared_lpf + m1b * g_norm_squared
+        # self.g_squared_lpf = b * self.g_squared_lpf + m1b * g_squared
+        self.g_norm_squared_lpf = b * self.g_norm_squared_lpf \
+            + m1b * g_norm_squared
        self.g_norm_lpf = b * self.g_norm_lpf + m1b * g_norm
        self.h_min_lpf = b * self.h_min_lpf + m1b * h_min_t
        self.h_max_lpf = b * self.h_max_lpf + m1b * h_max_t
        g_avg = debias * self.g_lpf
-        #g_squared_avg = debias * self.g_squared_lpf
+        # g_squared_avg = debias * self.g_squared_lpf
        g_norm_squared_avg = debias * self.g_norm_squared_lpf
        g_norm_avg = debias * self.g_norm_lpf
        self.h_min = debias * self.h_min_lpf
        self.h_max = debias * self.h_max_lpf
        assert self.h_max >= self.h_min
        dist = g_norm_avg / g_norm_squared_avg
        self.dist_lpf = b * self.dist_lpf + m1b * dist
        self.dist_avg = debias * self.dist_lpf
        self.g_var = g_norm_squared_avg - np.sum(np.square(g_avg))
        # equivalently:
-        #self.g_var = np.sum(np.abs(g_squared_avg - np.square(g_avg)))
+        # self.g_var = np.sum(np.abs(g_squared_avg - np.square(g_avg)))
        if self.step > 0:
            lr_for_real, mu_for_real = self.get_lr_mu()

@@ -419,6 +437,7 @@ class YellowFin(Optimizer):
        self.beta_t *= self.beta
        return V

class AddSign(Optimizer):
    # paper: https://arxiv.org/abs/1709.07417

@@ -438,10 +457,11 @@ class AddSign(Optimizer):
        self.accum[:] = self.accum * self.mu + dW
        signed = np.sign(dW) * np.sign(self.accum)
-        #signed *= decay
+        # signed *= decay
        return -self.lr * dW * (self.alpha + signed)

class PowerSign(Optimizer):
    # paper: https://arxiv.org/abs/1709.07417

@@ -462,13 +482,14 @@ class PowerSign(Optimizer):
        self.accum[:] = self.accum * self.mu + dW
        signed = np.sign(dW) * np.sign(self.accum)
-        #signed *= decay
+        # signed *= decay
        if self.use_exp:
            return -self.lr * dW * np.exp(signed)
        else:
            return -self.lr * dW * np.power(self.alpha, signed)

class Neumann(Optimizer):
    # paper: https://arxiv.org/abs/1712.03298
    # NOTE: this implementation is missing resetting as described in the paper.

@@ -478,20 +499,20 @@ class Neumann(Optimizer):
    # it seems like using a Learner like SineCLR makes this unnecessary.
    def __init__(self, lr=0.01):
        self.alpha = _f(1e-7)  # cubic.
        self.beta = _f(1e-5)  # repulsive. NOTE: multiplied by len(dW) later.
        self.gamma = _f(0.99)  # EMA, or 1-pole low-pass parameter. same thing.
        # momentum is ∝ (in the shape of) 1 - 1/(1 + t)
        self.mu_min = _f(0.5)
        self.mu_max = _f(0.9)
        self.reset_period = 0  # TODO
        super().__init__(lr)

    def reset(self):
        # NOTE: mt and vt are different than the pair in Adam-like optimizers.
        self.mt = None  # momentum accumulator.
        self.vt = None  # weight accumulator.
        self.t = 0

    def compute(self, dW, W):

@@ -510,7 +531,7 @@ class Neumann(Optimizer):
            return

        # momentum quantity:
        mu = _1 - _1/_f(self.t)  # the + 1 is implicit.
        mu = (mu + self.mu_min) * (self.mu_max - self.mu_min)

        # smoothed change in weights:

@@ -529,4 +550,3 @@ class Neumann(Optimizer):
        # weights and accumulator:
        W += mu * self.mt - self.lr * dt
        self.vt = W + self.gamma * (self.vt - W)
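
Aside (not part of the commit): the Adam hunk above keeps mt and vt as first-order low-pass filters of the gradient and its square, then divides by (1 - b**t) to debias them. The same step, written as a standalone function:

import numpy as np

def adam_step(dW, state, lr=0.002, b1=0.9, b2=0.999, eps=1e-8):
    if not state:
        state.update(mt=np.zeros_like(dW), vt=np.zeros_like(dW),
                     b1_t=1.0, b2_t=1.0)
    state['b1_t'] *= b1
    state['b2_t'] *= b2
    state['mt'] += (1 - b1) * (dW - state['mt'])              # low-pass of dW
    state['vt'] += (1 - b2) * (np.square(dW) - state['vt'])   # low-pass of dW**2
    mhat = state['mt'] / (1 - state['b1_t'])                  # bias correction
    vhat = state['vt'] / (1 - state['b2_t'])
    return -lr * mhat / (np.sqrt(vhat) + eps)

state = {}
W = np.zeros(3)
for _ in range(5):
    W += adam_step(np.array([1.0, -2.0, 0.5]), state)
print(W)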

View file

@@ -2,9 +2,10 @@ import numpy as np
from .float import *

class Optimizer:
    def __init__(self, lr=0.1):
        self.lr = _f(lr)  # learning rate
        self.reset()

    def reset(self):

@@ -15,5 +16,3 @@ class Optimizer:
    def update(self, dW, W):
        W += self.compute(dW, W)

View file

@@ -4,6 +4,7 @@ from .float import *
from .layer_base import *
from .initialization import *

class Bias(Layer):
    # TODO: support axes other than -1 and shapes other than 1D.

@@ -28,6 +29,7 @@ class Bias(Layer):
        self.biases.g += dY.sum(0)
        return dY

class Dense(Layer):
    serialized = {
        'W': 'coeffs',

@@ -38,8 +40,10 @@ class Dense(Layer):
        super().__init__()
        self.dim = int(dim)
        self.output_shape = (dim,)
-        self.coeffs = self._new_weights('coeffs', init=init, regularizer=reg_w)
-        self.biases = self._new_weights('biases', init=init_zeros, regularizer=reg_b)
+        self.coeffs = self._new_weights('coeffs', init=init,
+                                        regularizer=reg_w)
+        self.biases = self._new_weights('biases', init=init_zeros,
+                                        regularizer=reg_b)

    def make_shape(self, parent):
        shape = parent.output_shape

@@ -101,18 +105,20 @@ class Conv1Dper(Layer):
    def forward(self, X):
        if self.wrap0 == 0:
-            Xper = np.hstack((X,X[:,:self.wrap1]))
+            Xper = np.hstack((X, X[:, :self.wrap1]))
        elif self.wrap1 == 0:
-            Xper = np.hstack((X[:,-self.wrap0:],X))
+            Xper = np.hstack((X[:, -self.wrap0:], X))
        else:
-            Xper = np.hstack((X[:,-self.wrap0:],X,X[:,:self.wrap1]))
+            Xper = np.hstack((X[:, -self.wrap0:], X, X[:, :self.wrap1]))
        self.cols = rolling_batch(Xper, self.kernel_size)
-        convolved = (self.cols * self.coeffs.f[:,::-1]).sum(2)
+        convolved = (self.cols * self.coeffs.f[:, ::-1]).sum(2)
        return convolved

    def backward(self, dY):
-        self.coeffs.g += (dY[:,:,None] * self.cols).sum(0)[:,::-1].sum(0, keepdims=True)
-        return (dY[:,:,None] * self.coeffs.f[:,::-1]).sum(2)
+        self.coeffs.g += (dY[:, :, None] * self.cols).sum(0)[:, ::-1].sum(
+            0, keepdims=True)
+        return (dY[:, :, None] * self.coeffs.f[:, ::-1]).sum(2)

class LayerNorm(Layer):
    # paper: https://arxiv.org/abs/1607.06450

@@ -168,7 +174,8 @@ class LayerNorm(Layer):
        return dX

-class Denses(Layer): # TODO: rename?
+class Denses(Layer):  # TODO: rename?
    # acts as a separate Dense for each row or column. only for 2D arrays.
    serialized = {

@@ -176,13 +183,16 @@ class Denses(Layer): # TODO: rename?
        'b': 'biases',
    }

-    def __init__(self, dim, init=init_he_uniform, reg_w=None, reg_b=None, axis=-1):
+    def __init__(self, dim, init=init_he_uniform,
+                 reg_w=None, reg_b=None, axis=-1):
        super().__init__()
        self.dim = int(dim)
        self.weight_init = init
        self.axis = int(axis)
-        self.coeffs = self._new_weights('coeffs', init=init, regularizer=reg_w)
-        self.biases = self._new_weights('biases', init=init_zeros, regularizer=reg_b)
+        self.coeffs = self._new_weights('coeffs', init=init,
+                                        regularizer=reg_w)
+        self.biases = self._new_weights('biases', init=init_zeros,
+                                        regularizer=reg_b)

    def make_shape(self, parent):
        shape = parent.output_shape

@@ -220,9 +230,11 @@ class Denses(Layer): # TODO: rename?
        self.coeffs.g += np.einsum('ijx,ijk->jxk', self.X, dY)
        return np.einsum('ijk,jxk->ijx', dY, self.coeffs.f)

class CosineDense(Dense):
    # paper: https://arxiv.org/abs/1702.05870
-    # another implementation: https://github.com/farizrahman4u/keras-contrib/pull/36
+    # another implementation:
+    # https://github.com/farizrahman4u/keras-contrib/pull/36
    # the paper doesn't mention bias,
    # so we treat bias as an additional weight with a constant input of 1.
    # this is correct in Dense layers, so i hope it's correct here too.

@@ -231,24 +243,25 @@ class CosineDense(Dense):
    def forward(self, X):
        self.X = X
-        self.X_norm = np.sqrt(np.square(X).sum(-1, keepdims=True) \
+        self.X_norm = np.sqrt(np.square(X).sum(-1, keepdims=True)
                              + 1 + self.eps)
-        self.W_norm = np.sqrt(np.square(self.coeffs.f).sum(0, keepdims=True) \
+        self.W_norm = np.sqrt(np.square(self.coeffs.f).sum(0, keepdims=True)
                              + np.square(self.biases.f) + self.eps)
        self.dot = X @ self.coeffs.f + self.biases.f
        Y = self.dot / (self.X_norm * self.W_norm)
        return Y

    def backward(self, dY):
        ddot = dY / self.X_norm / self.W_norm
-        dX_norm = -(dY * self.dot / self.W_norm).sum(-1, keepdims=True) / self.X_norm**2
-        dW_norm = -(dY * self.dot / self.X_norm).sum( 0, keepdims=True) / self.W_norm**2
+        dX_norm = -(dY * self.dot / self.W_norm).sum(-1, keepdims=True) \
+            / self.X_norm**2
+        dW_norm = -(dY * self.dot / self.X_norm).sum(0, keepdims=True) \
+            / self.W_norm**2
        self.coeffs.g += self.X.T @ ddot \
            + dW_norm / self.W_norm * self.coeffs.f
        self.biases.g += ddot.sum(0, keepdims=True) \
            + dW_norm / self.W_norm * self.biases.f
        dX = ddot @ self.coeffs.f.T + dX_norm / self.X_norm * self.X
        return dX

View file

@@ -2,9 +2,11 @@ import numpy as np
from .float import *

class Regularizer:
    pass

class L1L2(Regularizer):
    def __init__(self, l1=0.0, l2=0.0):
        self.l1 = _f(l1)

@@ -26,6 +28,7 @@ class L1L2(Regularizer):
        df += self.l2 * 2 * X
        return df

# more

class SaturateRelu(Regularizer):
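
Aside (not part of the commit): the L1L2 backward() tail above (df += self.l2 * 2 * X) is the derivative of the usual elastic-net penalty. A standalone pair for reference:

import numpy as np

def l1l2_forward(X, l1=0.0, l2=0.0):
    return l1 * np.sum(np.abs(X)) + l2 * np.sum(np.square(X))

def l1l2_backward(X, l1=0.0, l2=0.0):
    return l1 * np.sign(X) + l2 * 2 * X

X = np.array([-1.5, 0.0, 2.0])
print(l1l2_forward(X, l1=0.01, l2=0.001))
print(l1l2_backward(X, l1=0.01, l2=0.001))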

View file

@@ -4,6 +4,7 @@ from .float import *
from .initialization import *
from .ritual_base import *

def stochastic_multiply(W, gamma=0.5, allow_negation=False):
    # paper: https://arxiv.org/abs/1606.01981

@@ -23,6 +24,7 @@ def stochastic_multiply(W, gamma=0.5, allow_negation=False):
        mult *= np.where(samples < prob, 1, -1)
    np.multiply(W, mult, out=W)

class StochMRitual(Ritual):
    # paper: https://arxiv.org/abs/1606.01981
    # this probably doesn't make sense for regression problems,

@@ -38,8 +40,8 @@ class StochMRitual(Ritual):
    def learn(self, inputs, outputs):
        # an experiment:
-        #assert self.learner.rate < 10, self.learner.rate
-        #self.gamma = 1 - 1/2**(1 - np.log10(self.learner.rate))
+        # assert self.learner.rate < 10, self.learner.rate
+        # self.gamma = 1 - 1/2**(1 - np.log10(self.learner.rate))

        self.W[:] = self.model.W
        for layer in self.model.ordered_nodes:

@@ -57,6 +59,7 @@ class StochMRitual(Ritual):
            np.clip(layer.W, -layer.std * f, layer.std * f, out=layer.W)
        # np.clip(layer.W, -1, 1, out=layer.W)

class NoisyRitual(Ritual):
    def __init__(self, learner=None,
                 input_noise=0, output_noise=0, gradient_noise=0):

@@ -69,7 +72,7 @@ class NoisyRitual(Ritual):
        # this is pretty crude
        if self.input_noise > 0:
            s = self.input_noise
            inputs = inputs + np.random.normal(0, s, size=inputs.shape)
        if self.output_noise > 0:
            s = self.output_noise
            outputs = outputs + np.random.normal(0, s, size=outputs.shape)

@@ -80,11 +83,10 @@ class NoisyRitual(Ritual):
        if self.gradient_noise > 0:
            size = len(self.model.dW)
            gamma = 0.55
-            #s = self.gradient_noise / (1 + self.bn) ** gamma
+            # s = self.gradient_noise / (1 + self.bn) ** gamma

            # experiments:
            s = self.gradient_noise * np.sqrt(self.learner.rate)
-            #s = np.square(self.learner.rate)
-            #s = self.learner.rate / self.en
+            # s = np.square(self.learner.rate)
+            # s = self.learner.rate / self.en

            self.model.dW += np.random.normal(0, max(s, 1e-8), size=size)
        super().update()

View file

@@ -3,7 +3,8 @@ import numpy as np
from .float import *

-class Ritual: # i'm just making up names at this point.
+class Ritual:  # i'm just making up names at this point.
    def __init__(self, learner=None):
        self.learner = learner if learner is not None else Learner(Optimizer())
        self.model = None

@@ -77,7 +78,8 @@ class Ritual: # i'm just making up names at this point.
        if shuffle:
            if gen:
-                raise Exception("shuffling is incompatibile with using a generator.")
+                raise Exception(
+                    "shuffling is incompatibile with using a generator.")
            indices = np.arange(inputs.shape[0])
            np.random.shuffle(indices)
            inputs = inputs[indices]

@@ -90,7 +92,7 @@ class Ritual: # i'm just making up names at this point.
            batch_count = inputs.shape[0] // batch_size
            # TODO: lift this restriction
            assert inputs.shape[0] % batch_size == 0, \
                "inputs is not evenly divisible by batch_size"

        prev_batch_size = None
        for b in range(batch_count):

@@ -101,17 +103,20 @@ class Ritual: # i'm just making up names at this point.
                batch_inputs, batch_outputs = next(generator)
                batch_size = batch_inputs.shape[0]
                # TODO: lift this restriction
-                assert batch_size == prev_batch_size or prev_batch_size is None, \
-                    "non-constant batch size (got {}, expected {})".format(batch_size, prev_batch_size)
+                fmt = "non-constant batch size (got {}, expected {})"
+                assert (batch_size == prev_batch_size
+                        or prev_batch_size is None), \
+                    fmt.format(batch_size, prev_batch_size)
            else:
                bi = b * batch_size
-                batch_inputs = inputs[ bi:bi+batch_size]
+                batch_inputs = inputs[bi:bi+batch_size]
                batch_outputs = outputs[bi:bi+batch_size]

            if clear_grad:
                self.model.clear_grad()
            self._train_batch(batch_inputs, batch_outputs, b, batch_count,
-                              test_only, return_losses=='both', return_losses)
+                              test_only, return_losses == 'both',
+                              return_losses)

            prev_batch_size = batch_size

View file

@@ -1,17 +1,23 @@
import sys

def lament(*args, **kwargs):
    print(*args, file=sys.stderr, **kwargs)

def lower_priority():
    """Set the priority of the process to below-normal."""
    # via https://stackoverflow.com/a/1023269
    if sys.platform == 'win32':
        try:
-            import win32api, win32process, win32con
+            import win32api
+            import win32process
+            import win32con
            pid = win32api.GetCurrentProcessId()
-            handle = win32api.OpenProcess(win32con.PROCESS_ALL_ACCESS, True, pid)
-            win32process.SetPriorityClass(handle, win32process.BELOW_NORMAL_PRIORITY_CLASS)
+            handle = win32api.OpenProcess(
+                win32con.PROCESS_ALL_ACCESS, True, pid)
+            win32process.SetPriorityClass(
+                handle, win32process.BELOW_NORMAL_PRIORITY_CLASS)
        except ImportError:
            lament("you do not have pywin32 installed.")
            lament("the process priority could not be lowered.")

@@ -21,9 +27,12 @@ def lower_priority():
        import os
        os.nice(1)

# more

_log_was_update = False

def log(left, right, update=False):
    s = "\x1B[1m {:>20}:\x1B[0m {}".format(left, right)
    global _log_was_update

@@ -33,5 +42,6 @@ def log(left, right, update=False):
        lament(s)
    _log_was_update = update

class Dummy:
    pass

View file

@@ -1,11 +1,12 @@
import numpy as np

class Weights:
    # we may or may not contain weights -- or any information, for that matter.
    def __init__(self, **kwargs):
        self.f = None  # forward weights
        self.g = None  # backward weights (gradients)
        self.shape = None
        self.init = None
        self.allocator = None

@@ -16,7 +17,7 @@ class Weights:
    def configure(self, **kwargs):
        for k, v in kwargs.items():
            getattr(self, k)  # ensures the key already exists
            setattr(self, k, v)

    @property