From 169303813d568c82ee01c4cf362df0b48af3eafc Mon Sep 17 00:00:00 2001
From: Connor Olding
Date: Mon, 22 Jan 2018 19:40:36 +0000
Subject: [PATCH] basic PEP 8 compliance

rip readability
---
 onn/__init__.py       |   2 +-
 onn/activation.py     |  20 ++++++-
 onn/float.py          |   2 +
 onn/initialization.py |   7 +++
 onn/layer.py          |  18 ++++--
 onn/layer_base.py     |  27 +++++----
 onn/learner.py        |  25 +++++---
 onn/loss.py           |  14 ++++-
 onn/math.py           |   3 +-
 onn/model.py          |  53 +++++++++++------
 onn/nodal.py          |   6 +-
 onn/optimizer.py      | 130 ++++++++++++++++++++++++------------------
 onn/optimizer_base.py |   5 +-
 onn/parametric.py     |  59 +++++++++++--------
 onn/regularizer.py    |   3 +
 onn/ritual.py         |  16 +++---
 onn/ritual_base.py    |  19 +++----
 onn/utility.py        |  16 +++++-
 onn/weight.py         |   7 ++-
 19 files changed, 282 insertions(+), 150 deletions(-)

diff --git a/onn/__init__.py b/onn/__init__.py
index 437b3a9..c724ec8 100644
--- a/onn/__init__.py
+++ b/onn/__init__.py
@@ -1,5 +1,5 @@
 # external packages required for full functionality:
-# numpy scipy h5py sklearn dotmap
+# numpy scipy h5py sklearn
 # BIG TODO: ensure numpy isn't upcasting to float64 *anywhere*.
 # this is gonna take some work.
 
diff --git a/onn/activation.py b/onn/activation.py
index f42165e..debd19d 100644
--- a/onn/activation.py
+++ b/onn/activation.py
@@ -6,6 +6,7 @@ from scipy.special import expit as sigmoid
 from .float import *
 from .layer_base import *
 
+
 class Identity(Layer):
     def forward(self, X):
         return X
@@ -13,7 +14,8 @@ class Identity(Layer):
     def backward(self, dY):
         return dY
 
-class Sigmoid(Layer): # aka Logistic, Expit (inverse of Logit)
+
+class Sigmoid(Layer):  # aka Logistic, Expit (inverse of Logit)
     def forward(self, X):
         self.sig = sigmoid(X)
         return self.sig
@@ -21,6 +23,7 @@ class Sigmoid(Layer): # aka Logistic, Expit (inverse of Logit)
     def backward(self, dY):
         return dY * self.sig * (1 - self.sig)
 
+
 class Softplus(Layer):
     # integral of Sigmoid.
@@ -31,6 +34,7 @@ class Softplus(Layer):
     def backward(self, dY):
         return dY * sigmoid(self.X)
 
+
 class Tanh(Layer):
     def forward(self, X):
         self.sig = np.tanh(X)
@@ -39,6 +43,7 @@ class Tanh(Layer):
     def backward(self, dY):
         return dY * (1 - self.sig * self.sig)
 
+
 class LeCunTanh(Layer):
     # paper: http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf
     # paper: http://yann.lecun.com/exdb/publis/pdf/lecun-89.pdf
@@ -53,6 +58,7 @@ class LeCunTanh(Layer):
     def backward(self, dY):
         return dY * (2 / 3 * 1.7159) * (1 - self.sig * self.sig)
 
+
 class Relu(Layer):
     def forward(self, X):
         self.cond = X >= 0
@@ -61,12 +67,13 @@ class Relu(Layer):
     def backward(self, dY):
         return np.where(self.cond, dY, 0)
 
+
 class Elu(Layer):
     # paper: https://arxiv.org/abs/1511.07289
 
     def __init__(self, alpha=1):
         super().__init__()
-        self.alpha = _f(alpha) # FIXME: unused
+        self.alpha = _f(alpha)  # FIXME: unused
 
     def forward(self, X):
         self.cond = X >= 0
@@ -76,6 +83,7 @@ class Elu(Layer):
     def backward(self, dY):
         return dY * np.where(self.cond, 1, self.neg + 1)
 
+
 class GeluApprox(Layer):
     # paper: https://arxiv.org/abs/1606.08415
     # plot: https://www.desmos.com/calculator/ydzgtccsld
@@ -88,6 +96,7 @@ class GeluApprox(Layer):
     def backward(self, dY):
         return dY * self.sig * (1 + self.a * (1 - self.sig))
 
+
 class Softmax(Layer):
     def forward(self, X):
         alpha = np.max(X, axis=-1, keepdims=True)
@@ -99,6 +108,7 @@ class Softmax(Layer):
     def backward(self, dY):
         return (dY - np.sum(dY * self.sm, axis=-1, keepdims=True)) * self.sm
 
+
 class LogSoftmax(Softmax):
     def __init__(self, eps=1e-6):
         super().__init__()
@@ -110,6 +120,7 @@ class LogSoftmax(Softmax):
     def backward(self, dY):
         return dY - np.sum(dY, axis=-1, keepdims=True) * self.sm
 
+
 class Cos(Layer):
     # performs well on MNIST for some strange reason.
 
@@ -120,6 +131,7 @@ class Cos(Layer):
     def backward(self, dY):
         return dY * -np.sin(self.X)
 
+
 class Selu(Layer):
     # paper: https://arxiv.org/abs/1706.02515
 
@@ -136,6 +148,7 @@ class Selu(Layer):
     def backward(self, dY):
         return dY * self.lamb * np.where(self.cond, 1, self.neg)
 
+
 # more
 
 class TanhTest(Layer):
@@ -146,6 +159,7 @@ class TanhTest(Layer):
     def backward(self, dY):
         return dY * (1 / 2 * 2.4004) * (1 - self.sig * self.sig)
 
+
 class ExpGB(Layer):
     # an output layer for one-hot classification problems.
     # use with MSE (SquaredHalved), not CategoricalCrossentropy!
@@ -163,6 +177,7 @@ class ExpGB(Layer):
         # this gradient is intentionally incorrect.
         return dY
 
+
 class CubicGB(Layer):
     # an output layer for one-hot classification problems.
     # use with MSE (SquaredHalved), not CategoricalCrossentropy!
@@ -182,4 +197,3 @@ class CubicGB(Layer):
     def backward(self, dY):
         # this gradient is intentionally incorrect.
         return dY
-
diff --git a/onn/float.py b/onn/float.py
index 7d3b2ca..e4e8945 100644
--- a/onn/float.py
+++ b/onn/float.py
@@ -2,11 +2,13 @@ import numpy as np
 
 _f = np.float32
 
+
 def _check(a):
     assert isinstance(a, np.ndarray) or type(a) == _f, type(a)
     assert a.dtype == _f, a.dtype
     return a
 
+
 _0 = _f(0)
 _1 = _f(1)
 _2 = _f(2)
diff --git a/onn/initialization.py b/onn/initialization.py
index 46916c5..6431744 100644
--- a/onn/initialization.py
+++ b/onn/initialization.py
@@ -2,28 +2,35 @@ import numpy as np
 
 # note: these are currently only implemented for 2D shapes.
 
+
 def init_zeros(size, ins=None, outs=None):
     return np.zeros(size)
 
+
 def init_ones(size, ins=None, outs=None):
     return np.ones(size)
 
+
 def init_he_normal(size, ins, outs):
     s = np.sqrt(2 / ins)
     return np.random.normal(0, s, size=size)
 
+
 def init_he_uniform(size, ins, outs):
     s = np.sqrt(6 / ins)
     return np.random.uniform(-s, s, size=size)
 
+
 def init_glorot_normal(size, ins, outs):
     s = np.sqrt(2 / (ins + outs))
     return np.random.normal(0, s, size=size)
 
+
 def init_glorot_uniform(size, ins, outs):
     s = np.sqrt(6 / (ins + outs))
     return np.random.uniform(-s, s, size=size)
 
+
 # more
 
 def init_gaussian_unit(size, ins, outs):
diff --git a/onn/layer.py b/onn/layer.py
index e3d33bc..67fbc9a 100644
--- a/onn/layer.py
+++ b/onn/layer.py
@@ -2,6 +2,7 @@ from .layer_base import *
 from .initialization import *
 from .float import *
 
+
 # Nonparametric Layers {{{1
 
 class Input(Layer):
@@ -16,9 +17,10 @@
         return X
 
     def backward(self, dY):
-        #self.dY = dY
+        # self.dY = dY
         return np.zeros_like(dY)
 
+
 class Reshape(Layer):
     def __init__(self, new_shape):
         super().__init__()
@@ -33,6 +35,7 @@ class Reshape(Layer):
         assert dY.shape[0] == self.batch_size
         return dY.reshape(self.batch_size, *self.input_shape)
 
+
 class Flatten(Layer):
     def make_shape(self, parent):
         shape = parent.output_shape
@@ -47,6 +50,7 @@ class Flatten(Layer):
         assert dY.shape[0] == self.batch_size
         return dY.reshape(self.batch_size, *self.input_shape)
 
+
 class ConstAffine(Layer):
     def __init__(self, a=1, b=0):
         super().__init__()
@@ -59,13 +63,15 @@ class ConstAffine(Layer):
     def backward(self, dY):
         return dY * self.a
 
+
 class Sum(Layer):
     def _propagate(self, edges, deterministic):
         return np.sum(edges, axis=0)
 
     def _backpropagate(self, edges):
-        #assert len(edges) == 1, "unimplemented"
-        return edges[0] # TODO: does this always work?
+        # assert len(edges) == 1, "unimplemented"
+        return edges[0]  # TODO: does this always work?
+
 
 class ActivityRegularizer(Layer):
     def __init__(self, reg):
@@ -81,6 +87,7 @@ class ActivityRegularizer(Layer):
     def backward(self, dY):
         return dY + self.reg.backward(self.X)
 
+
 class Dropout(Layer):
     def __init__(self, dropout=0.0):
         super().__init__()
@@ -92,12 +99,13 @@ class Dropout(Layer):
         return X * self.mask
 
     def forward_deterministic(self, X):
-        #self.mask = _1
+        # self.mask = _1
         return X
 
    def backward(self, dY):
         return dY * self.mask
 
+
 # more
 
 class AlphaDropout(Layer):
@@ -136,6 +144,7 @@ class AlphaDropout(Layer):
     def backward(self, dY):
         return dY * self.a * self.mask
 
+
 class Decimate(Layer):
     # simple decimaton layer that drops every other sample from the last axis.
 
@@ -168,6 +177,7 @@ class Decimate(Layer):
         dX.ravel()[1::2] = dY.ravel()
         return dX
 
+
 class Undecimate(Layer):
     # inverse operation of Decimate. not quite interpolation.
diff --git a/onn/layer_base.py b/onn/layer_base.py
index 1ef1781..cbc0a8d 100644
--- a/onn/layer_base.py
+++ b/onn/layer_base.py
@@ -4,26 +4,29 @@ from collections import defaultdict, OrderedDict
 
 from .weight import *
 
+
 # used for numbering layers like Keras:
 _layer_counters = defaultdict(lambda: 0)
 
+
 class LayerIncompatibility(Exception):
     pass
 
+
 class Layer:
     def __init__(self):
         self.parents = []
         self.children = []
         self.weights = OrderedDict()
-        self.loss = None # for activity regularizers
+        self.loss = None  # for activity regularizers
         self.input_shape = None
         self.output_shape = None
         kind = self.__class__.__name__
         global _layer_counters
         _layer_counters[kind] += 1
         self.name = "{}_{}".format(kind, _layer_counters[kind])
-        self.unsafe = False # disables assertions for better performance
-        self.shared = False # as in weight sharing
+        self.unsafe = False  # disables assertions for better performance
+        self.shared = False  # as in weight sharing
 
     def __str__(self):
         return self.name
@@ -40,9 +43,9 @@ class Layer:
         raise NotImplementedError("unimplemented", self)
 
     def make_shape(self, parent):
-        if self.input_shape == None:
+        if self.input_shape is None:
             self.input_shape = parent.output_shape
-        if self.output_shape == None:
+        if self.output_shape is None:
             self.output_shape = self.input_shape
 
     def do_feed(self, child):
@@ -75,16 +78,19 @@ class Layer:
             child.make_shape(self)
         if not child.is_compatible(self):
             fmt = "{} is incompatible with {}: shape mismatch: {} vs. {}"
-            raise LayerIncompatibility(fmt.format(self, child, self.output_shape, child.input_shape))
+            raise LayerIncompatibility(fmt.format(
+                self, child, self.output_shape, child.input_shape))
         self.do_feed(child)
         child.be_fed(self)
         return child
 
     def validate_input(self, X):
-        assert X.shape[1:] == self.input_shape, (str(self), X.shape[1:], self.input_shape)
+        assert X.shape[1:] == self.input_shape, \
+            (str(self), X.shape[1:], self.input_shape)
 
     def validate_output(self, Y):
-        assert Y.shape[1:] == self.output_shape, (str(self), Y.shape[1:], self.output_shape)
+        assert Y.shape[1:] == self.output_shape, \
+            (str(self), Y.shape[1:], self.output_shape)
 
     def _new_weights(self, name, **kwargs):
         w = Weights(**kwargs)
@@ -93,9 +99,10 @@ class Layer:
         return w
 
     def share(self, node):
-        self.weights = node.weights # TODO: this should be all it takes.
+        self.weights = node.weights  # TODO: this should be all it takes.
         for k, v in self.weights.items():
-            vs = getattr(node, k) # hack: key isn't necessarily attribute name!
+            # hack: key isn't necessarily attribute name!
+            vs = getattr(node, k)
             setattr(self, k, vs)
         self.shared = True
 
diff --git a/onn/learner.py b/onn/learner.py
index 148ab79..8673f9a 100644
--- a/onn/learner.py
+++ b/onn/learner.py
@@ -1,13 +1,14 @@
 from .float import *
 from .optimizer_base import *
 
+
 class Learner:
     per_batch = False
 
     def __init__(self, optim, epochs=100, rate=None):
         assert isinstance(optim, Optimizer)
         self.optim = optim
-        self.start_rate = rate # None is okay; it'll use optim.lr instead.
+        self.start_rate = rate  # None is okay; it'll use optim.lr instead.
         self.epochs = int(epochs)
         self.reset()
 
@@ -49,7 +50,7 @@ class Learner:
             return False
         return True
 
-    def batch(self, progress): # TODO: rename
+    def batch(self, progress):  # TODO: rename
         # interpolates rates between epochs.
         # unlike epochs, we do not store batch number as a state.
         # i.e. calling next() will not respect progress.
@@ -60,6 +61,7 @@ class Learner:
     def final_rate(self):
         return self.rate_at(self.epochs - 1e-8)
 
+
 class AnnealingLearner(Learner):
     def __init__(self, optim, epochs=100, rate=None, halve_every=10):
         self.halve_every = _f(halve_every)
@@ -69,10 +71,12 @@ class AnnealingLearner(Learner):
     def rate_at(self, epoch):
         return super().rate_at(epoch) * self.anneal**epoch
 
+
 def cosmod(x):
     # plot: https://www.desmos.com/calculator/hlgqmyswy2
     return (_1 + np.cos((x % _1) * _pi)) * _inv2
 
+
 class SGDR(Learner):
     # Stochastic Gradient Descent with Restarts
     # paper: https://arxiv.org/abs/1608.03983
@@ -112,7 +116,8 @@ class SGDR(Learner):
             raise Exception('this should never happen.')
 
     def rate_at(self, epoch):
-        base_rate = self.start_rate if self.start_rate is not None else self.optim.lr
+        sr = self.start_rate
+        base_rate = sr if sr is not None else self.optim.lr
         restart, sub_epoch, next_restart = self.split_num(max(1, epoch))
         x = _f(sub_epoch - 1) / _f(next_restart)
         return base_rate * self.decay**_f(restart) * cosmod(x)
@@ -126,6 +131,7 @@ class SGDR(Learner):
             self.restart_callback(restart)
         return True
 
+
 class TriangularCLR(Learner):
     per_batch = True
 
@@ -141,11 +147,14 @@ class TriangularCLR(Learner):
     def _t(self, epoch):
         # NOTE: this could probably be simplified
         offset = self.frequency / 2
-        return np.abs(((epoch - 1 + offset) % self.frequency) - offset) / offset
+        return np.abs(((epoch - 1 + offset) % self.frequency) - offset) \
+            / offset
 
     def rate_at(self, epoch):
-        upper_rate = self.start_rate if self.start_rate is not None else self.optim.lr
-        return self._t(epoch) * (upper_rate - self.lower_rate) + self.lower_rate
+        sr = self.start_rate
+        lr = self.lower_rate
+        upper_rate = sr if sr is not None else self.optim.lr
+        return self._t(epoch) * (upper_rate - lr) + lr
 
     def next(self):
         if not super().next():
@@ -156,14 +165,17 @@ class TriangularCLR(Learner):
             self.callback(self.epoch // self.frequency)
         return True
 
+
 class SineCLR(TriangularCLR):
     def _t(self, epoch):
         return np.sin(_pi * _inv2 * super()._t(epoch))
 
+
 class WaveCLR(TriangularCLR):
     def _t(self, epoch):
         return _inv2 * (_1 - np.cos(_pi * super()._t(epoch)))
 
+
 # more
 
 class PolyLearner(Learner):
@@ -177,4 +189,3 @@ class PolyLearner(Learner):
         progress = (epoch - 1) / (self.epochs)
         ret = np.polyval(self.coeffs, progress)
         return np.abs(ret)
-
diff --git a/onn/loss.py b/onn/loss.py
index 2faa309..28167ec 100644
--- a/onn/loss.py
+++ b/onn/loss.py
@@ -2,6 +2,7 @@ import numpy as np
 
 from .float import *
 
+
 class Loss:
     def forward(self, p, y):
         raise NotImplementedError("unimplemented", self)
@@ -9,7 +10,8 @@ class Loss:
     def backward(self, p, y):
         raise NotImplementedError("unimplemented", self)
 
-class NLL(Loss): # Negative Log Likelihood
+
+class NLL(Loss):  # Negative Log Likelihood
     def forward(self, p, y):
         correct = p * y
         return np.mean(-correct)
@@ -17,6 +19,7 @@ class NLL(Loss): # Negative Log Likelihood
     def backward(self, p, y):
         return -y / len(p)
 
+
 class CategoricalCrossentropy(Loss):
     # lifted from theano
 
@@ -33,6 +36,7 @@ class CategoricalCrossentropy(Loss):
         df = (p - y) / (p * (1 - p))
         return df / len(y)
 
+
 class Accuracy(Loss):
     # returns percentage of categories correctly predicted.
     # utilizes argmax(), so it cannot be used for gradient descent.
@@ -45,6 +49,7 @@ class Accuracy(Loss):
     def backward(self, p, y):
         raise NotImplementedError("cannot take the gradient of Accuracy")
 
+
 class ResidualLoss(Loss):
     def forward(self, p, y):
         return np.mean(self.f(p - y))
@@ -53,6 +58,7 @@ class ResidualLoss(Loss):
         ret = self.df(p - y) / len(y)
         return ret
 
+
 class SquaredHalved(ResidualLoss):
     def f(self, r):
         return np.square(r) / 2
@@ -60,6 +66,7 @@ class SquaredHalved(ResidualLoss):
     def df(self, r):
         return r
 
+
 class Squared(ResidualLoss):
     def f(self, r):
         return np.square(r)
@@ -67,6 +74,7 @@ class Squared(ResidualLoss):
     def df(self, r):
         return 2 * r
 
+
 class Absolute(ResidualLoss):
     def f(self, r):
         return np.abs(r)
@@ -74,6 +82,7 @@ class Absolute(ResidualLoss):
     def df(self, r):
         return np.sign(r)
 
+
 class Huber(ResidualLoss):
     def __init__(self, delta=1.0):
         self.delta = _f(delta)
@@ -88,6 +97,7 @@ class Huber(ResidualLoss):
                         r,
                         self.delta * np.sign(r))
 
+
 # more
 
 class SomethingElse(ResidualLoss):
@@ -105,6 +115,7 @@ class SomethingElse(ResidualLoss):
     def df(self, r):
         return np.sign(r) * np.abs(r)**self.c
 
+
 class Confidence(Loss):
     # this isn't "confidence" in any meaningful way; (e.g. Bayesian)
     # it's just a metric of how large the value is of the predicted class.
@@ -126,4 +137,3 @@ class Confidence(Loss):
         detc = p / categories / (1 - 1/categories)
         dmax = p == np.max(p, axis=-1, keepdims=True)
         return detc * dmax
-
diff --git a/onn/math.py b/onn/math.py
index 794dfe6..9b2c90c 100644
--- a/onn/math.py
+++ b/onn/math.py
@@ -1,14 +1,15 @@
 import numpy as np
 
+
 def rolling(a, window):
     # http://stackoverflow.com/a/4924433
     shape = (a.size - window + 1, window)
     strides = (a.itemsize, a.itemsize)
     return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
 
+
 def rolling_batch(a, window):
     # same as rolling, but acts on each batch (axis 0).
     shape = (a.shape[0], a.shape[-1] - window + 1, window)
     strides = (np.prod(a.shape[1:]) * a.itemsize, a.itemsize, a.itemsize)
     return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
-
diff --git a/onn/model.py b/onn/model.py
index ddda277..ece35dc 100644
--- a/onn/model.py
+++ b/onn/model.py
@@ -5,14 +5,16 @@ from .nodal import *
 from .layer_base import *
 from .utility import *
 
+
 class Model:
-    def __init__(self, nodes_in, nodes_out, loss=None, mloss=None, unsafe=False):
+    def __init__(self, nodes_in, nodes_out,
+                 loss=None, mloss=None, unsafe=False):
         self.loss = loss if loss is not None else SquaredHalved()
         self.mloss = mloss if mloss is not None else loss
 
-        nodes_in  = [nodes_in]  if isinstance(nodes_in,  Layer) else nodes_in
+        nodes_in = [nodes_in] if isinstance(nodes_in, Layer) else nodes_in
         nodes_out = [nodes_out] if isinstance(nodes_out, Layer) else nodes_out
-        assert type(nodes_in)  == list, type(nodes_in)
+        assert type(nodes_in) == list, type(nodes_in)
         assert type(nodes_out) == list, type(nodes_out)
         self.nodes_in = nodes_in
         self.nodes_out = nodes_out
@@ -29,8 +31,9 @@ class Model:
         return self.nodes
 
     def make_weights(self):
-        self.param_count = sum((node.size for node in self.nodes if not node.shared))
-        self.W  = np.zeros(self.param_count, dtype=_f)
+        self.param_count = sum((node.size for node in self.nodes
+                                if not node.shared))
+        self.W = np.zeros(self.param_count, dtype=_f)
         self.dW = np.zeros(self.param_count, dtype=_f)
 
         offset = 0
@@ -47,37 +50,42 @@ class Model:
                 assert size == len(ret[0]), (size, len(ret[0]))
                 return ret
 
+            fmt = "Layer {} allocated {} weights than it said it would"
             node.init(allocate)
-            assert inner_offset <= node.size, "Layer {} allocated more weights than it said it would".format(node)
+            assert inner_offset <= node.size, fmt.format("more", node)
             # i don't care if "less" is grammatically incorrect.
            # you're mom is grammatically incorrect.
-            assert inner_offset >= node.size, "Layer {} allocated less weights than it said it would".format(node)
+            assert inner_offset >= node.size, fmt.format("less", node)
             offset += node.size
 
     def evaluate(self, input_, deterministic=True):
-        assert len(self.nodes_in) == 1, "ambiguous input in multi-input network; use evaluate_multi() instead"
-        assert len(self.nodes_out) == 1, "ambiguous output in multi-output network; use evaluate_multi() instead"
+        fmt = "ambiguous input in multi-{} network; use {}() instead"
+        assert len(self.nodes_in) == 1, fmt.format("input", "evaluate_multi")
+        assert len(self.nodes_out) == 1, fmt.format("output", "evaluate_multi")
         node_in = self.nodes_in[0]
         node_out = self.nodes_out[0]
         outputs = self.evaluate_multi({node_in: input_}, deterministic)
         return outputs[node_out]
 
-    def apply(self, error): # TODO: better name?
-        assert len(self.nodes_in) == 1, "ambiguous input in multi-input network; use apply_multi() instead"
-        assert len(self.nodes_out) == 1, "ambiguous output in multi-output network; use apply_multi() instead"
+    def apply(self, error):  # TODO: better name?
+ fmt = "ambiguous input in multi-{} network; use {}() instead" + assert len(self.nodes_in) == 1, fmt.format("input", "apply_multi") + assert len(self.nodes_out) == 1, fmt.format("output", "apply_multi") node_in = self.nodes_in[0] node_out = self.nodes_out[0] inputs = self.apply_multi({node_out: error}) return inputs[node_in] def evaluate_multi(self, inputs, deterministic=True): + fmt = "missing {} for node {}" values = dict() outputs = dict() for node in self.nodes: if node in self.nodes_in: - assert node in inputs, "missing input for node {}".format(node.name) + assert node in inputs, fmt.format("input", node.name) X = inputs[node] - values[node] = node._propagate(np.expand_dims(X, 0), deterministic) + values[node] = node._propagate(np.expand_dims(X, 0), + deterministic) else: values[node] = node.propagate(values, deterministic) if node in self.nodes_out: @@ -85,11 +93,12 @@ class Model: return outputs def apply_multi(self, outputs): + fmt = "missing {} for node {}" values = dict() inputs = dict() for node in reversed(self.nodes): if node in self.nodes_out: - assert node in outputs, "missing output for node {}".format(node.name) + assert node in outputs, fmt.format("output", node.name) X = outputs[node] values[node] = node._backpropagate(np.expand_dims(X, 0)) else: @@ -135,13 +144,17 @@ class Model: def load_weights(self, fn): # seemingly compatible with keras' Dense layers. - import h5py - open(fn) # just ensure the file exists (python's error is better) - f = h5py.File(fn, 'r') weights = {} + + import h5py + open(fn) # just ensure the file exists (python's error is better) + + f = h5py.File(fn, 'r') + def visitor(name, obj): if isinstance(obj, h5py.Dataset): weights[name.split('/')[-1]] = np.array(obj[:], dtype=_f) + f.visititems(visitor) f.close() @@ -194,5 +207,7 @@ class Model: children = [str(n) for n in node.children] if children: sep = '->' - print('\t' + str(node) + sep + (';\n\t' + str(node) + sep).join(children) + ';', file=file) + print('\t' + str(node) + sep + + (';\n\t' + str(node) + sep).join(children) + ';', + file=file) print('}', file=file) diff --git a/onn/nodal.py b/onn/nodal.py index 081b045..842c5f9 100644 --- a/onn/nodal.py +++ b/onn/nodal.py @@ -3,7 +3,8 @@ class DummyNode: def __init__(self, children=None, parents=None): self.children = children if children is not None else [] - self.parents = parents if parents is not None else [] + self.parents = parents if parents is not None else [] + def traverse(node_in, node_out, nodes=None, dummy_mode=False): # i have no idea if this is any algorithm in particular. 
@@ -27,7 +28,7 @@
         if not seen_up[node]:
             continue
         parents_added = (parent in nodes for parent in node.parents)
-        if not node in nodes and all(parents_added):
+        if node not in nodes and all(parents_added):
             nodes.append(node)
         for child in node.children:
             q.append(child)
@@ -37,6 +38,7 @@ def traverse(node_in, node_out, nodes=None, dummy_mode=False):
 
     return nodes
 
+
 def traverse_all(nodes_in, nodes_out, nodes=None):
     all_in = DummyNode(children=nodes_in)
     all_out = DummyNode(parents=nodes_out)
diff --git a/onn/optimizer.py b/onn/optimizer.py
index 370f794..4a5836d 100644
--- a/onn/optimizer.py
+++ b/onn/optimizer.py
@@ -7,9 +7,10 @@ from .utility import *
 # some of the the following optimizers are blatantly lifted from tiny-dnn:
 # https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/optimizers/optimizer.h
 
+
 class Momentum(Optimizer):
     def __init__(self, lr=0.01, mu=0.9, nesterov=False):
-        self.mu = _f(mu) # momentum
+        self.mu = _f(mu)  # momentum
         self.nesterov = bool(nesterov)
 
         super().__init__(lr)
@@ -28,6 +29,7 @@ class Momentum(Optimizer):
 
         return V
 
+
 class Adagrad(Optimizer):
     def __init__(self, lr=0.01, eps=1e-8):
         self.eps = _f(eps)
@@ -44,6 +46,7 @@ class Adagrad(Optimizer):
         self.g += np.square(dW)
         return -self.lr * dW / (np.sqrt(self.g) + self.eps)
 
+
 class RMSprop(Optimizer):
     # RMSprop generalizes* Adagrad, etc.
 
@@ -51,7 +54,7 @@ class RMSprop(Optimizer):
     # RMSprop.mu == 1
 
     def __init__(self, lr=1e-4, mu=0.99, eps=1e-8):
-        self.mu = _f(mu) # decay term
+        self.mu = _f(mu)  # decay term
         self.eps = _f(eps)
 
         # one might consider the following equation when specifying mu:
@@ -70,12 +73,13 @@ class RMSprop(Optimizer):
 
         if self.g is None:
             self.g = np.zeros_like(dW)
 
-        # basically apply a first-order low-pass filter to delta squared
+        # basically apply a first-order low-pass filter to delta squared,
         self.g += (1 - self.mu) * (np.square(dW) - self.g)
-        # finally sqrt it to complete the running root-mean-square approximation
+        # and sqrt it to complete the running root-mean-square approximation.
         return -self.lr * dW / (np.sqrt(self.g) + self.eps)
 
+
 class RMSpropCentered(Optimizer):
     # referenced TensorFlow/PyTorch.
     # paper: https://arxiv.org/pdf/1308.0850v5.pdf
@@ -115,10 +119,11 @@ class RMSpropCentered(Optimizer):
             self.delta[:] = self.momentum * self.delta + self.lr * temp
             return -self.delta
         # PyTorch does it this way.
-        #self.delta[:] = self.momentum * self.delta + temp
-        #return -self.lr * self.delta
+        # self.delta[:] = self.momentum * self.delta + temp
+        # return -self.lr * self.delta
         # they are equivalent only when LR is constant, which it might not be.
 
+
 class Adam(Optimizer):
     # paper: https://arxiv.org/abs/1412.6980
     # Adam generalizes* RMSprop, and
@@ -130,10 +135,10 @@ class Adam(Optimizer):
     # Adam.b2 == RMSprop.mu
 
     def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8):
-        self.b1 = _f(b1) # decay term
-        self.b2 = _f(b2) # decay term
-        self.b1_t_default = _f(b1) # decay term power t
-        self.b2_t_default = _f(b2) # decay term power t
+        self.b1 = _f(b1)  # decay term
+        self.b2 = _f(b2)  # decay term
+        self.b1_t_default = _f(b1)  # decay term power t
+        self.b2_t_default = _f(b2)  # decay term power t
         self.eps = _f(eps)
 
         super().__init__(lr)
@@ -159,18 +164,20 @@ class Adam(Optimizer):
         self.vt += (1 - self.b2) * (np.square(dW) - self.vt)
 
         return -self.lr * (self.mt / (1 - self.b1_t)) \
-                   / (np.sqrt(self.vt / (1 - self.b2_t)) + self.eps)
+            / (np.sqrt(self.vt / (1 - self.b2_t)) + self.eps)
+
 
 class Nadam(Optimizer):
     # paper: https://arxiv.org/abs/1412.6980
     # paper: http://cs229.stanford.edu/proj2015/054_report.pdf
     # TODO: double-check this implementation. also read the damn paper.
-    # lifted from https://github.com/fchollet/keras/blob/5d38b04/keras/optimizers.py#L530
-    # lifted from https://github.com/jpilaul/IFT6266_project/blob/master/Models/Algo_Momentum.py
+    # lifted from:
+    # https://github.com/fchollet/keras/blob/5d38b04/keras/optimizers.py#L530
+    # https://github.com/jpilaul/IFT6266_project/blob/master/Models/Algo_Momentum.py
 
     def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8):
-        self.b1 = _f(b1) # decay term
-        self.b2 = _f(b2) # decay term
+        self.b1 = _f(b1)  # decay term
+        self.b2 = _f(b2)  # decay term
         self.eps = _f(eps)
 
         super().__init__(lr)
@@ -208,6 +215,7 @@ class Nadam(Optimizer):
 
         return -self.lr * mt_bar / (np.sqrt(vtp) + self.eps)
 
+
 # more
 
 class FTML(Optimizer):
@@ -216,8 +224,8 @@ class FTML(Optimizer):
 
     def __init__(self, lr=0.0025, b1=0.6, b2=0.999, eps=1e-8):
         self.iterations = _0
-        self.b1 = _f(b1) # decay term
-        self.b2 = _f(b2) # decay term
+        self.b1 = _f(b1)  # decay term
+        self.b2 = _f(b2)  # decay term
         self.eps = _f(eps)
 
         super().__init__(lr)
@@ -231,10 +239,14 @@ class FTML(Optimizer):
         self.b2_t = _1
 
     def compute(self, dW, W):
-        if self.dt1 is None: self.dt1 = np.zeros_like(dW)
-        if self.dt is None: self.dt = np.zeros_like(dW)
-        if self.vt is None: self.vt = np.zeros_like(dW)
-        if self.zt is None: self.zt = np.zeros_like(dW)
+        if self.dt1 is None:
+            self.dt1 = np.zeros_like(dW)
+        if self.dt is None:
+            self.dt = np.zeros_like(dW)
+        if self.vt is None:
+            self.vt = np.zeros_like(dW)
+        if self.zt is None:
+            self.zt = np.zeros_like(dW)
 
         # NOTE: we could probably rewrite these equations to avoid this copy.
         self.dt1[:] = self.dt[:]
@@ -260,6 +272,7 @@ class FTML(Optimizer):
         # subtract by weights to avoid having to override self.update.
         return -self.zt / self.dt - W
 
+
 class MomentumClip(Optimizer):
     def __init__(self, lr=0.01, mu=0.9, nesterov=False, clip=1.0, debug=False):
         self.mu = _f(mu)
@@ -289,22 +302,25 @@ class MomentumClip(Optimizer):
         else:
             return -self.lr * self.accum
 
+
 class YellowFin(Optimizer):
     # paper: https://arxiv.org/abs/1706.03471
     # knowyourmeme: http://cs.stanford.edu/~zjian/project/YellowFin/
-    # author's implementation: https://github.com/JianGoForIt/YellowFin/blob/master/tuner_utils/yellowfin.py
-    # code lifted: https://gist.github.com/botev/f8b32c00eafee222e47393f7f0747666
+    # author's implementation:
+    # https://github.com/JianGoForIt/YellowFin/blob/master/tuner_utils/yellowfin.py
+    # code lifted:
+    # https://gist.github.com/botev/f8b32c00eafee222e47393f7f0747666
 
     def __init__(self, lr=0.1, mu=0.0, beta=0.999,
                  window_size=20, debias=True, clip=1.0):
         self.lr_default = _f(lr)
         self.mu_default = _f(mu)
         self.beta = _f(beta)
-        self.window_size = int(window_size) # curv_win_width
+        self.window_size = int(window_size)  # curv_win_width
         self.debias_enabled = bool(debias)
         self.clip = _f(clip)
 
-        self.mu = _f(mu) # momentum
+        self.mu = _f(mu)  # momentum
         super().__init__(lr)
 
     def reset(self):
@@ -316,13 +332,13 @@ class YellowFin(Optimizer):
         self.step = 0
         self.beta_t = self.beta
 
-        self.curv_win = np.zeros([self.window_size,], dtype=np.float32)
+        self.curv_win = np.zeros([self.window_size, ], dtype=np.float32)
 
         self.h_min = None
         self.h_max = None
 
         self.g_lpf = 0
-        #self.g_squared_lpf = 0
+        # self.g_squared_lpf = 0
         self.g_norm_squared_lpf = 0
         self.g_norm_lpf = 0
         self.h_min_lpf = 0
@@ -332,7 +348,8 @@ class YellowFin(Optimizer):
         self.mu_lpf = 0
 
     def get_lr_mu(self):
-        p = (np.square(self.dist_avg) * np.square(self.h_min)) / (2 * self.g_var)
+        p = (np.square(self.dist_avg) * np.square(self.h_min)) \
+            / (2 * self.g_var)
         w3 = p * (np.sqrt(0.25 + p / 27.0) - 0.5)
         w = np.power(w3, 1/3)
         y = w - p / (3 * w)
@@ -360,11 +377,11 @@ class YellowFin(Optimizer):
             total_norm = np.linalg.norm(dW)
             clip_scale = self.clip / (total_norm + 1e-6)
             if clip_scale < 1:
-                #print("clipping gradients; norm: {:10.5f}".format(total_norm))
+                # print("clipping gradients; norm: {:10.5f}".format(total_norm))
                 dW *= clip_scale
 
-        #fmt = 'W std: {:10.7f}e-3, dWstd: {:10.7f}e-3, V std: {:10.7f}e-3'
-        #print(fmt.format(np.std(W), np.std(dW) * 100, np.std(V) * 100))
+        # fmt = 'W std: {:10.7f}e-3, dWstd: {:10.7f}e-3, V std: {:10.7f}e-3'
+        # print(fmt.format(np.std(W), np.std(dW) * 100, np.std(V) * 100))
 
         b = self.beta
         m1b = 1 - self.beta
@@ -380,30 +397,31 @@ class YellowFin(Optimizer):
         h_min_t = np.min(valid_window)
         h_max_t = np.max(valid_window)
 
-        self.g_lpf              = b * self.g_lpf              + m1b * g
-        #self.g_squared_lpf     = b * self.g_squared_lpf      + m1b * g_squared
-        self.g_norm_squared_lpf = b * self.g_norm_squared_lpf + m1b * g_norm_squared
-        self.g_norm_lpf         = b * self.g_norm_lpf         + m1b * g_norm
-        self.h_min_lpf          = b * self.h_min_lpf          + m1b * h_min_t
-        self.h_max_lpf          = b * self.h_max_lpf          + m1b * h_max_t
+        self.g_lpf = b * self.g_lpf + m1b * g
+        # self.g_squared_lpf = b * self.g_squared_lpf + m1b * g_squared
+        self.g_norm_squared_lpf = b * self.g_norm_squared_lpf \
+            + m1b * g_norm_squared
+        self.g_norm_lpf = b * self.g_norm_lpf + m1b * g_norm
+        self.h_min_lpf = b * self.h_min_lpf + m1b * h_min_t
+        self.h_max_lpf = b * self.h_max_lpf + m1b * h_max_t
 
-        g_avg              = debias * self.g_lpf
-        #g_squared_avg     = debias * self.g_squared_lpf
+        g_avg = debias * self.g_lpf
+        # g_squared_avg = debias * self.g_squared_lpf
         g_norm_squared_avg = debias * self.g_norm_squared_lpf
-        g_norm_avg         = debias * self.g_norm_lpf
-        self.h_min         = debias * self.h_min_lpf
-        self.h_max         = debias * self.h_max_lpf
+        g_norm_avg = debias * self.g_norm_lpf
+        self.h_min = debias * self.h_min_lpf
+        self.h_max = debias * self.h_max_lpf
 
         assert self.h_max >= self.h_min
         dist = g_norm_avg / g_norm_squared_avg
 
-        self.dist_lpf       = b * self.dist_lpf + m1b * dist
+        self.dist_lpf = b * self.dist_lpf + m1b * dist
 
-        self.dist_avg       = debias * self.dist_lpf
+        self.dist_avg = debias * self.dist_lpf
 
         self.g_var = g_norm_squared_avg - np.sum(np.square(g_avg))
         # equivalently:
-        #self.g_var = np.sum(np.abs(g_squared_avg - np.square(g_avg)))
+        # self.g_var = np.sum(np.abs(g_squared_avg - np.square(g_avg)))
 
         if self.step > 0:
             lr_for_real, mu_for_real = self.get_lr_mu()
@@ -419,6 +437,7 @@ class YellowFin(Optimizer):
             self.beta_t *= self.beta
         return V
 
+
 class AddSign(Optimizer):
     # paper: https://arxiv.org/abs/1709.07417
 
@@ -438,10 +457,11 @@ class AddSign(Optimizer):
         self.accum[:] = self.accum * self.mu + dW
 
         signed = np.sign(dW) * np.sign(self.accum)
-        #signed *= decay
+        # signed *= decay
 
         return -self.lr * dW * (self.alpha + signed)
 
+
 class PowerSign(Optimizer):
     # paper: https://arxiv.org/abs/1709.07417
 
@@ -462,13 +482,14 @@ class PowerSign(Optimizer):
         self.accum[:] = self.accum * self.mu + dW
 
         signed = np.sign(dW) * np.sign(self.accum)
-        #signed *= decay
+        # signed *= decay
 
         if self.use_exp:
             return -self.lr * dW * np.exp(signed)
         else:
             return -self.lr * dW * np.power(self.alpha, signed)
 
+
 class Neumann(Optimizer):
     # paper: https://arxiv.org/abs/1712.03298
     # NOTE: this implementation is missing resetting as described in the paper.
@@ -478,20 +499,20 @@ class Neumann(Optimizer):
     # it seems like using a Learner like SineCLR makes this unnecessary.
 
     def __init__(self, lr=0.01):
-        self.alpha = _f(1e-7) # cubic.
-        self.beta = _f(1e-5) # repulsive. NOTE: multiplied by len(dW) later.
-        self.gamma = _f(0.99) # EMA, or 1-pole low-pass parameter. same thing.
+        self.alpha = _f(1e-7)  # cubic.
+        self.beta = _f(1e-5)  # repulsive. NOTE: multiplied by len(dW) later.
+        self.gamma = _f(0.99)  # EMA, or 1-pole low-pass parameter. same thing.
 
         # momentum is ∝ (in the shape of) 1 - 1/(1 + t)
         self.mu_min = _f(0.5)
         self.mu_max = _f(0.9)
-        self.reset_period = 0 # TODO
+        self.reset_period = 0  # TODO
         super().__init__(lr)
 
     def reset(self):
         # NOTE: mt and vt are different than the pair in Adam-like optimizers.
-        self.mt = None # momentum accumulator.
-        self.vt = None # weight accumulator.
+        self.mt = None  # momentum accumulator.
+        self.vt = None  # weight accumulator.
         self.t = 0
 
     def compute(self, dW, W):
@@ -510,7 +531,7 @@ class Neumann(Optimizer):
             return
 
         # momentum quantity:
-        mu = _1 - _1/_f(self.t) # the + 1 is implicit.
+        mu = _1 - _1/_f(self.t)  # the + 1 is implicit.
         mu = (mu + self.mu_min) * (self.mu_max - self.mu_min)
 
         # smoothed change in weights:
@@ -529,4 +550,3 @@ class Neumann(Optimizer):
         # weights and accumulator:
         W += mu * self.mt - self.lr * dt
         self.vt = W + self.gamma * (self.vt - W)
-
diff --git a/onn/optimizer_base.py b/onn/optimizer_base.py
index 3a90f64..95852b5 100644
--- a/onn/optimizer_base.py
+++ b/onn/optimizer_base.py
@@ -2,9 +2,10 @@ import numpy as np
 
 from .float import *
 
+
 class Optimizer:
     def __init__(self, lr=0.1):
-        self.lr = _f(lr) # learning rate
+        self.lr = _f(lr)  # learning rate
         self.reset()
 
     def reset(self):
@@ -15,5 +16,3 @@ class Optimizer:
 
     def update(self, dW, W):
         W += self.compute(dW, W)
-
-
diff --git a/onn/parametric.py b/onn/parametric.py
index 540b857..52bf0ca 100644
--- a/onn/parametric.py
+++ b/onn/parametric.py
@@ -4,6 +4,7 @@ from .float import *
 from .layer_base import *
 from .initialization import *
 
+
 class Bias(Layer):
     # TODO: support axes other than -1 and shapes other than 1D.
 
@@ -28,6 +29,7 @@ class Bias(Layer):
         self.biases.g += dY.sum(0)
         return dY
 
+
 class Dense(Layer):
     serialized = {
         'W': 'coeffs',
@@ -38,8 +40,10 @@ class Dense(Layer):
         super().__init__()
         self.dim = int(dim)
         self.output_shape = (dim,)
-        self.coeffs = self._new_weights('coeffs', init=init, regularizer=reg_w)
-        self.biases = self._new_weights('biases', init=init_zeros, regularizer=reg_b)
+        self.coeffs = self._new_weights('coeffs', init=init,
+                                        regularizer=reg_w)
+        self.biases = self._new_weights('biases', init=init_zeros,
+                                        regularizer=reg_b)
 
     def make_shape(self, parent):
         shape = parent.output_shape
@@ -101,18 +105,20 @@ class Conv1Dper(Layer):
 
     def forward(self, X):
         if self.wrap0 == 0:
-            Xper = np.hstack((X,X[:,:self.wrap1]))
+            Xper = np.hstack((X, X[:, :self.wrap1]))
         elif self.wrap1 == 0:
-            Xper = np.hstack((X[:,-self.wrap0:],X))
+            Xper = np.hstack((X[:, -self.wrap0:], X))
         else:
-            Xper = np.hstack((X[:,-self.wrap0:],X,X[:,:self.wrap1]))
+            Xper = np.hstack((X[:, -self.wrap0:], X, X[:, :self.wrap1]))
         self.cols = rolling_batch(Xper, self.kernel_size)
-        convolved = (self.cols * self.coeffs.f[:,::-1]).sum(2)
+        convolved = (self.cols * self.coeffs.f[:, ::-1]).sum(2)
         return convolved
 
     def backward(self, dY):
-        self.coeffs.g += (dY[:,:,None] * self.cols).sum(0)[:,::-1].sum(0, keepdims=True)
-        return (dY[:,:,None] * self.coeffs.f[:,::-1]).sum(2)
+        self.coeffs.g += (dY[:, :, None] * self.cols).sum(0)[:, ::-1].sum(
+            0, keepdims=True)
+        return (dY[:, :, None] * self.coeffs.f[:, ::-1]).sum(2)
+
 
 class LayerNorm(Layer):
     # paper: https://arxiv.org/abs/1607.06450
@@ -168,7 +174,8 @@ class LayerNorm(Layer):
 
         return dX
 
-class Denses(Layer): # TODO: rename?
+
+class Denses(Layer):  # TODO: rename?
     # acts as a separate Dense for each row or column. only for 2D arrays.
 
     serialized = {
         'W': 'coeffs',
         'b': 'biases',
     }
 
-    def __init__(self, dim, init=init_he_uniform, reg_w=None, reg_b=None, axis=-1):
+    def __init__(self, dim, init=init_he_uniform,
+                 reg_w=None, reg_b=None, axis=-1):
         super().__init__()
         self.dim = int(dim)
         self.weight_init = init
         self.axis = int(axis)
-        self.coeffs = self._new_weights('coeffs', init=init, regularizer=reg_w)
-        self.biases = self._new_weights('biases', init=init_zeros, regularizer=reg_b)
+        self.coeffs = self._new_weights('coeffs', init=init,
+                                        regularizer=reg_w)
+        self.biases = self._new_weights('biases', init=init_zeros,
+                                        regularizer=reg_b)
 
     def make_shape(self, parent):
         shape = parent.output_shape
@@ -220,9 +230,11 @@ class Denses(Layer): # TODO: rename?
         self.coeffs.g += np.einsum('ijx,ijk->jxk', self.X, dY)
         return np.einsum('ijk,jxk->ijx', dY, self.coeffs.f)
 
+
 class CosineDense(Dense):
     # paper: https://arxiv.org/abs/1702.05870
-    # another implementation: https://github.com/farizrahman4u/keras-contrib/pull/36
+    # another implementation:
+    # https://github.com/farizrahman4u/keras-contrib/pull/36
     # the paper doesn't mention bias,
     # so we treat bias as an additional weight with a constant input of 1.
     # this is correct in Dense layers, so i hope it's correct here too.
@@ -231,24 +243,25 @@ class CosineDense(Dense):
 
     def forward(self, X):
         self.X = X
-        self.X_norm = np.sqrt(np.square(X).sum(-1, keepdims=True) \
-          + 1 + self.eps)
-        self.W_norm = np.sqrt(np.square(self.coeffs.f).sum(0, keepdims=True) \
-          + np.square(self.biases.f) + self.eps)
+        self.X_norm = np.sqrt(np.square(X).sum(-1, keepdims=True)
+                              + 1 + self.eps)
+        self.W_norm = np.sqrt(np.square(self.coeffs.f).sum(0, keepdims=True)
+                              + np.square(self.biases.f) + self.eps)
         self.dot = X @ self.coeffs.f + self.biases.f
         Y = self.dot / (self.X_norm * self.W_norm)
         return Y
 
     def backward(self, dY):
         ddot = dY / self.X_norm / self.W_norm
-        dX_norm = -(dY * self.dot / self.W_norm).sum(-1, keepdims=True) / self.X_norm**2
-        dW_norm = -(dY * self.dot / self.X_norm).sum( 0, keepdims=True) / self.W_norm**2
+        dX_norm = -(dY * self.dot / self.W_norm).sum(-1, keepdims=True) \
+            / self.X_norm**2
+        dW_norm = -(dY * self.dot / self.X_norm).sum(0, keepdims=True) \
+            / self.W_norm**2
 
-        self.coeffs.g += self.X.T @ ddot \
-          + dW_norm / self.W_norm * self.coeffs.f
+        self.coeffs.g += self.X.T @ ddot \
+            + dW_norm / self.W_norm * self.coeffs.f
         self.biases.g += ddot.sum(0, keepdims=True) \
-          + dW_norm / self.W_norm * self.biases.f
+            + dW_norm / self.W_norm * self.biases.f
 
         dX = ddot @ self.coeffs.f.T + dX_norm / self.X_norm * self.X
         return dX
-
diff --git a/onn/regularizer.py b/onn/regularizer.py
index 9a6aebf..37887c3 100644
--- a/onn/regularizer.py
+++ b/onn/regularizer.py
@@ -2,9 +2,11 @@ import numpy as np
 
 from .float import *
 
+
 class Regularizer:
     pass
 
+
 class L1L2(Regularizer):
     def __init__(self, l1=0.0, l2=0.0):
         self.l1 = _f(l1)
@@ -26,6 +28,7 @@ class L1L2(Regularizer):
             df += self.l2 * 2 * X
         return df
 
+
 # more
 
 class SaturateRelu(Regularizer):
diff --git a/onn/ritual.py b/onn/ritual.py
index 07b4dea..96c6b4d 100644
--- a/onn/ritual.py
+++ b/onn/ritual.py
@@ -4,6 +4,7 @@ from .float import *
 from .initialization import *
 from .ritual_base import *
 
+
 def stochastic_multiply(W, gamma=0.5, allow_negation=False):
     # paper: https://arxiv.org/abs/1606.01981
 
@@ -23,6 +24,7 @@ def stochastic_multiply(W, gamma=0.5, allow_negation=False):
         mult *= np.where(samples < prob, 1, -1)
     np.multiply(W, mult, out=W)
 
+
 class StochMRitual(Ritual):
     # paper: https://arxiv.org/abs/1606.01981
     # this probably doesn't make sense for regression problems,
@@ -38,8 +40,8 @@ class StochMRitual(Ritual):
 
     def learn(self, inputs, outputs):
         # an experiment:
-        #assert self.learner.rate < 10, self.learner.rate
-        #self.gamma = 1 - 1/2**(1 - np.log10(self.learner.rate))
+        # assert self.learner.rate < 10, self.learner.rate
+        # self.gamma = 1 - 1/2**(1 - np.log10(self.learner.rate))
 
         self.W[:] = self.model.W
         for layer in self.model.ordered_nodes:
@@ -57,6 +59,7 @@ class StochMRitual(Ritual):
             np.clip(layer.W, -layer.std * f, layer.std * f, out=layer.W)
             # np.clip(layer.W, -1, 1, out=layer.W)
 
+
 class NoisyRitual(Ritual):
     def __init__(self, learner=None,
                  input_noise=0, output_noise=0, gradient_noise=0):
@@ -69,7 +72,7 @@ class NoisyRitual(Ritual):
         # this is pretty crude
         if self.input_noise > 0:
             s = self.input_noise
-            inputs  = inputs  + np.random.normal(0, s, size=inputs.shape)
+            inputs = inputs + np.random.normal(0, s, size=inputs.shape)
         if self.output_noise > 0:
             s = self.output_noise
             outputs = outputs + np.random.normal(0, s, size=outputs.shape)
@@ -80,11 +83,10 @@ class NoisyRitual(Ritual):
         if self.gradient_noise > 0:
             size = len(self.model.dW)
             gamma = 0.55
-            #s = self.gradient_noise / (1 + self.bn) ** gamma
+            # s = self.gradient_noise / (1 + self.bn) ** gamma
             # experiments:
             s = self.gradient_noise * np.sqrt(self.learner.rate)
-            #s = np.square(self.learner.rate)
-            #s = self.learner.rate / self.en
+            # s = np.square(self.learner.rate)
+            # s = self.learner.rate / self.en
             self.model.dW += np.random.normal(0, max(s, 1e-8), size=size)
         super().update()
-
diff --git a/onn/ritual_base.py b/onn/ritual_base.py
index 470a4f6..c026994 100644
--- a/onn/ritual_base.py
+++ b/onn/ritual_base.py
@@ -3,7 +3,8 @@ import numpy as np
 
 from .float import *
 
-class Ritual: # i'm just making up names at this point.
+
+class Ritual:  # i'm just making up names at this point.
     def __init__(self, learner=None):
         self.learner = learner if learner is not None else Learner(Optimizer())
         self.model = None
@@ -77,7 +78,8 @@ class Ritual: # i'm just making up names at this point.
 
         if shuffle:
             if gen:
-                raise Exception("shuffling is incompatibile with using a generator.")
+                raise Exception(
+                    "shuffling is incompatibile with using a generator.")
             indices = np.arange(inputs.shape[0])
             np.random.shuffle(indices)
             inputs = inputs[indices]
@@ -90,7 +92,7 @@ class Ritual: # i'm just making up names at this point.
             batch_count = inputs.shape[0] // batch_size
             # TODO: lift this restriction
             assert inputs.shape[0] % batch_size == 0, \
-              "inputs is not evenly divisible by batch_size"
+                "inputs is not evenly divisible by batch_size"
 
         prev_batch_size = None
         for b in range(batch_count):
@@ -101,17 +103,20 @@ class Ritual: # i'm just making up names at this point.
                 batch_inputs, batch_outputs = next(generator)
                 batch_size = batch_inputs.shape[0]
                 # TODO: lift this restriction
-                assert batch_size == prev_batch_size or prev_batch_size is None, \
-                  "non-constant batch size (got {}, expected {})".format(batch_size, prev_batch_size)
+                fmt = "non-constant batch size (got {}, expected {})"
+                assert (batch_size == prev_batch_size
+                        or prev_batch_size is None), \
+                    fmt.format(batch_size, prev_batch_size)
             else:
                 bi = b * batch_size
-                batch_inputs  = inputs[ bi:bi+batch_size]
+                batch_inputs = inputs[bi:bi+batch_size]
                 batch_outputs = outputs[bi:bi+batch_size]
 
             if clear_grad:
                 self.model.clear_grad()
             self._train_batch(batch_inputs, batch_outputs, b, batch_count,
-                              test_only, return_losses=='both', return_losses)
+                              test_only, return_losses == 'both',
+                              return_losses)
 
             prev_batch_size = batch_size
 
diff --git a/onn/utility.py b/onn/utility.py
index edd895c..9dc14f7 100644
--- a/onn/utility.py
+++ b/onn/utility.py
@@ -1,17 +1,23 @@
 import sys
 
+
 def lament(*args, **kwargs):
     print(*args, file=sys.stderr, **kwargs)
 
+
 def lower_priority():
     """Set the priority of the process to below-normal."""
     # via https://stackoverflow.com/a/1023269
     if sys.platform == 'win32':
         try:
-            import win32api, win32process, win32con
+            import win32api
+            import win32process
+            import win32con
             pid = win32api.GetCurrentProcessId()
-            handle = win32api.OpenProcess(win32con.PROCESS_ALL_ACCESS, True, pid)
-            win32process.SetPriorityClass(handle, win32process.BELOW_NORMAL_PRIORITY_CLASS)
+            handle = win32api.OpenProcess(
+                win32con.PROCESS_ALL_ACCESS, True, pid)
+            win32process.SetPriorityClass(
+                handle, win32process.BELOW_NORMAL_PRIORITY_CLASS)
         except ImportError:
             lament("you do not have pywin32 installed.")
             lament("the process priority could not be lowered.")
@@ -21,9 +27,12 @@ def lower_priority():
         import os
         os.nice(1)
 
+
 # more
 
 _log_was_update = False
+
+
 def log(left, right, update=False):
     s = "\x1B[1m {:>20}:\x1B[0m {}".format(left, right)
     global _log_was_update
@@ -33,5 +42,6 @@ def log(left, right, update=False):
         lament(s)
     _log_was_update = update
 
+
 class Dummy:
     pass
diff --git a/onn/weight.py b/onn/weight.py
index a531d64..2ea6092 100644
--- a/onn/weight.py
+++ b/onn/weight.py
@@ -1,11 +1,12 @@
 import numpy as np
 
+
 class Weights:
     # we may or may not contain weights -- or any information, for that matter.
 
     def __init__(self, **kwargs):
-        self.f = None # forward weights
-        self.g = None # backward weights (gradients)
+        self.f = None  # forward weights
+        self.g = None  # backward weights (gradients)
         self.shape = None
         self.init = None
         self.allocator = None
@@ -16,7 +17,7 @@ class Weights:
 
     def configure(self, **kwargs):
         for k, v in kwargs.items():
-            getattr(self, k) # ensures the key already exists
+            getattr(self, k)  # ensures the key already exists
             setattr(self, k, v)
 
     @property