From 169303813d568c82ee01c4cf362df0b48af3eafc Mon Sep 17 00:00:00 2001
From: Connor Olding
Date: Mon, 22 Jan 2018 19:40:36 +0000
Subject: [PATCH] basic PEP 8 compliance

rip readability
---
 onn/__init__.py       |   2 +-
 onn/activation.py     |  20 ++++++-
 onn/float.py          |   2 +
 onn/initialization.py |   7 +++
 onn/layer.py          |  18 ++++--
 onn/layer_base.py     |  27 +++++----
 onn/learner.py        |  25 +++++---
 onn/loss.py           |  14 ++++-
 onn/math.py           |   3 +-
 onn/model.py          |  53 +++++++++++------
 onn/nodal.py          |   6 +-
 onn/optimizer.py      | 130 ++++++++++++++++++++++++------------------
 onn/optimizer_base.py |   5 +-
 onn/parametric.py     |  59 +++++++++++--------
 onn/regularizer.py    |   3 +
 onn/ritual.py         |  16 +++---
 onn/ritual_base.py    |  19 +++----
 onn/utility.py        |  16 +++++-
 onn/weight.py         |   7 ++-
 19 files changed, 282 insertions(+), 150 deletions(-)

diff --git a/onn/__init__.py b/onn/__init__.py
index 437b3a9..c724ec8 100644
--- a/onn/__init__.py
+++ b/onn/__init__.py
@@ -1,5 +1,5 @@
 # external packages required for full functionality:
-# numpy scipy h5py sklearn dotmap
+# numpy scipy h5py sklearn
 # BIG TODO: ensure numpy isn't upcasting to float64 *anywhere*.
 # this is gonna take some work.
 
diff --git a/onn/activation.py b/onn/activation.py
index f42165e..debd19d 100644
--- a/onn/activation.py
+++ b/onn/activation.py
@@ -6,6 +6,7 @@ from scipy.special import expit as sigmoid
 from .float import *
 from .layer_base import *
 
+
 class Identity(Layer):
     def forward(self, X):
         return X
@@ -13,7 +14,8 @@ class Identity(Layer):
     def backward(self, dY):
         return dY
 
-class Sigmoid(Layer): # aka Logistic, Expit (inverse of Logit)
+
+class Sigmoid(Layer):  # aka Logistic, Expit (inverse of Logit)
     def forward(self, X):
         self.sig = sigmoid(X)
         return self.sig
@@ -21,6 +23,7 @@ class Sigmoid(Layer): # aka Logistic, Expit (inverse of Logit)
     def backward(self, dY):
         return dY * self.sig * (1 - self.sig)
 
+
 class Softplus(Layer):
     # integral of Sigmoid.
@@ -31,6 +34,7 @@ class Softplus(Layer):
     def backward(self, dY):
         return dY * sigmoid(self.X)
 
+
 class Tanh(Layer):
     def forward(self, X):
         self.sig = np.tanh(X)
@@ -39,6 +43,7 @@ class Tanh(Layer):
     def backward(self, dY):
         return dY * (1 - self.sig * self.sig)
 
+
 class LeCunTanh(Layer):
     # paper: http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf
     # paper: http://yann.lecun.com/exdb/publis/pdf/lecun-89.pdf
@@ -53,6 +58,7 @@ class LeCunTanh(Layer):
     def backward(self, dY):
         return dY * (2 / 3 * 1.7159) * (1 - self.sig * self.sig)
 
+
 class Relu(Layer):
     def forward(self, X):
         self.cond = X >= 0
@@ -61,12 +67,13 @@ class Relu(Layer):
     def backward(self, dY):
         return np.where(self.cond, dY, 0)
 
+
 class Elu(Layer):
     # paper: https://arxiv.org/abs/1511.07289
 
     def __init__(self, alpha=1):
         super().__init__()
-        self.alpha = _f(alpha) # FIXME: unused
+        self.alpha = _f(alpha)  # FIXME: unused
 
     def forward(self, X):
         self.cond = X >= 0
@@ -76,6 +83,7 @@ class Elu(Layer):
     def backward(self, dY):
         return dY * np.where(self.cond, 1, self.neg + 1)
 
+
 class GeluApprox(Layer):
     # paper: https://arxiv.org/abs/1606.08415
     # plot: https://www.desmos.com/calculator/ydzgtccsld
@@ -88,6 +96,7 @@ class GeluApprox(Layer):
     def backward(self, dY):
         return dY * self.sig * (1 + self.a * (1 - self.sig))
 
+
 class Softmax(Layer):
     def forward(self, X):
         alpha = np.max(X, axis=-1, keepdims=True)
@@ -99,6 +108,7 @@ class Softmax(Layer):
     def backward(self, dY):
         return (dY - np.sum(dY * self.sm, axis=-1, keepdims=True)) * self.sm
 
+
 class LogSoftmax(Softmax):
     def __init__(self, eps=1e-6):
         super().__init__()
@@ -110,6 +120,7 @@ class LogSoftmax(Softmax):
     def backward(self, dY):
         return dY - np.sum(dY, axis=-1, keepdims=True) * self.sm
 
+
 class Cos(Layer):
     # performs well on MNIST for some strange reason.
 
@@ -120,6 +131,7 @@ class Cos(Layer):
     def backward(self, dY):
         return dY * -np.sin(self.X)
 
+
 class Selu(Layer):
     # paper: https://arxiv.org/abs/1706.02515
 
@@ -136,6 +148,7 @@ class Selu(Layer):
     def backward(self, dY):
         return dY * self.lamb * np.where(self.cond, 1, self.neg)
 
+
 # more
 
 class TanhTest(Layer):
@@ -146,6 +159,7 @@ class TanhTest(Layer):
     def backward(self, dY):
         return dY * (1 / 2 * 2.4004) * (1 - self.sig * self.sig)
 
+
 class ExpGB(Layer):
     # an output layer for one-hot classification problems.
     # use with MSE (SquaredHalved), not CategoricalCrossentropy!
@@ -163,6 +177,7 @@ class ExpGB(Layer):
         # this gradient is intentionally incorrect.
         return dY
 
+
 class CubicGB(Layer):
     # an output layer for one-hot classification problems.
     # use with MSE (SquaredHalved), not CategoricalCrossentropy!
@@ -182,4 +197,3 @@ class CubicGB(Layer):
     def backward(self, dY):
         # this gradient is intentionally incorrect.
         return dY
-
diff --git a/onn/float.py b/onn/float.py
index 7d3b2ca..e4e8945 100644
--- a/onn/float.py
+++ b/onn/float.py
@@ -2,11 +2,13 @@ import numpy as np
 
 _f = np.float32
 
+
 def _check(a):
     assert isinstance(a, np.ndarray) or type(a) == _f, type(a)
     assert a.dtype == _f, a.dtype
     return a
 
+
 _0 = _f(0)
 _1 = _f(1)
 _2 = _f(2)
diff --git a/onn/initialization.py b/onn/initialization.py
index 46916c5..6431744 100644
--- a/onn/initialization.py
+++ b/onn/initialization.py
@@ -2,28 +2,35 @@ import numpy as np
 
 # note: these are currently only implemented for 2D shapes.
 
+
 def init_zeros(size, ins=None, outs=None):
     return np.zeros(size)
 
+
 def init_ones(size, ins=None, outs=None):
     return np.ones(size)
 
+
 def init_he_normal(size, ins, outs):
     s = np.sqrt(2 / ins)
     return np.random.normal(0, s, size=size)
 
+
 def init_he_uniform(size, ins, outs):
     s = np.sqrt(6 / ins)
     return np.random.uniform(-s, s, size=size)
 
+
 def init_glorot_normal(size, ins, outs):
     s = np.sqrt(2 / (ins + outs))
     return np.random.normal(0, s, size=size)
 
+
 def init_glorot_uniform(size, ins, outs):
     s = np.sqrt(6 / (ins + outs))
     return np.random.uniform(-s, s, size=size)
 
+
 # more
 
 def init_gaussian_unit(size, ins, outs):
diff --git a/onn/layer.py b/onn/layer.py
index e3d33bc..67fbc9a 100644
--- a/onn/layer.py
+++ b/onn/layer.py
@@ -2,6 +2,7 @@ from .layer_base import *
 from .initialization import *
 from .float import *
 
+
 # Nonparametric Layers {{{1
 
 class Input(Layer):
@@ -16,9 +17,10 @@
         return X
 
     def backward(self, dY):
-        #self.dY = dY
+        # self.dY = dY
         return np.zeros_like(dY)
 
+
 class Reshape(Layer):
     def __init__(self, new_shape):
         super().__init__()
@@ -33,6 +35,7 @@ class Reshape(Layer):
         assert dY.shape[0] == self.batch_size
         return dY.reshape(self.batch_size, *self.input_shape)
 
+
 class Flatten(Layer):
     def make_shape(self, parent):
         shape = parent.output_shape
@@ -47,6 +50,7 @@ class Flatten(Layer):
         assert dY.shape[0] == self.batch_size
         return dY.reshape(self.batch_size, *self.input_shape)
 
+
 class ConstAffine(Layer):
     def __init__(self, a=1, b=0):
         super().__init__()
@@ -59,13 +63,15 @@ class ConstAffine(Layer):
     def backward(self, dY):
         return dY * self.a
 
+
 class Sum(Layer):
     def _propagate(self, edges, deterministic):
         return np.sum(edges, axis=0)
 
     def _backpropagate(self, edges):
-        #assert len(edges) == 1, "unimplemented"
-        return edges[0] # TODO: does this always work?
+        # assert len(edges) == 1, "unimplemented"
+        return edges[0]  # TODO: does this always work?
+
 
 class ActivityRegularizer(Layer):
     def __init__(self, reg):
@@ -81,6 +87,7 @@ class ActivityRegularizer(Layer):
     def backward(self, dY):
         return dY + self.reg.backward(self.X)
 
+
 class Dropout(Layer):
     def __init__(self, dropout=0.0):
         super().__init__()
@@ -92,12 +99,13 @@ class Dropout(Layer):
         return X * self.mask
 
     def forward_deterministic(self, X):
-        #self.mask = _1
+        # self.mask = _1
         return X
 
    def backward(self, dY):
         return dY * self.mask
 
+
 # more
 
 class AlphaDropout(Layer):
@@ -136,6 +144,7 @@ class AlphaDropout(Layer):
     def backward(self, dY):
         return dY * self.a * self.mask
 
+
 class Decimate(Layer):
     # simple decimaton layer that drops every other sample from the last axis.
 
@@ -168,6 +177,7 @@ class Decimate(Layer):
         dX.ravel()[1::2] = dY.ravel()
         return dX
 
+
 class Undecimate(Layer):
     # inverse operation of Decimate. not quite interpolation.
diff --git a/onn/layer_base.py b/onn/layer_base.py
index 1ef1781..cbc0a8d 100644
--- a/onn/layer_base.py
+++ b/onn/layer_base.py
@@ -4,26 +4,29 @@ from collections import defaultdict, OrderedDict
 
 from .weight import *
 
+
 # used for numbering layers like Keras:
 _layer_counters = defaultdict(lambda: 0)
 
+
 class LayerIncompatibility(Exception):
     pass
 
+
 class Layer:
     def __init__(self):
         self.parents = []
         self.children = []
         self.weights = OrderedDict()
-        self.loss = None # for activity regularizers
+        self.loss = None  # for activity regularizers
         self.input_shape = None
         self.output_shape = None
         kind = self.__class__.__name__
         global _layer_counters
         _layer_counters[kind] += 1
         self.name = "{}_{}".format(kind, _layer_counters[kind])
-        self.unsafe = False # disables assertions for better performance
-        self.shared = False # as in weight sharing
+        self.unsafe = False  # disables assertions for better performance
+        self.shared = False  # as in weight sharing
 
     def __str__(self):
         return self.name
@@ -40,9 +43,9 @@ class Layer:
         raise NotImplementedError("unimplemented", self)
 
     def make_shape(self, parent):
-        if self.input_shape == None:
+        if self.input_shape is None:
             self.input_shape = parent.output_shape
-        if self.output_shape == None:
+        if self.output_shape is None:
             self.output_shape = self.input_shape
 
     def do_feed(self, child):
@@ -75,16 +78,19 @@ class Layer:
             child.make_shape(self)
         if not child.is_compatible(self):
             fmt = "{} is incompatible with {}: shape mismatch: {} vs. {}"
-            raise LayerIncompatibility(fmt.format(self, child, self.output_shape, child.input_shape))
+            raise LayerIncompatibility(fmt.format(
+                self, child, self.output_shape, child.input_shape))
         self.do_feed(child)
         child.be_fed(self)
         return child
 
     def validate_input(self, X):
-        assert X.shape[1:] == self.input_shape, (str(self), X.shape[1:], self.input_shape)
+        assert X.shape[1:] == self.input_shape, \
+            (str(self), X.shape[1:], self.input_shape)
 
     def validate_output(self, Y):
-        assert Y.shape[1:] == self.output_shape, (str(self), Y.shape[1:], self.output_shape)
+        assert Y.shape[1:] == self.output_shape, \
+            (str(self), Y.shape[1:], self.output_shape)
 
     def _new_weights(self, name, **kwargs):
         w = Weights(**kwargs)
@@ -93,9 +99,10 @@ class Layer:
         return w
 
     def share(self, node):
-        self.weights = node.weights # TODO: this should be all it takes.
+        self.weights = node.weights  # TODO: this should be all it takes.
         for k, v in self.weights.items():
-            vs = getattr(node, k) # hack: key isn't necessarily attribute name!
+            # hack: key isn't necessarily attribute name!
+            vs = getattr(node, k)
             setattr(self, k, vs)
         self.shared = True
 
diff --git a/onn/learner.py b/onn/learner.py
index 148ab79..8673f9a 100644
--- a/onn/learner.py
+++ b/onn/learner.py
@@ -1,13 +1,14 @@
 from .float import *
 from .optimizer_base import *
 
+
 class Learner:
     per_batch = False
 
     def __init__(self, optim, epochs=100, rate=None):
         assert isinstance(optim, Optimizer)
         self.optim = optim
-        self.start_rate = rate # None is okay; it'll use optim.lr instead.
+        self.start_rate = rate  # None is okay; it'll use optim.lr instead.
         self.epochs = int(epochs)
         self.reset()
 
@@ -49,7 +50,7 @@ class Learner:
             return False
         return True
 
-    def batch(self, progress): # TODO: rename
+    def batch(self, progress):  # TODO: rename
         # interpolates rates between epochs.
         # unlike epochs, we do not store batch number as a state.
         # i.e. calling next() will not respect progress.
@@ -60,6 +61,7 @@ class Learner:
     def final_rate(self):
         return self.rate_at(self.epochs - 1e-8)
 
+
 class AnnealingLearner(Learner):
     def __init__(self, optim, epochs=100, rate=None, halve_every=10):
         self.halve_every = _f(halve_every)
@@ -69,10 +71,12 @@ class AnnealingLearner(Learner):
     def rate_at(self, epoch):
         return super().rate_at(epoch) * self.anneal**epoch
 
+
 def cosmod(x):
     # plot: https://www.desmos.com/calculator/hlgqmyswy2
     return (_1 + np.cos((x % _1) * _pi)) * _inv2
 
+
 class SGDR(Learner):
     # Stochastic Gradient Descent with Restarts
     # paper: https://arxiv.org/abs/1608.03983
@@ -112,7 +116,8 @@ class SGDR(Learner):
             raise Exception('this should never happen.')
 
     def rate_at(self, epoch):
-        base_rate = self.start_rate if self.start_rate is not None else self.optim.lr
+        sr = self.start_rate
+        base_rate = sr if sr is not None else self.optim.lr
         restart, sub_epoch, next_restart = self.split_num(max(1, epoch))
         x = _f(sub_epoch - 1) / _f(next_restart)
         return base_rate * self.decay**_f(restart) * cosmod(x)
@@ -126,6 +131,7 @@ class SGDR(Learner):
             self.restart_callback(restart)
         return True
 
+
 class TriangularCLR(Learner):
     per_batch = True
 
@@ -141,11 +147,14 @@ class TriangularCLR(Learner):
     def _t(self, epoch):
         # NOTE: this could probably be simplified
         offset = self.frequency / 2
-        return np.abs(((epoch - 1 + offset) % self.frequency) - offset) / offset
+        return np.abs(((epoch - 1 + offset) % self.frequency) - offset) \
+            / offset
 
     def rate_at(self, epoch):
-        upper_rate = self.start_rate if self.start_rate is not None else self.optim.lr
-        return self._t(epoch) * (upper_rate - self.lower_rate) + self.lower_rate
+        sr = self.start_rate
+        lr = self.lower_rate
+        upper_rate = sr if sr is not None else self.optim.lr
+        return self._t(epoch) * (upper_rate - lr) + lr
 
     def next(self):
         if not super().next():
@@ -156,14 +165,17 @@ class TriangularCLR(Learner):
             self.callback(self.epoch // self.frequency)
         return True
 
+
 class SineCLR(TriangularCLR):
     def _t(self, epoch):
         return np.sin(_pi * _inv2 * super()._t(epoch))
 
+
 class WaveCLR(TriangularCLR):
     def _t(self, epoch):
         return _inv2 * (_1 - np.cos(_pi * super()._t(epoch)))
 
+
 # more
 
 class PolyLearner(Learner):
@@ -177,4 +189,3 @@ class PolyLearner(Learner):
         progress = (epoch - 1) / (self.epochs)
         ret = np.polyval(self.coeffs, progress)
         return np.abs(ret)
-
diff --git a/onn/loss.py b/onn/loss.py
index 2faa309..28167ec 100644
--- a/onn/loss.py
+++ b/onn/loss.py
@@ -2,6 +2,7 @@ import numpy as np
 
 from .float import *
 
+
 class Loss:
     def forward(self, p, y):
         raise NotImplementedError("unimplemented", self)
@@ -9,7 +10,8 @@ class Loss:
     def backward(self, p, y):
         raise NotImplementedError("unimplemented", self)
 
-class NLL(Loss): # Negative Log Likelihood
+
+class NLL(Loss):  # Negative Log Likelihood
     def forward(self, p, y):
         correct = p * y
         return np.mean(-correct)
@@ -17,6 +19,7 @@ class NLL(Loss): # Negative Log Likelihood
     def backward(self, p, y):
         return -y / len(p)
 
+
 class CategoricalCrossentropy(Loss):
     # lifted from theano
 
@@ -33,6 +36,7 @@ class CategoricalCrossentropy(Loss):
         df = (p - y) / (p * (1 - p))
         return df / len(y)
 
+
 class Accuracy(Loss):
     # returns percentage of categories correctly predicted.
     # utilizes argmax(), so it cannot be used for gradient descent.
@@ -45,6 +49,7 @@ class Accuracy(Loss):
     def backward(self, p, y):
         raise NotImplementedError("cannot take the gradient of Accuracy")
 
+
 class ResidualLoss(Loss):
     def forward(self, p, y):
         return np.mean(self.f(p - y))
@@ -53,6 +58,7 @@ class ResidualLoss(Loss):
         ret = self.df(p - y) / len(y)
         return ret
 
+
 class SquaredHalved(ResidualLoss):
     def f(self, r):
         return np.square(r) / 2
@@ -60,6 +66,7 @@ class SquaredHalved(ResidualLoss):
     def df(self, r):
         return r
 
+
 class Squared(ResidualLoss):
     def f(self, r):
         return np.square(r)
@@ -67,6 +74,7 @@ class Squared(ResidualLoss):
     def df(self, r):
         return 2 * r
 
+
 class Absolute(ResidualLoss):
     def f(self, r):
         return np.abs(r)
@@ -74,6 +82,7 @@ class Absolute(ResidualLoss):
     def df(self, r):
         return np.sign(r)
 
+
 class Huber(ResidualLoss):
     def __init__(self, delta=1.0):
         self.delta = _f(delta)
@@ -88,6 +97,7 @@ class Huber(ResidualLoss):
                         r,
                         self.delta * np.sign(r))
 
+
 # more
 
 class SomethingElse(ResidualLoss):
@@ -105,6 +115,7 @@ class SomethingElse(ResidualLoss):
     def df(self, r):
         return np.sign(r) * np.abs(r)**self.c
 
+
 class Confidence(Loss):
     # this isn't "confidence" in any meaningful way; (e.g. Bayesian)
     # it's just a metric of how large the value is of the predicted class.
@@ -126,4 +137,3 @@ class Confidence(Loss):
         detc = p / categories / (1 - 1/categories)
         dmax = p == np.max(p, axis=-1, keepdims=True)
         return detc * dmax
-
diff --git a/onn/math.py b/onn/math.py
index 794dfe6..9b2c90c 100644
--- a/onn/math.py
+++ b/onn/math.py
@@ -1,14 +1,15 @@
 import numpy as np
 
+
 def rolling(a, window):
     # http://stackoverflow.com/a/4924433
     shape = (a.size - window + 1, window)
     strides = (a.itemsize, a.itemsize)
     return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
 
+
 def rolling_batch(a, window):
     # same as rolling, but acts on each batch (axis 0).
     shape = (a.shape[0], a.shape[-1] - window + 1, window)
     strides = (np.prod(a.shape[1:]) * a.itemsize, a.itemsize, a.itemsize)
     return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
-
diff --git a/onn/model.py b/onn/model.py
index ddda277..ece35dc 100644
--- a/onn/model.py
+++ b/onn/model.py
@@ -5,14 +5,16 @@ from .nodal import *
 from .layer_base import *
 from .utility import *
 
+
 class Model:
-    def __init__(self, nodes_in, nodes_out, loss=None, mloss=None, unsafe=False):
+    def __init__(self, nodes_in, nodes_out,
+                 loss=None, mloss=None, unsafe=False):
         self.loss = loss if loss is not None else SquaredHalved()
         self.mloss = mloss if mloss is not None else loss
 
-        nodes_in  = [nodes_in]  if isinstance(nodes_in,  Layer) else nodes_in
+        nodes_in = [nodes_in] if isinstance(nodes_in, Layer) else nodes_in
         nodes_out = [nodes_out] if isinstance(nodes_out, Layer) else nodes_out
-        assert type(nodes_in)  == list, type(nodes_in)
+        assert type(nodes_in) == list, type(nodes_in)
         assert type(nodes_out) == list, type(nodes_out)
         self.nodes_in = nodes_in
         self.nodes_out = nodes_out
@@ -29,8 +31,9 @@ class Model:
         return self.nodes
 
     def make_weights(self):
-        self.param_count = sum((node.size for node in self.nodes if not node.shared))
-        self.W  = np.zeros(self.param_count, dtype=_f)
+        self.param_count = sum((node.size for node in self.nodes
+                                if not node.shared))
+        self.W = np.zeros(self.param_count, dtype=_f)
         self.dW = np.zeros(self.param_count, dtype=_f)
 
         offset = 0
@@ -47,37 +50,42 @@ class Model:
                 assert size == len(ret[0]), (size, len(ret[0]))
                 return ret
 
+            fmt = "Layer {} allocated {} weights than it said it would"
             node.init(allocate)
-            assert inner_offset <= node.size, "Layer {} allocated more weights than it said it would".format(node)
+            assert inner_offset <= node.size, fmt.format("more", node)
             # i don't care if "less" is grammatically incorrect.
            # you're mom is grammatically incorrect.
-            assert inner_offset >= node.size, "Layer {} allocated less weights than it said it would".format(node)
+            assert inner_offset >= node.size, fmt.format("less", node)
             offset += node.size
 
     def evaluate(self, input_, deterministic=True):
-        assert len(self.nodes_in) == 1, "ambiguous input in multi-input network; use evaluate_multi() instead"
-        assert len(self.nodes_out) == 1, "ambiguous output in multi-output network; use evaluate_multi() instead"
+        fmt = "ambiguous input in multi-{} network; use {}() instead"
+        assert len(self.nodes_in) == 1, fmt.format("input", "evaluate_multi")
+        assert len(self.nodes_out) == 1, fmt.format("output", "evaluate_multi")
         node_in = self.nodes_in[0]
         node_out = self.nodes_out[0]
         outputs = self.evaluate_multi({node_in: input_}, deterministic)
         return outputs[node_out]
 
-    def apply(self, error): # TODO: better name?
-        assert len(self.nodes_in) == 1, "ambiguous input in multi-input network; use apply_multi() instead"
-        assert len(self.nodes_out) == 1, "ambiguous output in multi-output network; use apply_multi() instead"
+    def apply(self, error):  # TODO: better name?
+ fmt = "ambiguous input in multi-{} network; use {}() instead" + assert len(self.nodes_in) == 1, fmt.format("input", "apply_multi") + assert len(self.nodes_out) == 1, fmt.format("output", "apply_multi") node_in = self.nodes_in[0] node_out = self.nodes_out[0] inputs = self.apply_multi({node_out: error}) return inputs[node_in] def evaluate_multi(self, inputs, deterministic=True): + fmt = "missing {} for node {}" values = dict() outputs = dict() for node in self.nodes: if node in self.nodes_in: - assert node in inputs, "missing input for node {}".format(node.name) + assert node in inputs, fmt.format("input", node.name) X = inputs[node] - values[node] = node._propagate(np.expand_dims(X, 0), deterministic) + values[node] = node._propagate(np.expand_dims(X, 0), + deterministic) else: values[node] = node.propagate(values, deterministic) if node in self.nodes_out: @@ -85,11 +93,12 @@ class Model: return outputs def apply_multi(self, outputs): + fmt = "missing {} for node {}" values = dict() inputs = dict() for node in reversed(self.nodes): if node in self.nodes_out: - assert node in outputs, "missing output for node {}".format(node.name) + assert node in outputs, fmt.format("output", node.name) X = outputs[node] values[node] = node._backpropagate(np.expand_dims(X, 0)) else: @@ -135,13 +144,17 @@ class Model: def load_weights(self, fn): # seemingly compatible with keras' Dense layers. - import h5py - open(fn) # just ensure the file exists (python's error is better) - f = h5py.File(fn, 'r') weights = {} + + import h5py + open(fn) # just ensure the file exists (python's error is better) + + f = h5py.File(fn, 'r') + def visitor(name, obj): if isinstance(obj, h5py.Dataset): weights[name.split('/')[-1]] = np.array(obj[:], dtype=_f) + f.visititems(visitor) f.close() @@ -194,5 +207,7 @@ class Model: children = [str(n) for n in node.children] if children: sep = '->' - print('\t' + str(node) + sep + (';\n\t' + str(node) + sep).join(children) + ';', file=file) + print('\t' + str(node) + sep + + (';\n\t' + str(node) + sep).join(children) + ';', + file=file) print('}', file=file) diff --git a/onn/nodal.py b/onn/nodal.py index 081b045..842c5f9 100644 --- a/onn/nodal.py +++ b/onn/nodal.py @@ -3,7 +3,8 @@ class DummyNode: def __init__(self, children=None, parents=None): self.children = children if children is not None else [] - self.parents = parents if parents is not None else [] + self.parents = parents if parents is not None else [] + def traverse(node_in, node_out, nodes=None, dummy_mode=False): # i have no idea if this is any algorithm in particular. 
@@ -27,7 +28,7 @@
         if not seen_up[node]:
             continue
         parents_added = (parent in nodes for parent in node.parents)
-        if not node in nodes and all(parents_added):
+        if node not in nodes and all(parents_added):
             nodes.append(node)
         for child in node.children:
             q.append(child)
@@ -37,6 +38,7 @@ def traverse(node_in, node_out, nodes=None, dummy_mode=False):
 
     return nodes
 
+
 def traverse_all(nodes_in, nodes_out, nodes=None):
     all_in = DummyNode(children=nodes_in)
     all_out = DummyNode(parents=nodes_out)
diff --git a/onn/optimizer.py b/onn/optimizer.py
index 370f794..4a5836d 100644
--- a/onn/optimizer.py
+++ b/onn/optimizer.py
@@ -7,9 +7,10 @@ from .utility import *
 # some of the the following optimizers are blatantly lifted from tiny-dnn:
 # https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/optimizers/optimizer.h
 
+
 class Momentum(Optimizer):
     def __init__(self, lr=0.01, mu=0.9, nesterov=False):
-        self.mu = _f(mu) # momentum
+        self.mu = _f(mu)  # momentum
         self.nesterov = bool(nesterov)
 
         super().__init__(lr)
@@ -28,6 +29,7 @@ class Momentum(Optimizer):
 
         return V
 
+
 class Adagrad(Optimizer):
     def __init__(self, lr=0.01, eps=1e-8):
         self.eps = _f(eps)
@@ -44,6 +46,7 @@ class Adagrad(Optimizer):
         self.g += np.square(dW)
         return -self.lr * dW / (np.sqrt(self.g) + self.eps)
 
+
 class RMSprop(Optimizer):
     # RMSprop generalizes* Adagrad, etc.
 
@@ -51,7 +54,7 @@ class RMSprop(Optimizer):
     # RMSprop.mu == 1
 
     def __init__(self, lr=1e-4, mu=0.99, eps=1e-8):
-        self.mu = _f(mu) # decay term
+        self.mu = _f(mu)  # decay term
         self.eps = _f(eps)
 
         # one might consider the following equation when specifying mu:
@@ -70,12 +73,13 @@ class RMSprop(Optimizer):
 
         if self.g is None:
             self.g = np.zeros_like(dW)
 
-        # basically apply a first-order low-pass filter to delta squared
+        # basically apply a first-order low-pass filter to delta squared,
         self.g += (1 - self.mu) * (np.square(dW) - self.g)
-        # finally sqrt it to complete the running root-mean-square approximation
+        # and sqrt it to complete the running root-mean-square approximation.
         return -self.lr * dW / (np.sqrt(self.g) + self.eps)
 
+
 class RMSpropCentered(Optimizer):
     # referenced TensorFlow/PyTorch.
     # paper: https://arxiv.org/pdf/1308.0850v5.pdf
@@ -115,10 +119,11 @@ class RMSpropCentered(Optimizer):
             self.delta[:] = self.momentum * self.delta + self.lr * temp
             return -self.delta
         # PyTorch does it this way.
-        #self.delta[:] = self.momentum * self.delta + temp
-        #return -self.lr * self.delta
+        # self.delta[:] = self.momentum * self.delta + temp
+        # return -self.lr * self.delta
         # they are equivalent only when LR is constant, which it might not be.
 
+
 class Adam(Optimizer):
     # paper: https://arxiv.org/abs/1412.6980
     # Adam generalizes* RMSprop, and
@@ -130,10 +135,10 @@ class Adam(Optimizer):
     # Adam.b2 == RMSprop.mu
 
     def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8):
-        self.b1 = _f(b1) # decay term
-        self.b2 = _f(b2) # decay term
-        self.b1_t_default = _f(b1) # decay term power t
-        self.b2_t_default = _f(b2) # decay term power t
+        self.b1 = _f(b1)  # decay term
+        self.b2 = _f(b2)  # decay term
+        self.b1_t_default = _f(b1)  # decay term power t
+        self.b2_t_default = _f(b2)  # decay term power t
         self.eps = _f(eps)
 
         super().__init__(lr)
@@ -159,18 +164,20 @@ class Adam(Optimizer):
         self.vt += (1 - self.b2) * (np.square(dW) - self.vt)
 
         return -self.lr * (self.mt / (1 - self.b1_t)) \
-                   / (np.sqrt(self.vt / (1 - self.b2_t)) + self.eps)
+            / (np.sqrt(self.vt / (1 - self.b2_t)) + self.eps)
+
 
 class Nadam(Optimizer):
     # paper: https://arxiv.org/abs/1412.6980
     # paper: http://cs229.stanford.edu/proj2015/054_report.pdf
     # TODO: double-check this implementation. also read the damn paper.
-    # lifted from https://github.com/fchollet/keras/blob/5d38b04/keras/optimizers.py#L530
-    # lifted from https://github.com/jpilaul/IFT6266_project/blob/master/Models/Algo_Momentum.py
+    # lifted from:
+    # https://github.com/fchollet/keras/blob/5d38b04/keras/optimizers.py#L530
+    # https://github.com/jpilaul/IFT6266_project/blob/master/Models/Algo_Momentum.py
 
     def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8):
-        self.b1 = _f(b1) # decay term
-        self.b2 = _f(b2) # decay term
+        self.b1 = _f(b1)  # decay term
+        self.b2 = _f(b2)  # decay term
         self.eps = _f(eps)
 
         super().__init__(lr)
@@ -208,6 +215,7 @@ class Nadam(Optimizer):
 
         return -self.lr * mt_bar / (np.sqrt(vtp) + self.eps)
 
+
 # more
 
 class FTML(Optimizer):
@@ -216,8 +224,8 @@ class FTML(Optimizer):
 
     def __init__(self, lr=0.0025, b1=0.6, b2=0.999, eps=1e-8):
         self.iterations = _0
-        self.b1 = _f(b1) # decay term
-        self.b2 = _f(b2) # decay term
+        self.b1 = _f(b1)  # decay term
+        self.b2 = _f(b2)  # decay term
         self.eps = _f(eps)
 
         super().__init__(lr)
@@ -231,10 +239,14 @@ class FTML(Optimizer):
         self.b2_t = _1
 
     def compute(self, dW, W):
-        if self.dt1 is None: self.dt1 = np.zeros_like(dW)
-        if self.dt is None: self.dt = np.zeros_like(dW)
-        if self.vt is None: self.vt = np.zeros_like(dW)
-        if self.zt is None: self.zt = np.zeros_like(dW)
+        if self.dt1 is None:
+            self.dt1 = np.zeros_like(dW)
+        if self.dt is None:
+            self.dt = np.zeros_like(dW)
+        if self.vt is None:
+            self.vt = np.zeros_like(dW)
+        if self.zt is None:
+            self.zt = np.zeros_like(dW)
 
         # NOTE: we could probably rewrite these equations to avoid this copy.
         self.dt1[:] = self.dt[:]
@@ -260,6 +272,7 @@ class FTML(Optimizer):
         # subtract by weights to avoid having to override self.update.
         return -self.zt / self.dt - W
 
+
 class MomentumClip(Optimizer):
     def __init__(self, lr=0.01, mu=0.9, nesterov=False, clip=1.0, debug=False):
         self.mu = _f(mu)
@@ -289,22 +302,25 @@ class MomentumClip(Optimizer):
         else:
             return -self.lr * self.accum
 
+
 class YellowFin(Optimizer):
     # paper: https://arxiv.org/abs/1706.03471
     # knowyourmeme: http://cs.stanford.edu/~zjian/project/YellowFin/
-    # author's implementation: https://github.com/JianGoForIt/YellowFin/blob/master/tuner_utils/yellowfin.py
-    # code lifted: https://gist.github.com/botev/f8b32c00eafee222e47393f7f0747666
+    # author's implementation:
+    # https://github.com/JianGoForIt/YellowFin/blob/master/tuner_utils/yellowfin.py
+    # code lifted:
+    # https://gist.github.com/botev/f8b32c00eafee222e47393f7f0747666
 
     def __init__(self, lr=0.1, mu=0.0, beta=0.999,
                  window_size=20, debias=True, clip=1.0):
         self.lr_default = _f(lr)
         self.mu_default = _f(mu)
         self.beta = _f(beta)
-        self.window_size = int(window_size) # curv_win_width
+        self.window_size = int(window_size)  # curv_win_width
         self.debias_enabled = bool(debias)
         self.clip = _f(clip)
 
-        self.mu = _f(mu) # momentum
+        self.mu = _f(mu)  # momentum
         super().__init__(lr)
 
     def reset(self):
@@ -316,13 +332,13 @@ class YellowFin(Optimizer):
         self.step = 0
         self.beta_t = self.beta
 
-        self.curv_win = np.zeros([self.window_size,], dtype=np.float32)
+        self.curv_win = np.zeros([self.window_size, ], dtype=np.float32)
 
         self.h_min = None
         self.h_max = None
 
         self.g_lpf = 0
-        #self.g_squared_lpf = 0
+        # self.g_squared_lpf = 0
         self.g_norm_squared_lpf = 0
         self.g_norm_lpf = 0
         self.h_min_lpf = 0
@@ -332,7 +348,8 @@ class YellowFin(Optimizer):
         self.mu_lpf = 0
 
     def get_lr_mu(self):
-        p = (np.square(self.dist_avg) * np.square(self.h_min)) / (2 * self.g_var)
+        p = (np.square(self.dist_avg) * np.square(self.h_min)) \
+            / (2 * self.g_var)
         w3 = p * (np.sqrt(0.25 + p / 27.0) - 0.5)
         w = np.power(w3, 1/3)
         y = w - p / (3 * w)
@@ -360,11 +377,11 @@ class YellowFin(Optimizer):
             total_norm = np.linalg.norm(dW)
             clip_scale = self.clip / (total_norm + 1e-6)
             if clip_scale < 1:
-                #print("clipping gradients; norm: {:10.5f}".format(total_norm))
+                # print("clipping gradients; norm: {:10.5f}".format(total_norm))
                 dW *= clip_scale
 
-        #fmt = 'W std: {:10.7f}e-3, dWstd: {:10.7f}e-3, V std: {:10.7f}e-3'
-        #print(fmt.format(np.std(W), np.std(dW) * 100, np.std(V) * 100))
+        # fmt = 'W std: {:10.7f}e-3, dWstd: {:10.7f}e-3, V std: {:10.7f}e-3'
+        # print(fmt.format(np.std(W), np.std(dW) * 100, np.std(V) * 100))
 
         b = self.beta
         m1b = 1 - self.beta
@@ -380,30 +397,31 @@ class YellowFin(Optimizer):
         h_min_t = np.min(valid_window)
         h_max_t = np.max(valid_window)
 
-        self.g_lpf              = b * self.g_lpf              + m1b * g
-        #self.g_squared_lpf     = b * self.g_squared_lpf      + m1b * g_squared
-        self.g_norm_squared_lpf = b * self.g_norm_squared_lpf + m1b * g_norm_squared
-        self.g_norm_lpf         = b * self.g_norm_lpf         + m1b * g_norm
-        self.h_min_lpf          = b * self.h_min_lpf          + m1b * h_min_t
-        self.h_max_lpf          = b * self.h_max_lpf          + m1b * h_max_t
+        self.g_lpf = b * self.g_lpf + m1b * g
+        # self.g_squared_lpf = b * self.g_squared_lpf + m1b * g_squared
+        self.g_norm_squared_lpf = b * self.g_norm_squared_lpf \
+            + m1b * g_norm_squared
+        self.g_norm_lpf = b * self.g_norm_lpf + m1b * g_norm
+        self.h_min_lpf = b * self.h_min_lpf + m1b * h_min_t
+        self.h_max_lpf = b * self.h_max_lpf + m1b * h_max_t
 
-        g_avg              = debias * self.g_lpf
-        #g_squared_avg     = debias * self.g_squared_lpf
+        g_avg = debias * self.g_lpf
+        # g_squared_avg = debias * self.g_squared_lpf
         g_norm_squared_avg = debias * self.g_norm_squared_lpf
-        g_norm_avg         = debias * self.g_norm_lpf
-        self.h_min         = debias * self.h_min_lpf
-        self.h_max         = debias * self.h_max_lpf
+        g_norm_avg = debias * self.g_norm_lpf
+        self.h_min = debias * self.h_min_lpf
+        self.h_max = debias * self.h_max_lpf
 
         assert self.h_max >= self.h_min
         dist = g_norm_avg / g_norm_squared_avg
 
-        self.dist_lpf       = b * self.dist_lpf + m1b * dist
+        self.dist_lpf = b * self.dist_lpf + m1b * dist
 
-        self.dist_avg       = debias * self.dist_lpf
+        self.dist_avg = debias * self.dist_lpf
 
         self.g_var = g_norm_squared_avg - np.sum(np.square(g_avg))
         # equivalently:
-        #self.g_var = np.sum(np.abs(g_squared_avg - np.square(g_avg)))
+        # self.g_var = np.sum(np.abs(g_squared_avg - np.square(g_avg)))
 
         if self.step > 0:
             lr_for_real, mu_for_real = self.get_lr_mu()
@@ -419,6 +437,7 @@ class YellowFin(Optimizer):
             self.beta_t *= self.beta
         return V
 
+
 class AddSign(Optimizer):
     # paper: https://arxiv.org/abs/1709.07417
 
@@ -438,10 +457,11 @@ class AddSign(Optimizer):
         self.accum[:] = self.accum * self.mu + dW
 
         signed = np.sign(dW) * np.sign(self.accum)
-        #signed *= decay
+        # signed *= decay
 
         return -self.lr * dW * (self.alpha + signed)
 
+
 class PowerSign(Optimizer):
     # paper: https://arxiv.org/abs/1709.07417
 
@@ -462,13 +482,14 @@ class PowerSign(Optimizer):
         self.accum[:] = self.accum * self.mu + dW
 
         signed = np.sign(dW) * np.sign(self.accum)
-        #signed *= decay
+        # signed *= decay
 
         if self.use_exp:
             return -self.lr * dW * np.exp(signed)
         else:
             return -self.lr * dW * np.power(self.alpha, signed)
 
+
 class Neumann(Optimizer):
     # paper: https://arxiv.org/abs/1712.03298
     # NOTE: this implementation is missing resetting as described in the paper.
@@ -478,20 +499,20 @@ class Neumann(Optimizer):
     # it seems like using a Learner like SineCLR makes this unnecessary.
 
     def __init__(self, lr=0.01):
-        self.alpha = _f(1e-7) # cubic.
-        self.beta = _f(1e-5) # repulsive. NOTE: multiplied by len(dW) later.
-        self.gamma = _f(0.99) # EMA, or 1-pole low-pass parameter. same thing.
+        self.alpha = _f(1e-7)  # cubic.
+        self.beta = _f(1e-5)  # repulsive. NOTE: multiplied by len(dW) later.
+        self.gamma = _f(0.99)  # EMA, or 1-pole low-pass parameter. same thing.
 
         # momentum is ∝ (in the shape of) 1 - 1/(1 + t)
         self.mu_min = _f(0.5)
         self.mu_max = _f(0.9)
-        self.reset_period = 0 # TODO
+        self.reset_period = 0  # TODO
         super().__init__(lr)
 
     def reset(self):
         # NOTE: mt and vt are different than the pair in Adam-like optimizers.
-        self.mt = None # momentum accumulator.
-        self.vt = None # weight accumulator.
+        self.mt = None  # momentum accumulator.
+        self.vt = None  # weight accumulator.
         self.t = 0
 
     def compute(self, dW, W):
@@ -510,7 +531,7 @@ class Neumann(Optimizer):
             return
 
         # momentum quantity:
-        mu = _1 - _1/_f(self.t) # the + 1 is implicit.
+        mu = _1 - _1/_f(self.t)  # the + 1 is implicit.
         mu = (mu + self.mu_min) * (self.mu_max - self.mu_min)
 
         # smoothed change in weights:
@@ -529,4 +550,3 @@ class Neumann(Optimizer):
         # weights and accumulator:
         W += mu * self.mt - self.lr * dt
         self.vt = W + self.gamma * (self.vt - W)
-
diff --git a/onn/optimizer_base.py b/onn/optimizer_base.py
index 3a90f64..95852b5 100644
--- a/onn/optimizer_base.py
+++ b/onn/optimizer_base.py
@@ -2,9 +2,10 @@ import numpy as np
 
 from .float import *
 
+
 class Optimizer:
     def __init__(self, lr=0.1):
-        self.lr = _f(lr) # learning rate
+        self.lr = _f(lr)  # learning rate
         self.reset()
 
     def reset(self):
@@ -15,5 +16,3 @@ class Optimizer:
 
     def update(self, dW, W):
         W += self.compute(dW, W)
-
-
diff --git a/onn/parametric.py b/onn/parametric.py
index 540b857..52bf0ca 100644
--- a/onn/parametric.py
+++ b/onn/parametric.py
@@ -4,6 +4,7 @@ from .float import *
 from .layer_base import *
 from .initialization import *
 
+
 class Bias(Layer):
     # TODO: support axes other than -1 and shapes other than 1D.
 
@@ -28,6 +29,7 @@ class Bias(Layer):
         self.biases.g += dY.sum(0)
         return dY
 
+
 class Dense(Layer):
     serialized = {
         'W': 'coeffs',
@@ -38,8 +40,10 @@ class Dense(Layer):
         super().__init__()
         self.dim = int(dim)
         self.output_shape = (dim,)
-        self.coeffs = self._new_weights('coeffs', init=init, regularizer=reg_w)
-        self.biases = self._new_weights('biases', init=init_zeros, regularizer=reg_b)
+        self.coeffs = self._new_weights('coeffs', init=init,
+                                        regularizer=reg_w)
+        self.biases = self._new_weights('biases', init=init_zeros,
+                                        regularizer=reg_b)
 
     def make_shape(self, parent):
         shape = parent.output_shape
@@ -101,18 +105,20 @@ class Conv1Dper(Layer):
 
     def forward(self, X):
         if self.wrap0 == 0:
-            Xper = np.hstack((X,X[:,:self.wrap1]))
+            Xper = np.hstack((X, X[:, :self.wrap1]))
         elif self.wrap1 == 0:
-            Xper = np.hstack((X[:,-self.wrap0:],X))
+            Xper = np.hstack((X[:, -self.wrap0:], X))
         else:
-            Xper = np.hstack((X[:,-self.wrap0:],X,X[:,:self.wrap1]))
+            Xper = np.hstack((X[:, -self.wrap0:], X, X[:, :self.wrap1]))
         self.cols = rolling_batch(Xper, self.kernel_size)
-        convolved = (self.cols * self.coeffs.f[:,::-1]).sum(2)
+        convolved = (self.cols * self.coeffs.f[:, ::-1]).sum(2)
         return convolved
 
     def backward(self, dY):
-        self.coeffs.g += (dY[:,:,None] * self.cols).sum(0)[:,::-1].sum(0, keepdims=True)
-        return (dY[:,:,None] * self.coeffs.f[:,::-1]).sum(2)
+        self.coeffs.g += (dY[:, :, None] * self.cols).sum(0)[:, ::-1].sum(
+            0, keepdims=True)
+        return (dY[:, :, None] * self.coeffs.f[:, ::-1]).sum(2)
+
 
 class LayerNorm(Layer):
     # paper: https://arxiv.org/abs/1607.06450
@@ -168,7 +174,8 @@ class LayerNorm(Layer):
 
         return dX
 
-class Denses(Layer): # TODO: rename?
+
+class Denses(Layer):  # TODO: rename?
     # acts as a separate Dense for each row or column. only for 2D arrays.
 
     serialized = {
         'W': 'coeffs',
         'b': 'biases',
     }
 
-    def __init__(self, dim, init=init_he_uniform, reg_w=None, reg_b=None, axis=-1):
+    def __init__(self, dim, init=init_he_uniform,
+                 reg_w=None, reg_b=None, axis=-1):
         super().__init__()
         self.dim = int(dim)
         self.weight_init = init
         self.axis = int(axis)
-        self.coeffs = self._new_weights('coeffs', init=init, regularizer=reg_w)
-        self.biases = self._new_weights('biases', init=init_zeros, regularizer=reg_b)
+        self.coeffs = self._new_weights('coeffs', init=init,
+                                        regularizer=reg_w)
+        self.biases = self._new_weights('biases', init=init_zeros,
+                                        regularizer=reg_b)
 
     def make_shape(self, parent):
         shape = parent.output_shape
@@ -220,9 +230,11 @@ class Denses(Layer): # TODO: rename?
         self.coeffs.g += np.einsum('ijx,ijk->jxk', self.X, dY)
         return np.einsum('ijk,jxk->ijx', dY, self.coeffs.f)
 
+
 class CosineDense(Dense):
     # paper: https://arxiv.org/abs/1702.05870
-    # another implementation: https://github.com/farizrahman4u/keras-contrib/pull/36
+    # another implementation:
+    # https://github.com/farizrahman4u/keras-contrib/pull/36
     # the paper doesn't mention bias,
     # so we treat bias as an additional weight with a constant input of 1.
     # this is correct in Dense layers, so i hope it's correct here too.
@@ -231,24 +243,25 @@ class CosineDense(Dense):
 
     def forward(self, X):
         self.X = X
-        self.X_norm = np.sqrt(np.square(X).sum(-1, keepdims=True) \
-          + 1 + self.eps)
-        self.W_norm = np.sqrt(np.square(self.coeffs.f).sum(0, keepdims=True) \
-          + np.square(self.biases.f) + self.eps)
+        self.X_norm = np.sqrt(np.square(X).sum(-1, keepdims=True)
+                              + 1 + self.eps)
+        self.W_norm = np.sqrt(np.square(self.coeffs.f).sum(0, keepdims=True)
+                              + np.square(self.biases.f) + self.eps)
         self.dot = X @ self.coeffs.f + self.biases.f
         Y = self.dot / (self.X_norm * self.W_norm)
         return Y
 
     def backward(self, dY):
         ddot = dY / self.X_norm / self.W_norm
-        dX_norm = -(dY * self.dot / self.W_norm).sum(-1, keepdims=True) / self.X_norm**2
-        dW_norm = -(dY * self.dot / self.X_norm).sum( 0, keepdims=True) / self.W_norm**2
+        dX_norm = -(dY * self.dot / self.W_norm).sum(-1, keepdims=True) \
+            / self.X_norm**2
+        dW_norm = -(dY * self.dot / self.X_norm).sum(0, keepdims=True) \
+            / self.W_norm**2
 
-        self.coeffs.g += self.X.T @ ddot \
-          + dW_norm / self.W_norm * self.coeffs.f
+        self.coeffs.g += self.X.T @ ddot \
+            + dW_norm / self.W_norm * self.coeffs.f
         self.biases.g += ddot.sum(0, keepdims=True) \
-          + dW_norm / self.W_norm * self.biases.f
+            + dW_norm / self.W_norm * self.biases.f
 
         dX = ddot @ self.coeffs.f.T + dX_norm / self.X_norm * self.X
         return dX
-
diff --git a/onn/regularizer.py b/onn/regularizer.py
index 9a6aebf..37887c3 100644
--- a/onn/regularizer.py
+++ b/onn/regularizer.py
@@ -2,9 +2,11 @@ import numpy as np
 
 from .float import *
 
+
 class Regularizer:
     pass
 
+
 class L1L2(Regularizer):
     def __init__(self, l1=0.0, l2=0.0):
         self.l1 = _f(l1)
@@ -26,6 +28,7 @@ class L1L2(Regularizer):
             df += self.l2 * 2 * X
         return df
 
+
 # more
 
 class SaturateRelu(Regularizer):
diff --git a/onn/ritual.py b/onn/ritual.py
index 07b4dea..96c6b4d 100644
--- a/onn/ritual.py
+++ b/onn/ritual.py
@@ -4,6 +4,7 @@ from .float import *
 from .initialization import *
 from .ritual_base import *
 
+
 def stochastic_multiply(W, gamma=0.5, allow_negation=False):
     # paper: https://arxiv.org/abs/1606.01981
 
@@ -23,6 +24,7 @@ def stochastic_multiply(W, gamma=0.5, allow_negation=False):
         mult *= np.where(samples < prob, 1, -1)
     np.multiply(W, mult, out=W)
 
+
 class StochMRitual(Ritual):
     # paper: https://arxiv.org/abs/1606.01981
     # this probably doesn't make sense for regression problems,
@@ -38,8 +40,8 @@ class StochMRitual(Ritual):
 
     def learn(self, inputs, outputs):
         # an experiment:
-        #assert self.learner.rate < 10, self.learner.rate
-        #self.gamma = 1 - 1/2**(1 - np.log10(self.learner.rate))
+        # assert self.learner.rate < 10, self.learner.rate
+        # self.gamma = 1 - 1/2**(1 - np.log10(self.learner.rate))
 
         self.W[:] = self.model.W
         for layer in self.model.ordered_nodes:
@@ -57,6 +59,7 @@ class StochMRitual(Ritual):
             np.clip(layer.W, -layer.std * f, layer.std * f, out=layer.W)
             # np.clip(layer.W, -1, 1, out=layer.W)
 
+
 class NoisyRitual(Ritual):
     def __init__(self, learner=None,
                  input_noise=0, output_noise=0, gradient_noise=0):
@@ -69,7 +72,7 @@ class NoisyRitual(Ritual):
         # this is pretty crude
         if self.input_noise > 0:
             s = self.input_noise
-            inputs  = inputs  + np.random.normal(0, s, size=inputs.shape)
+            inputs = inputs + np.random.normal(0, s, size=inputs.shape)
         if self.output_noise > 0:
             s = self.output_noise
             outputs = outputs + np.random.normal(0, s, size=outputs.shape)
@@ -80,11 +83,10 @@ class NoisyRitual(Ritual):
         if self.gradient_noise > 0:
             size = len(self.model.dW)
             gamma = 0.55
-            #s = self.gradient_noise / (1 + self.bn) ** gamma
+            # s = self.gradient_noise / (1 + self.bn) ** gamma
             # experiments:
             s = self.gradient_noise * np.sqrt(self.learner.rate)
-            #s = np.square(self.learner.rate)
-            #s = self.learner.rate / self.en
+            # s = np.square(self.learner.rate)
+            # s = self.learner.rate / self.en
             self.model.dW += np.random.normal(0, max(s, 1e-8), size=size)
         super().update()
-
diff --git a/onn/ritual_base.py b/onn/ritual_base.py
index 470a4f6..c026994 100644
--- a/onn/ritual_base.py
+++ b/onn/ritual_base.py
@@ -3,7 +3,8 @@ import numpy as np
 
 from .float import *
 
-class Ritual: # i'm just making up names at this point.
+
+class Ritual:  # i'm just making up names at this point.
     def __init__(self, learner=None):
         self.learner = learner if learner is not None else Learner(Optimizer())
         self.model = None
@@ -77,7 +78,8 @@ class Ritual: # i'm just making up names at this point.
 
         if shuffle:
             if gen:
-                raise Exception("shuffling is incompatibile with using a generator.")
+                raise Exception(
+                    "shuffling is incompatibile with using a generator.")
             indices = np.arange(inputs.shape[0])
             np.random.shuffle(indices)
             inputs = inputs[indices]
@@ -90,7 +92,7 @@ class Ritual: # i'm just making up names at this point.
             batch_count = inputs.shape[0] // batch_size
             # TODO: lift this restriction
             assert inputs.shape[0] % batch_size == 0, \
-              "inputs is not evenly divisible by batch_size"
+                "inputs is not evenly divisible by batch_size"
 
         prev_batch_size = None
         for b in range(batch_count):
@@ -101,17 +103,20 @@ class Ritual: # i'm just making up names at this point.
                 batch_inputs, batch_outputs = next(generator)
                 batch_size = batch_inputs.shape[0]
                 # TODO: lift this restriction
-                assert batch_size == prev_batch_size or prev_batch_size is None, \
-                  "non-constant batch size (got {}, expected {})".format(batch_size, prev_batch_size)
+                fmt = "non-constant batch size (got {}, expected {})"
+                assert (batch_size == prev_batch_size
+                        or prev_batch_size is None), \
+                    fmt.format(batch_size, prev_batch_size)
             else:
                 bi = b * batch_size
-                batch_inputs  = inputs[ bi:bi+batch_size]
+                batch_inputs = inputs[bi:bi+batch_size]
                 batch_outputs = outputs[bi:bi+batch_size]
 
             if clear_grad:
                 self.model.clear_grad()
             self._train_batch(batch_inputs, batch_outputs, b, batch_count,
-                              test_only, return_losses=='both', return_losses)
+                              test_only, return_losses == 'both',
+                              return_losses)
 
             prev_batch_size = batch_size
 
diff --git a/onn/utility.py b/onn/utility.py
index edd895c..9dc14f7 100644
--- a/onn/utility.py
+++ b/onn/utility.py
@@ -1,17 +1,23 @@
 import sys
 
+
 def lament(*args, **kwargs):
     print(*args, file=sys.stderr, **kwargs)
 
+
 def lower_priority():
     """Set the priority of the process to below-normal."""
     # via https://stackoverflow.com/a/1023269
     if sys.platform == 'win32':
         try:
-            import win32api, win32process, win32con
+            import win32api
+            import win32process
+            import win32con
             pid = win32api.GetCurrentProcessId()
-            handle = win32api.OpenProcess(win32con.PROCESS_ALL_ACCESS, True, pid)
-            win32process.SetPriorityClass(handle, win32process.BELOW_NORMAL_PRIORITY_CLASS)
+            handle = win32api.OpenProcess(
+                win32con.PROCESS_ALL_ACCESS, True, pid)
+            win32process.SetPriorityClass(
+                handle, win32process.BELOW_NORMAL_PRIORITY_CLASS)
         except ImportError:
             lament("you do not have pywin32 installed.")
             lament("the process priority could not be lowered.")
@@ -21,9 +27,12 @@ def lower_priority():
         import os
         os.nice(1)
 
+
 # more
 
 _log_was_update = False
+
+
 def log(left, right, update=False):
     s = "\x1B[1m {:>20}:\x1B[0m {}".format(left, right)
     global _log_was_update
@@ -33,5 +42,6 @@ def log(left, right, update=False):
         lament(s)
     _log_was_update = update
 
+
 class Dummy:
     pass
diff --git a/onn/weight.py b/onn/weight.py
index a531d64..2ea6092 100644
--- a/onn/weight.py
+++ b/onn/weight.py
@@ -1,11 +1,12 @@
 import numpy as np
 
+
 class Weights:
     # we may or may not contain weights -- or any information, for that matter.
 
     def __init__(self, **kwargs):
-        self.f = None # forward weights
-        self.g = None # backward weights (gradients)
+        self.f = None  # forward weights
+        self.g = None  # backward weights (gradients)
         self.shape = None
         self.init = None
         self.allocator = None
@@ -16,7 +17,7 @@ class Weights:
 
     def configure(self, **kwargs):
         for k, v in kwargs.items():
-            getattr(self, k) # ensures the key already exists
+            getattr(self, k)  # ensures the key already exists
             setattr(self, k, v)
 
     @property