basic PEP 8 compliance

rip readability
Connor Olding 2018-01-22 19:40:36 +00:00
parent c81ce0afbb
commit 169303813d
19 changed files with 282 additions and 150 deletions

View file

@ -1,5 +1,5 @@
# external packages required for full functionality:
# numpy scipy h5py sklearn dotmap
# numpy scipy h5py sklearn
# BIG TODO: ensure numpy isn't upcasting to float64 *anywhere*.
# this is gonna take some work.
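
A hypothetical illustration of the upcasting concern above (not from the repo): mixing a float32 array with a default-dtype array silently promotes the result to float64.

import numpy as np

a = np.zeros(3, dtype=np.float32)
b = np.zeros(3)                # default dtype is float64
print((a + b).dtype)           # float64 -- the float32 operand was upcast
print((a + a).dtype)           # float32 -- stays put when both operands agree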

View file

@ -6,6 +6,7 @@ from scipy.special import expit as sigmoid
from .float import *
from .layer_base import *
class Identity(Layer):
def forward(self, X):
return X
@ -13,6 +14,7 @@ class Identity(Layer):
def backward(self, dY):
return dY
class Sigmoid(Layer): # aka Logistic, Expit (inverse of Logit)
def forward(self, X):
self.sig = sigmoid(X)
@ -21,6 +23,7 @@ class Sigmoid(Layer): # aka Logistic, Expit (inverse of Logit)
def backward(self, dY):
return dY * self.sig * (1 - self.sig)
class Softplus(Layer):
# integral of Sigmoid.
@ -31,6 +34,7 @@ class Softplus(Layer):
def backward(self, dY):
return dY * sigmoid(self.X)
class Tanh(Layer):
def forward(self, X):
self.sig = np.tanh(X)
@ -39,6 +43,7 @@ class Tanh(Layer):
def backward(self, dY):
return dY * (1 - self.sig * self.sig)
class LeCunTanh(Layer):
# paper: http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf
# paper: http://yann.lecun.com/exdb/publis/pdf/lecun-89.pdf
@ -53,6 +58,7 @@ class LeCunTanh(Layer):
def backward(self, dY):
return dY * (2 / 3 * 1.7159) * (1 - self.sig * self.sig)
class Relu(Layer):
def forward(self, X):
self.cond = X >= 0
@ -61,6 +67,7 @@ class Relu(Layer):
def backward(self, dY):
return np.where(self.cond, dY, 0)
class Elu(Layer):
# paper: https://arxiv.org/abs/1511.07289
@ -76,6 +83,7 @@ class Elu(Layer):
def backward(self, dY):
return dY * np.where(self.cond, 1, self.neg + 1)
class GeluApprox(Layer):
# paper: https://arxiv.org/abs/1606.08415
# plot: https://www.desmos.com/calculator/ydzgtccsld
@ -88,6 +96,7 @@ class GeluApprox(Layer):
def backward(self, dY):
return dY * self.sig * (1 + self.a * (1 - self.sig))
class Softmax(Layer):
def forward(self, X):
alpha = np.max(X, axis=-1, keepdims=True)
@ -99,6 +108,7 @@ class Softmax(Layer):
def backward(self, dY):
return (dY - np.sum(dY * self.sm, axis=-1, keepdims=True)) * self.sm
class LogSoftmax(Softmax):
def __init__(self, eps=1e-6):
super().__init__()
@ -110,6 +120,7 @@ class LogSoftmax(Softmax):
def backward(self, dY):
return dY - np.sum(dY, axis=-1, keepdims=True) * self.sm
class Cos(Layer):
# performs well on MNIST for some strange reason.
@ -120,6 +131,7 @@ class Cos(Layer):
def backward(self, dY):
return dY * -np.sin(self.X)
class Selu(Layer):
# paper: https://arxiv.org/abs/1706.02515
@ -136,6 +148,7 @@ class Selu(Layer):
def backward(self, dY):
return dY * self.lamb * np.where(self.cond, 1, self.neg)
# more
class TanhTest(Layer):
@ -146,6 +159,7 @@ class TanhTest(Layer):
def backward(self, dY):
return dY * (1 / 2 * 2.4004) * (1 - self.sig * self.sig)
class ExpGB(Layer):
# an output layer for one-hot classification problems.
# use with MSE (SquaredHalved), not CategoricalCrossentropy!
@ -163,6 +177,7 @@ class ExpGB(Layer):
# this gradient is intentionally incorrect.
return dY
class CubicGB(Layer):
# an output layer for one-hot classification problems.
# use with MSE (SquaredHalved), not CategoricalCrossentropy!
@ -182,4 +197,3 @@ class CubicGB(Layer):
def backward(self, dY):
# this gradient is intentionally incorrect.
return dY
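
The backward() methods above are hand-derived, so a finite-difference check is a cheap way to confirm them. A standalone sketch (not part of the library) verifying the Sigmoid and Softmax gradients:

import numpy as np
from scipy.special import expit as sigmoid

def vjp_numeric(f, X, dY, eps=1e-4):
    # numerically estimate dX[i] = sum_j dY[j] * d f(X)[j] / d X[i]
    g = np.zeros_like(X)
    for i in np.ndindex(X.shape):
        Xp, Xm = X.copy(), X.copy()
        Xp[i] += eps
        Xm[i] -= eps
        g[i] = np.sum(dY * (f(Xp) - f(Xm))) / (2 * eps)
    return g

X = np.random.randn(2, 5)
dY = np.random.randn(2, 5)

# Sigmoid.backward: dY * sig * (1 - sig)
sig = sigmoid(X)
assert np.allclose(dY * sig * (1 - sig), vjp_numeric(sigmoid, X, dY), atol=1e-5)

# Softmax.backward: (dY - sum(dY * sm, axis=-1, keepdims=True)) * sm
def softmax(X):
    e = np.exp(X - np.max(X, axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

sm = softmax(X)
analytic = (dY - np.sum(dY * sm, axis=-1, keepdims=True)) * sm
assert np.allclose(analytic, vjp_numeric(softmax, X, dY), atol=1e-5)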

View file

@ -2,11 +2,13 @@ import numpy as np
_f = np.float32
def _check(a):
assert isinstance(a, np.ndarray) or type(a) == _f, type(a)
assert a.dtype == _f, a.dtype
return a
_0 = _f(0)
_1 = _f(1)
_2 = _f(2)
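
_check exists to catch exactly the float64 upcasting mentioned in the requirements comment. A small hypothetical usage sketch:

import numpy as np

_f = np.float32

W = np.zeros(4, dtype=_f)
assert W.dtype == _f               # _check(W) would pass

leaked = W + np.zeros(4)           # the other operand defaults to float64
assert leaked.dtype == np.float64  # _check(leaked) would raise AssertionError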

View file

@ -2,28 +2,35 @@ import numpy as np
# note: these are currently only implemented for 2D shapes.
def init_zeros(size, ins=None, outs=None):
return np.zeros(size)
def init_ones(size, ins=None, outs=None):
return np.ones(size)
def init_he_normal(size, ins, outs):
s = np.sqrt(2 / ins)
return np.random.normal(0, s, size=size)
def init_he_uniform(size, ins, outs):
s = np.sqrt(6 / ins)
return np.random.uniform(-s, s, size=size)
def init_glorot_normal(size, ins, outs):
s = np.sqrt(2 / (ins + outs))
return np.random.normal(0, s, size=size)
def init_glorot_uniform(size, ins, outs):
s = np.sqrt(6 / (ins + outs))
return np.random.uniform(-s, s, size=size)
# more
def init_gaussian_unit(size, ins, outs):
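
For reference, the uniform bounds above come from Var(U(-s, s)) = s^2 / 3, so s = sqrt(6/ins) reproduces the He variance of 2/ins (and sqrt(6/(ins+outs)) the Glorot variance of 2/(ins+outs)). A quick empirical check, with made-up fan sizes:

import numpy as np

ins, outs = 256, 128
s = np.sqrt(6 / ins)
W = np.random.uniform(-s, s, size=(ins, outs))
print(W.var(), 2 / ins)    # both roughly 0.0078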

View file

@ -2,6 +2,7 @@ from .layer_base import *
from .initialization import *
from .float import *
# Nonparametric Layers {{{1
class Input(Layer):
@ -16,9 +17,10 @@ class Input(Layer):
return X
def backward(self, dY):
#self.dY = dY
# self.dY = dY
return np.zeros_like(dY)
class Reshape(Layer):
def __init__(self, new_shape):
super().__init__()
@ -33,6 +35,7 @@ class Reshape(Layer):
assert dY.shape[0] == self.batch_size
return dY.reshape(self.batch_size, *self.input_shape)
class Flatten(Layer):
def make_shape(self, parent):
shape = parent.output_shape
@ -47,6 +50,7 @@ class Flatten(Layer):
assert dY.shape[0] == self.batch_size
return dY.reshape(self.batch_size, *self.input_shape)
class ConstAffine(Layer):
def __init__(self, a=1, b=0):
super().__init__()
@ -59,14 +63,16 @@ class ConstAffine(Layer):
def backward(self, dY):
return dY * self.a
class Sum(Layer):
def _propagate(self, edges, deterministic):
return np.sum(edges, axis=0)
def _backpropagate(self, edges):
#assert len(edges) == 1, "unimplemented"
# assert len(edges) == 1, "unimplemented"
return edges[0] # TODO: does this always work?
class ActivityRegularizer(Layer):
def __init__(self, reg):
super().__init__()
@ -81,6 +87,7 @@ class ActivityRegularizer(Layer):
def backward(self, dY):
return dY + self.reg.backward(self.X)
class Dropout(Layer):
def __init__(self, dropout=0.0):
super().__init__()
@ -92,12 +99,13 @@ class Dropout(Layer):
return X * self.mask
def forward_deterministic(self, X):
#self.mask = _1
# self.mask = _1
return X
def backward(self, dY):
return dY * self.mask
# more
class AlphaDropout(Layer):
@ -136,6 +144,7 @@ class AlphaDropout(Layer):
def backward(self, dY):
return dY * self.a * self.mask
class Decimate(Layer):
# simple decimation layer that drops every other sample from the last axis.
@ -168,6 +177,7 @@ class Decimate(Layer):
dX.ravel()[1::2] = dY.ravel()
return dX
class Undecimate(Layer):
# inverse operation of Decimate. not quite interpolation.
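
A standalone sketch of decimation and its gradient on the last axis, assuming the forward keeps the odd-indexed samples (consistent with the 1::2 scatter in Decimate.backward above); the layer's exact phase handling may differ:

import numpy as np

X = np.arange(8, dtype=np.float32).reshape(1, 8)
Y = X[..., 1::2]                  # decimate: drop every other sample
dY = np.ones_like(Y)

dX = np.zeros_like(X)
dX.ravel()[1::2] = dY.ravel()     # backward: scatter grads, zeros in dropped slots
print(Y)                          # [[1. 3. 5. 7.]]
print(dX)                         # [[0. 1. 0. 1. 0. 1. 0. 1.]]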

View file

@ -4,12 +4,15 @@ from collections import defaultdict, OrderedDict
from .weight import *
# used for numbering layers like Keras:
_layer_counters = defaultdict(lambda: 0)
class LayerIncompatibility(Exception):
pass
class Layer:
def __init__(self):
self.parents = []
@ -40,9 +43,9 @@ class Layer:
raise NotImplementedError("unimplemented", self)
def make_shape(self, parent):
if self.input_shape == None:
if self.input_shape is None:
self.input_shape = parent.output_shape
if self.output_shape == None:
if self.output_shape is None:
self.output_shape = self.input_shape
def do_feed(self, child):
@ -75,16 +78,19 @@ class Layer:
child.make_shape(self)
if not child.is_compatible(self):
fmt = "{} is incompatible with {}: shape mismatch: {} vs. {}"
raise LayerIncompatibility(fmt.format(self, child, self.output_shape, child.input_shape))
raise LayerIncompatibility(fmt.format(
self, child, self.output_shape, child.input_shape))
self.do_feed(child)
child.be_fed(self)
return child
def validate_input(self, X):
assert X.shape[1:] == self.input_shape, (str(self), X.shape[1:], self.input_shape)
assert X.shape[1:] == self.input_shape, \
(str(self), X.shape[1:], self.input_shape)
def validate_output(self, Y):
assert Y.shape[1:] == self.output_shape, (str(self), Y.shape[1:], self.output_shape)
assert Y.shape[1:] == self.output_shape, \
(str(self), Y.shape[1:], self.output_shape)
def _new_weights(self, name, **kwargs):
w = Weights(**kwargs)
@ -95,7 +101,8 @@ class Layer:
def share(self, node):
self.weights = node.weights # TODO: this should be all it takes.
for k, v in self.weights.items():
vs = getattr(node, k) # hack: key isn't necessarily attribute name!
# hack: key isn't necessarily attribute name!
vs = getattr(node, k)
setattr(self, k, vs)
self.shared = True

View file

@ -1,6 +1,7 @@
from .float import *
from .optimizer_base import *
class Learner:
per_batch = False
@ -60,6 +61,7 @@ class Learner:
def final_rate(self):
return self.rate_at(self.epochs - 1e-8)
class AnnealingLearner(Learner):
def __init__(self, optim, epochs=100, rate=None, halve_every=10):
self.halve_every = _f(halve_every)
@ -69,10 +71,12 @@ class AnnealingLearner(Learner):
def rate_at(self, epoch):
return super().rate_at(epoch) * self.anneal**epoch
def cosmod(x):
# plot: https://www.desmos.com/calculator/hlgqmyswy2
return (_1 + np.cos((x % _1) * _pi)) * _inv2
class SGDR(Learner):
# Stochastic Gradient Descent with Restarts
# paper: https://arxiv.org/abs/1608.03983
@ -112,7 +116,8 @@ class SGDR(Learner):
raise Exception('this should never happen.')
def rate_at(self, epoch):
base_rate = self.start_rate if self.start_rate is not None else self.optim.lr
sr = self.start_rate
base_rate = sr if sr is not None else self.optim.lr
restart, sub_epoch, next_restart = self.split_num(max(1, epoch))
x = _f(sub_epoch - 1) / _f(next_restart)
return base_rate * self.decay**_f(restart) * cosmod(x)
@ -126,6 +131,7 @@ class SGDR(Learner):
self.restart_callback(restart)
return True
class TriangularCLR(Learner):
per_batch = True
@ -141,11 +147,14 @@ class TriangularCLR(Learner):
def _t(self, epoch):
# NOTE: this could probably be simplified
offset = self.frequency / 2
return np.abs(((epoch - 1 + offset) % self.frequency) - offset) / offset
return np.abs(((epoch - 1 + offset) % self.frequency) - offset) \
/ offset
def rate_at(self, epoch):
upper_rate = self.start_rate if self.start_rate is not None else self.optim.lr
return self._t(epoch) * (upper_rate - self.lower_rate) + self.lower_rate
sr = self.start_rate
lr = self.lower_rate
upper_rate = sr if sr is not None else self.optim.lr
return self._t(epoch) * (upper_rate - lr) + lr
def next(self):
if not super().next():
@ -156,14 +165,17 @@ class TriangularCLR(Learner):
self.callback(self.epoch // self.frequency)
return True
class SineCLR(TriangularCLR):
def _t(self, epoch):
return np.sin(_pi * _inv2 * super()._t(epoch))
class WaveCLR(TriangularCLR):
def _t(self, epoch):
return _inv2 * (_1 - np.cos(_pi * super()._t(epoch)))
# more
class PolyLearner(Learner):
@ -177,4 +189,3 @@ class PolyLearner(Learner):
progress = (epoch - 1) / (self.epochs)
ret = np.polyval(self.coeffs, progress)
return np.abs(ret)
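
The schedules above are easiest to see numerically. A standalone sketch of cosmod (the per-restart decay used by SGDR) and of the triangular wave behind TriangularCLR._t, with made-up arguments:

import numpy as np

def cosmod(x):
    # decays smoothly from 1 to 0 as x goes from 0 to 1, then restarts.
    return (1 + np.cos((x % 1) * np.pi)) / 2

print([round(float(cosmod(x)), 3) for x in (0.0, 0.25, 0.5, 0.75, 0.99)])
# [1.0, 0.854, 0.5, 0.146, 0.0]

def tri(epoch, frequency):
    # mirrors TriangularCLR._t: 0 at the ends of each cycle, 1 in the middle.
    offset = frequency / 2
    return np.abs(((epoch - 1 + offset) % frequency) - offset) / offset

print([round(float(tri(e, 8)), 2) for e in range(1, 10)])
# [0.0, 0.25, 0.5, 0.75, 1.0, 0.75, 0.5, 0.25, 0.0]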

View file

@ -2,6 +2,7 @@ import numpy as np
from .float import *
class Loss:
def forward(self, p, y):
raise NotImplementedError("unimplemented", self)
@ -9,6 +10,7 @@ class Loss:
def backward(self, p, y):
raise NotImplementedError("unimplemented", self)
class NLL(Loss): # Negative Log Likelihood
def forward(self, p, y):
correct = p * y
@ -17,6 +19,7 @@ class NLL(Loss): # Negative Log Likelihood
def backward(self, p, y):
return -y / len(p)
class CategoricalCrossentropy(Loss):
# lifted from theano
@ -33,6 +36,7 @@ class CategoricalCrossentropy(Loss):
df = (p - y) / (p * (1 - p))
return df / len(y)
class Accuracy(Loss):
# returns percentage of categories correctly predicted.
# utilizes argmax(), so it cannot be used for gradient descent.
@ -45,6 +49,7 @@ class Accuracy(Loss):
def backward(self, p, y):
raise NotImplementedError("cannot take the gradient of Accuracy")
class ResidualLoss(Loss):
def forward(self, p, y):
return np.mean(self.f(p - y))
@ -53,6 +58,7 @@ class ResidualLoss(Loss):
ret = self.df(p - y) / len(y)
return ret
class SquaredHalved(ResidualLoss):
def f(self, r):
return np.square(r) / 2
@ -60,6 +66,7 @@ class SquaredHalved(ResidualLoss):
def df(self, r):
return r
class Squared(ResidualLoss):
def f(self, r):
return np.square(r)
@ -67,6 +74,7 @@ class Squared(ResidualLoss):
def df(self, r):
return 2 * r
class Absolute(ResidualLoss):
def f(self, r):
return np.abs(r)
@ -74,6 +82,7 @@ class Absolute(ResidualLoss):
def df(self, r):
return np.sign(r)
class Huber(ResidualLoss):
def __init__(self, delta=1.0):
self.delta = _f(delta)
@ -88,6 +97,7 @@ class Huber(ResidualLoss):
r,
self.delta * np.sign(r))
# more
class SomethingElse(ResidualLoss):
@ -105,6 +115,7 @@ class SomethingElse(ResidualLoss):
def df(self, r):
return np.sign(r) * np.abs(r)**self.c
class Confidence(Loss):
# this isn't "confidence" in any meaningful way; (e.g. Bayesian)
# it's just a metric of how large the value is of the predicted class.
@ -126,4 +137,3 @@ class Confidence(Loss):
detc = p / categories / (1 - 1/categories)
dmax = p == np.max(p, axis=-1, keepdims=True)
return detc * dmax
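
Each ResidualLoss above pairs f with its derivative df, which makes a finite-difference sanity check easy. A standalone sketch; the Huber f below is the usual definition (r^2 / 2 inside the delta band, delta * (|r| - delta/2) outside), which matches the df shown above:

import numpy as np

def check(f, df, r, eps=1e-5):
    num = (f(r + eps) - f(r - eps)) / (2 * eps)
    assert np.allclose(df(r), num, atol=1e-4)

r = np.linspace(-3, 3, 13)

check(lambda x: np.square(x) / 2, lambda x: x, r)        # SquaredHalved
check(np.square, lambda x: 2 * x, r)                     # Squared
check(np.abs, np.sign, r[r != 0])                        # Absolute (away from 0)

delta = 1.0

def huber_f(x):
    return np.where(np.abs(x) <= delta,
                    np.square(x) / 2,
                    delta * (np.abs(x) - delta / 2))

def huber_df(x):
    return np.where(np.abs(x) <= delta, x, delta * np.sign(x))

check(huber_f, huber_df, r[np.abs(np.abs(r) - delta) > 1e-3])  # skip the kinks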

View file

@ -1,14 +1,15 @@
import numpy as np
def rolling(a, window):
# http://stackoverflow.com/a/4924433
shape = (a.size - window + 1, window)
strides = (a.itemsize, a.itemsize)
return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
def rolling_batch(a, window):
# same as rolling, but acts on each batch (axis 0).
shape = (a.shape[0], a.shape[-1] - window + 1, window)
strides = (np.prod(a.shape[1:]) * a.itemsize, a.itemsize, a.itemsize)
return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
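
A quick demonstration of the strided view that rolling() produces (illustrative only; note the view shares memory with its input, so it should not be written to):

import numpy as np

def rolling(a, window):
    shape = (a.size - window + 1, window)
    strides = (a.itemsize, a.itemsize)
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)

a = np.arange(6, dtype=np.float32)
print(rolling(a, 3))
# [[0. 1. 2.]
#  [1. 2. 3.]
#  [2. 3. 4.]
#  [3. 4. 5.]]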

View file

@ -5,8 +5,10 @@ from .nodal import *
from .layer_base import *
from .utility import *
class Model:
def __init__(self, nodes_in, nodes_out, loss=None, mloss=None, unsafe=False):
def __init__(self, nodes_in, nodes_out,
loss=None, mloss=None, unsafe=False):
self.loss = loss if loss is not None else SquaredHalved()
self.mloss = mloss if mloss is not None else loss
@ -29,7 +31,8 @@ class Model:
return self.nodes
def make_weights(self):
self.param_count = sum((node.size for node in self.nodes if not node.shared))
self.param_count = sum((node.size for node in self.nodes
if not node.shared))
self.W = np.zeros(self.param_count, dtype=_f)
self.dW = np.zeros(self.param_count, dtype=_f)
@ -47,37 +50,42 @@ class Model:
assert size == len(ret[0]), (size, len(ret[0]))
return ret
fmt = "Layer {} allocated {} weights than it said it would"
node.init(allocate)
assert inner_offset <= node.size, "Layer {} allocated more weights than it said it would".format(node)
assert inner_offset <= node.size, fmt.format("more", node)
# i don't care if "less" is grammatically incorrect.
# you're mom is grammatically incorrect.
assert inner_offset >= node.size, "Layer {} allocated less weights than it said it would".format(node)
assert inner_offset >= node.size, fmt.format("less", node)
offset += node.size
def evaluate(self, input_, deterministic=True):
assert len(self.nodes_in) == 1, "ambiguous input in multi-input network; use evaluate_multi() instead"
assert len(self.nodes_out) == 1, "ambiguous output in multi-output network; use evaluate_multi() instead"
fmt = "ambiguous input in multi-{} network; use {}() instead"
assert len(self.nodes_in) == 1, fmt.format("input", "evaluate_multi")
assert len(self.nodes_out) == 1, fmt.format("output", "evaluate_multi")
node_in = self.nodes_in[0]
node_out = self.nodes_out[0]
outputs = self.evaluate_multi({node_in: input_}, deterministic)
return outputs[node_out]
def apply(self, error): # TODO: better name?
assert len(self.nodes_in) == 1, "ambiguous input in multi-input network; use apply_multi() instead"
assert len(self.nodes_out) == 1, "ambiguous output in multi-output network; use apply_multi() instead"
fmt = "ambiguous input in multi-{} network; use {}() instead"
assert len(self.nodes_in) == 1, fmt.format("input", "apply_multi")
assert len(self.nodes_out) == 1, fmt.format("output", "apply_multi")
node_in = self.nodes_in[0]
node_out = self.nodes_out[0]
inputs = self.apply_multi({node_out: error})
return inputs[node_in]
def evaluate_multi(self, inputs, deterministic=True):
fmt = "missing {} for node {}"
values = dict()
outputs = dict()
for node in self.nodes:
if node in self.nodes_in:
assert node in inputs, "missing input for node {}".format(node.name)
assert node in inputs, fmt.format("input", node.name)
X = inputs[node]
values[node] = node._propagate(np.expand_dims(X, 0), deterministic)
values[node] = node._propagate(np.expand_dims(X, 0),
deterministic)
else:
values[node] = node.propagate(values, deterministic)
if node in self.nodes_out:
@ -85,11 +93,12 @@ class Model:
return outputs
def apply_multi(self, outputs):
fmt = "missing {} for node {}"
values = dict()
inputs = dict()
for node in reversed(self.nodes):
if node in self.nodes_out:
assert node in outputs, "missing output for node {}".format(node.name)
assert node in outputs, fmt.format("output", node.name)
X = outputs[node]
values[node] = node._backpropagate(np.expand_dims(X, 0))
else:
@ -135,13 +144,17 @@ class Model:
def load_weights(self, fn):
# seemingly compatible with keras' Dense layers.
weights = {}
import h5py
open(fn) # just ensure the file exists (python's error is better)
f = h5py.File(fn, 'r')
weights = {}
def visitor(name, obj):
if isinstance(obj, h5py.Dataset):
weights[name.split('/')[-1]] = np.array(obj[:], dtype=_f)
f.visititems(visitor)
f.close()
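
For reference, a self-contained sketch of the visititems pattern used by load_weights above, written against an in-memory HDF5 file; the nested group layout here is hypothetical (roughly Keras-like), not a guarantee of what Keras writes:

import numpy as np
import h5py

_f = np.float32

# throwaway in-memory file; nothing touches disk.
f = h5py.File('dummy.h5', 'w', driver='core', backing_store=False)
f.create_dataset('dense_1/dense_1/kernel:0', data=np.ones((3, 2)))
f.create_dataset('dense_1/dense_1/bias:0', data=np.zeros(2))

weights = {}

def visitor(name, obj):
    # visititems walks every group and dataset; keep datasets only,
    # keyed by the last path component, as in load_weights above.
    if isinstance(obj, h5py.Dataset):
        weights[name.split('/')[-1]] = np.array(obj[:], dtype=_f)

f.visititems(visitor)
f.close()
print(sorted(weights))    # ['bias:0', 'kernel:0']
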
@ -194,5 +207,7 @@ class Model:
children = [str(n) for n in node.children]
if children:
sep = '->'
print('\t' + str(node) + sep + (';\n\t' + str(node) + sep).join(children) + ';', file=file)
print('\t' + str(node) + sep +
(';\n\t' + str(node) + sep).join(children) + ';',
file=file)
print('}', file=file)

View file

@ -5,6 +5,7 @@ class DummyNode:
self.children = children if children is not None else []
self.parents = parents if parents is not None else []
def traverse(node_in, node_out, nodes=None, dummy_mode=False):
# i have no idea if this is any algorithm in particular.
nodes = nodes if nodes is not None else []
@ -27,7 +28,7 @@ def traverse(node_in, node_out, nodes=None, dummy_mode=False):
if not seen_up[node]:
continue
parents_added = (parent in nodes for parent in node.parents)
if not node in nodes and all(parents_added):
if node not in nodes and all(parents_added):
nodes.append(node)
for child in node.children:
q.append(child)
@ -37,6 +38,7 @@ def traverse(node_in, node_out, nodes=None, dummy_mode=False):
return nodes
def traverse_all(nodes_in, nodes_out, nodes=None):
all_in = DummyNode(children=nodes_in)
all_out = DummyNode(parents=nodes_out)
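
Regarding the "no idea if this is any algorithm in particular" note above: appending a node only once all of its parents have been appended is essentially a topological ordering (Kahn's algorithm). A minimal standalone sketch of the idea on a hypothetical diamond graph, not the library's traverse():

from collections import deque

def toposort(children, parents):
    # children/parents map each node name to a list of node names.
    indegree = {n: len(parents.get(n, [])) for n in children}
    order, q = [], deque(n for n, d in indegree.items() if d == 0)
    while q:
        node = q.popleft()
        order.append(node)
        for child in children.get(node, []):
            indegree[child] -= 1
            if indegree[child] == 0:
                q.append(child)
    return order

children = {'in': ['a', 'b'], 'a': ['out'], 'b': ['out'], 'out': []}
parents = {'a': ['in'], 'b': ['in'], 'out': ['a', 'b']}
print(toposort(children, parents))    # ['in', 'a', 'b', 'out']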

View file

@ -7,6 +7,7 @@ from .utility import *
# some of the following optimizers are blatantly lifted from tiny-dnn:
# https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/optimizers/optimizer.h
class Momentum(Optimizer):
def __init__(self, lr=0.01, mu=0.9, nesterov=False):
self.mu = _f(mu) # momentum
@ -28,6 +29,7 @@ class Momentum(Optimizer):
return V
class Adagrad(Optimizer):
def __init__(self, lr=0.01, eps=1e-8):
self.eps = _f(eps)
@ -44,6 +46,7 @@ class Adagrad(Optimizer):
self.g += np.square(dW)
return -self.lr * dW / (np.sqrt(self.g) + self.eps)
class RMSprop(Optimizer):
# RMSprop generalizes* Adagrad, etc.
@ -70,12 +73,13 @@ class RMSprop(Optimizer):
if self.g is None:
self.g = np.zeros_like(dW)
# basically apply a first-order low-pass filter to delta squared
# basically apply a first-order low-pass filter to delta squared,
self.g += (1 - self.mu) * (np.square(dW) - self.g)
# finally sqrt it to complete the running root-mean-square approximation
# and sqrt it to complete the running root-mean-square approximation.
return -self.lr * dW / (np.sqrt(self.g) + self.eps)
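
The "first-order low-pass filter" comment describes an exponential moving average of the squared gradient; after the sqrt, g tracks the gradient's running RMS. A tiny standalone illustration with made-up numbers (mu and lr here are arbitrary, not the optimizer's defaults):

import numpy as np

mu, lr, eps = 0.99, 1e-4, 1e-8
g = np.zeros(3)
rng = np.random.default_rng(0)

for _ in range(2000):
    dW = rng.normal(0.0, 2.0, size=3)       # fake gradient with std 2
    g += (1 - mu) * (np.square(dW) - g)     # same filter update as above

print(np.sqrt(g))                           # hovers around 2, the RMS of dW
step = -lr * dW / (np.sqrt(g) + eps)        # the resulting normalized step
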
class RMSpropCentered(Optimizer):
# referenced TensorFlow/PyTorch.
# paper: https://arxiv.org/pdf/1308.0850v5.pdf
@ -115,10 +119,11 @@ class RMSpropCentered(Optimizer):
self.delta[:] = self.momentum * self.delta + self.lr * temp
return -self.delta
# PyTorch does it this way.
#self.delta[:] = self.momentum * self.delta + temp
#return -self.lr * self.delta
# self.delta[:] = self.momentum * self.delta + temp
# return -self.lr * self.delta
# they are equivalent only when LR is constant, which it might not be.
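
A small numerical check of the "equivalent only when LR is constant" remark: folding lr into the accumulator (as above) and applying lr afterwards (the PyTorch form) produce identical steps for a fixed lr, but diverge once lr changes mid-run. Illustrative numbers only:

import numpy as np

momentum = 0.9
grads = [1.0, 0.5, -0.25, 2.0]

def run(lrs):
    d_a = d_b = 0.0
    steps_a, steps_b = [], []
    for lr, temp in zip(lrs, grads):
        d_a = momentum * d_a + lr * temp     # lr folded into the accumulator
        steps_a.append(-d_a)
        d_b = momentum * d_b + temp          # PyTorch-style: lr applied at the end
        steps_b.append(-lr * d_b)
    return steps_a, steps_b

a, b = run([0.1] * 4)                        # constant lr
print(np.allclose(a, b))                     # True
a, b = run([0.1, 0.1, 0.01, 0.01])           # lr drops mid-run
print(np.allclose(a, b))                     # False
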
class Adam(Optimizer):
# paper: https://arxiv.org/abs/1412.6980
# Adam generalizes* RMSprop, and
@ -161,12 +166,14 @@ class Adam(Optimizer):
return -self.lr * (self.mt / (1 - self.b1_t)) \
/ (np.sqrt(self.vt / (1 - self.b2_t)) + self.eps)
class Nadam(Optimizer):
# paper: https://arxiv.org/abs/1412.6980
# paper: http://cs229.stanford.edu/proj2015/054_report.pdf
# TODO: double-check this implementation. also read the damn paper.
# lifted from https://github.com/fchollet/keras/blob/5d38b04/keras/optimizers.py#L530
# lifted from https://github.com/jpilaul/IFT6266_project/blob/master/Models/Algo_Momentum.py
# lifted from:
# https://github.com/fchollet/keras/blob/5d38b04/keras/optimizers.py#L530
# https://github.com/jpilaul/IFT6266_project/blob/master/Models/Algo_Momentum.py
def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8):
self.b1 = _f(b1) # decay term
@ -208,6 +215,7 @@ class Nadam(Optimizer):
return -self.lr * mt_bar / (np.sqrt(vtp) + self.eps)
# more
class FTML(Optimizer):
@ -231,10 +239,14 @@ class FTML(Optimizer):
self.b2_t = _1
def compute(self, dW, W):
if self.dt1 is None: self.dt1 = np.zeros_like(dW)
if self.dt is None: self.dt = np.zeros_like(dW)
if self.vt is None: self.vt = np.zeros_like(dW)
if self.zt is None: self.zt = np.zeros_like(dW)
if self.dt1 is None:
self.dt1 = np.zeros_like(dW)
if self.dt is None:
self.dt = np.zeros_like(dW)
if self.vt is None:
self.vt = np.zeros_like(dW)
if self.zt is None:
self.zt = np.zeros_like(dW)
# NOTE: we could probably rewrite these equations to avoid this copy.
self.dt1[:] = self.dt[:]
@ -260,6 +272,7 @@ class FTML(Optimizer):
# subtract by weights to avoid having to override self.update.
return -self.zt / self.dt - W
class MomentumClip(Optimizer):
def __init__(self, lr=0.01, mu=0.9, nesterov=False, clip=1.0, debug=False):
self.mu = _f(mu)
@ -289,11 +302,14 @@ class MomentumClip(Optimizer):
else:
return -self.lr * self.accum
class YellowFin(Optimizer):
# paper: https://arxiv.org/abs/1706.03471
# knowyourmeme: http://cs.stanford.edu/~zjian/project/YellowFin/
# author's implementation: https://github.com/JianGoForIt/YellowFin/blob/master/tuner_utils/yellowfin.py
# code lifted: https://gist.github.com/botev/f8b32c00eafee222e47393f7f0747666
# author's implementation:
# https://github.com/JianGoForIt/YellowFin/blob/master/tuner_utils/yellowfin.py
# code lifted:
# https://gist.github.com/botev/f8b32c00eafee222e47393f7f0747666
def __init__(self, lr=0.1, mu=0.0, beta=0.999, window_size=20,
debias=True, clip=1.0):
@ -316,13 +332,13 @@ class YellowFin(Optimizer):
self.step = 0
self.beta_t = self.beta
self.curv_win = np.zeros([self.window_size,], dtype=np.float32)
self.curv_win = np.zeros([self.window_size, ], dtype=np.float32)
self.h_min = None
self.h_max = None
self.g_lpf = 0
#self.g_squared_lpf = 0
# self.g_squared_lpf = 0
self.g_norm_squared_lpf = 0
self.g_norm_lpf = 0
self.h_min_lpf = 0
@ -332,7 +348,8 @@ class YellowFin(Optimizer):
self.mu_lpf = 0
def get_lr_mu(self):
p = (np.square(self.dist_avg) * np.square(self.h_min)) / (2 * self.g_var)
p = (np.square(self.dist_avg) * np.square(self.h_min)) \
/ (2 * self.g_var)
w3 = p * (np.sqrt(0.25 + p / 27.0) - 0.5)
w = np.power(w3, 1/3)
y = w - p / (3 * w)
@ -360,11 +377,11 @@ class YellowFin(Optimizer):
total_norm = np.linalg.norm(dW)
clip_scale = self.clip / (total_norm + 1e-6)
if clip_scale < 1:
#print("clipping gradients; norm: {:10.5f}".format(total_norm))
# print("clipping gradients; norm: {:10.5f}".format(total_norm))
dW *= clip_scale
#fmt = 'W std: {:10.7f}e-3, dWstd: {:10.7f}e-3, V std: {:10.7f}e-3'
#print(fmt.format(np.std(W), np.std(dW) * 100, np.std(V) * 100))
# fmt = 'W std: {:10.7f}e-3, dWstd: {:10.7f}e-3, V std: {:10.7f}e-3'
# print(fmt.format(np.std(W), np.std(dW) * 100, np.std(V) * 100))
b = self.beta
m1b = 1 - self.beta
@ -381,14 +398,15 @@ class YellowFin(Optimizer):
h_max_t = np.max(valid_window)
self.g_lpf = b * self.g_lpf + m1b * g
#self.g_squared_lpf = b * self.g_squared_lpf + m1b * g_squared
self.g_norm_squared_lpf = b * self.g_norm_squared_lpf + m1b * g_norm_squared
# self.g_squared_lpf = b * self.g_squared_lpf + m1b * g_squared
self.g_norm_squared_lpf = b * self.g_norm_squared_lpf \
+ m1b * g_norm_squared
self.g_norm_lpf = b * self.g_norm_lpf + m1b * g_norm
self.h_min_lpf = b * self.h_min_lpf + m1b * h_min_t
self.h_max_lpf = b * self.h_max_lpf + m1b * h_max_t
g_avg = debias * self.g_lpf
#g_squared_avg = debias * self.g_squared_lpf
# g_squared_avg = debias * self.g_squared_lpf
g_norm_squared_avg = debias * self.g_norm_squared_lpf
g_norm_avg = debias * self.g_norm_lpf
self.h_min = debias * self.h_min_lpf
@ -403,7 +421,7 @@ class YellowFin(Optimizer):
self.g_var = g_norm_squared_avg - np.sum(np.square(g_avg))
# equivalently:
#self.g_var = np.sum(np.abs(g_squared_avg - np.square(g_avg)))
# self.g_var = np.sum(np.abs(g_squared_avg - np.square(g_avg)))
if self.step > 0:
lr_for_real, mu_for_real = self.get_lr_mu()
@ -419,6 +437,7 @@ class YellowFin(Optimizer):
self.beta_t *= self.beta
return V
class AddSign(Optimizer):
# paper: https://arxiv.org/abs/1709.07417
@ -438,10 +457,11 @@ class AddSign(Optimizer):
self.accum[:] = self.accum * self.mu + dW
signed = np.sign(dW) * np.sign(self.accum)
#signed *= decay
# signed *= decay
return -self.lr * dW * (self.alpha + signed)
class PowerSign(Optimizer):
# paper: https://arxiv.org/abs/1709.07417
@ -462,13 +482,14 @@ class PowerSign(Optimizer):
self.accum[:] = self.accum * self.mu + dW
signed = np.sign(dW) * np.sign(self.accum)
#signed *= decay
# signed *= decay
if self.use_exp:
return -self.lr * dW * np.exp(signed)
else:
return -self.lr * dW * np.power(self.alpha, signed)
class Neumann(Optimizer):
# paper: https://arxiv.org/abs/1712.03298
# NOTE: this implementation is missing resetting as described in the paper.
@ -529,4 +550,3 @@ class Neumann(Optimizer):
# weights and accumulator:
W += mu * self.mt - self.lr * dt
self.vt = W + self.gamma * (self.vt - W)

View file

@ -2,6 +2,7 @@ import numpy as np
from .float import *
class Optimizer:
def __init__(self, lr=0.1):
self.lr = _f(lr) # learning rate
@ -15,5 +16,3 @@ class Optimizer:
def update(self, dW, W):
W += self.compute(dW, W)

View file

@ -4,6 +4,7 @@ from .float import *
from .layer_base import *
from .initialization import *
class Bias(Layer):
# TODO: support axes other than -1 and shapes other than 1D.
@ -28,6 +29,7 @@ class Bias(Layer):
self.biases.g += dY.sum(0)
return dY
class Dense(Layer):
serialized = {
'W': 'coeffs',
@ -38,8 +40,10 @@ class Dense(Layer):
super().__init__()
self.dim = int(dim)
self.output_shape = (dim,)
self.coeffs = self._new_weights('coeffs', init=init, regularizer=reg_w)
self.biases = self._new_weights('biases', init=init_zeros, regularizer=reg_b)
self.coeffs = self._new_weights('coeffs', init=init,
regularizer=reg_w)
self.biases = self._new_weights('biases', init=init_zeros,
regularizer=reg_b)
def make_shape(self, parent):
shape = parent.output_shape
@ -101,18 +105,20 @@ class Conv1Dper(Layer):
def forward(self, X):
if self.wrap0 == 0:
Xper = np.hstack((X,X[:,:self.wrap1]))
Xper = np.hstack((X, X[:, :self.wrap1]))
elif self.wrap1 == 0:
Xper = np.hstack((X[:,-self.wrap0:],X))
Xper = np.hstack((X[:, -self.wrap0:], X))
else:
Xper = np.hstack((X[:,-self.wrap0:],X,X[:,:self.wrap1]))
Xper = np.hstack((X[:, -self.wrap0:], X, X[:, :self.wrap1]))
self.cols = rolling_batch(Xper, self.kernel_size)
convolved = (self.cols * self.coeffs.f[:,::-1]).sum(2)
convolved = (self.cols * self.coeffs.f[:, ::-1]).sum(2)
return convolved
def backward(self, dY):
self.coeffs.g += (dY[:,:,None] * self.cols).sum(0)[:,::-1].sum(0, keepdims=True)
return (dY[:,:,None] * self.coeffs.f[:,::-1]).sum(2)
self.coeffs.g += (dY[:, :, None] * self.cols).sum(0)[:, ::-1].sum(
0, keepdims=True)
return (dY[:, :, None] * self.coeffs.f[:, ::-1]).sum(2)
class LayerNorm(Layer):
# paper: https://arxiv.org/abs/1607.06450
@ -168,6 +174,7 @@ class LayerNorm(Layer):
return dX
class Denses(Layer): # TODO: rename?
# acts as a separate Dense for each row or column. only for 2D arrays.
@ -176,13 +183,16 @@ class Denses(Layer): # TODO: rename?
'b': 'biases',
}
def __init__(self, dim, init=init_he_uniform, reg_w=None, reg_b=None, axis=-1):
def __init__(self, dim, init=init_he_uniform,
reg_w=None, reg_b=None, axis=-1):
super().__init__()
self.dim = int(dim)
self.weight_init = init
self.axis = int(axis)
self.coeffs = self._new_weights('coeffs', init=init, regularizer=reg_w)
self.biases = self._new_weights('biases', init=init_zeros, regularizer=reg_b)
self.coeffs = self._new_weights('coeffs', init=init,
regularizer=reg_w)
self.biases = self._new_weights('biases', init=init_zeros,
regularizer=reg_b)
def make_shape(self, parent):
shape = parent.output_shape
@ -220,9 +230,11 @@ class Denses(Layer): # TODO: rename?
self.coeffs.g += np.einsum('ijx,ijk->jxk', self.X, dY)
return np.einsum('ijk,jxk->ijx', dY, self.coeffs.f)
class CosineDense(Dense):
# paper: https://arxiv.org/abs/1702.05870
# another implementation: https://github.com/farizrahman4u/keras-contrib/pull/36
# another implementation:
# https://github.com/farizrahman4u/keras-contrib/pull/36
# the paper doesn't mention bias,
# so we treat bias as an additional weight with a constant input of 1.
# this is correct in Dense layers, so i hope it's correct here too.
@ -231,9 +243,9 @@ class CosineDense(Dense):
def forward(self, X):
self.X = X
self.X_norm = np.sqrt(np.square(X).sum(-1, keepdims=True) \
self.X_norm = np.sqrt(np.square(X).sum(-1, keepdims=True)
+ 1 + self.eps)
self.W_norm = np.sqrt(np.square(self.coeffs.f).sum(0, keepdims=True) \
self.W_norm = np.sqrt(np.square(self.coeffs.f).sum(0, keepdims=True)
+ np.square(self.biases.f) + self.eps)
self.dot = X @ self.coeffs.f + self.biases.f
Y = self.dot / (self.X_norm * self.W_norm)
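
The "bias as an additional weight with a constant input of 1" comment can be checked directly: the forward above (eps aside) is exactly the cosine similarity between the augmented input [X, 1] and each augmented weight column [w, b]. A small sketch with made-up shapes:

import numpy as np

rng = np.random.default_rng(1)
X = rng.normal(size=(5, 3)).astype(np.float32)     # batch of 5, 3 features
W = rng.normal(size=(3, 4)).astype(np.float32)     # 4 output units
b = rng.normal(size=(4,)).astype(np.float32)

# forward as written above, with eps dropped for clarity:
X_norm = np.sqrt(np.square(X).sum(-1, keepdims=True) + 1)
W_norm = np.sqrt(np.square(W).sum(0, keepdims=True) + np.square(b))
Y = (X @ W + b) / (X_norm * W_norm)

# cosine similarity of [X, 1] against [W; b]:
Xa = np.hstack([X, np.ones((5, 1), dtype=np.float32)])
Wa = np.vstack([W, b[None, :]])
cos = (Xa @ Wa) / (np.linalg.norm(Xa, axis=-1, keepdims=True)
                   * np.linalg.norm(Wa, axis=0, keepdims=True))

print(np.allclose(Y, cos))    # True
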
@ -241,8 +253,10 @@ class CosineDense(Dense):
def backward(self, dY):
ddot = dY / self.X_norm / self.W_norm
dX_norm = -(dY * self.dot / self.W_norm).sum(-1, keepdims=True) / self.X_norm**2
dW_norm = -(dY * self.dot / self.X_norm).sum( 0, keepdims=True) / self.W_norm**2
dX_norm = -(dY * self.dot / self.W_norm).sum(-1, keepdims=True) \
/ self.X_norm**2
dW_norm = -(dY * self.dot / self.X_norm).sum(0, keepdims=True) \
/ self.W_norm**2
self.coeffs.g += self.X.T @ ddot \
+ dW_norm / self.W_norm * self.coeffs.f
@ -251,4 +265,3 @@ class CosineDense(Dense):
dX = ddot @ self.coeffs.f.T + dX_norm / self.X_norm * self.X
return dX

View file

@ -2,9 +2,11 @@ import numpy as np
from .float import *
class Regularizer:
pass
class L1L2(Regularizer):
def __init__(self, l1=0.0, l2=0.0):
self.l1 = _f(l1)
@ -26,6 +28,7 @@ class L1L2(Regularizer):
df += self.l2 * 2 * X
return df
# more
class SaturateRelu(Regularizer):

View file

@ -4,6 +4,7 @@ from .float import *
from .initialization import *
from .ritual_base import *
def stochastic_multiply(W, gamma=0.5, allow_negation=False):
# paper: https://arxiv.org/abs/1606.01981
@ -23,6 +24,7 @@ def stochastic_multiply(W, gamma=0.5, allow_negation=False):
mult *= np.where(samples < prob, 1, -1)
np.multiply(W, mult, out=W)
class StochMRitual(Ritual):
# paper: https://arxiv.org/abs/1606.01981
# this probably doesn't make sense for regression problems,
@ -38,8 +40,8 @@ class StochMRitual(Ritual):
def learn(self, inputs, outputs):
# an experiment:
#assert self.learner.rate < 10, self.learner.rate
#self.gamma = 1 - 1/2**(1 - np.log10(self.learner.rate))
# assert self.learner.rate < 10, self.learner.rate
# self.gamma = 1 - 1/2**(1 - np.log10(self.learner.rate))
self.W[:] = self.model.W
for layer in self.model.ordered_nodes:
@ -57,6 +59,7 @@ class StochMRitual(Ritual):
np.clip(layer.W, -layer.std * f, layer.std * f, out=layer.W)
# np.clip(layer.W, -1, 1, out=layer.W)
class NoisyRitual(Ritual):
def __init__(self, learner=None,
input_noise=0, output_noise=0, gradient_noise=0):
@ -80,11 +83,10 @@ class NoisyRitual(Ritual):
if self.gradient_noise > 0:
size = len(self.model.dW)
gamma = 0.55
#s = self.gradient_noise / (1 + self.bn) ** gamma
# s = self.gradient_noise / (1 + self.bn) ** gamma
# experiments:
s = self.gradient_noise * np.sqrt(self.learner.rate)
#s = np.square(self.learner.rate)
#s = self.learner.rate / self.en
# s = np.square(self.learner.rate)
# s = self.learner.rate / self.en
self.model.dW += np.random.normal(0, max(s, 1e-8), size=size)
super().update()

View file

@ -3,6 +3,7 @@ import numpy as np
from .float import *
class Ritual: # i'm just making up names at this point.
def __init__(self, learner=None):
self.learner = learner if learner is not None else Learner(Optimizer())
@ -77,7 +78,8 @@ class Ritual: # i'm just making up names at this point.
if shuffle:
if gen:
raise Exception("shuffling is incompatibile with using a generator.")
raise Exception(
"shuffling is incompatibile with using a generator.")
indices = np.arange(inputs.shape[0])
np.random.shuffle(indices)
inputs = inputs[indices]
@ -101,17 +103,20 @@ class Ritual: # i'm just making up names at this point.
batch_inputs, batch_outputs = next(generator)
batch_size = batch_inputs.shape[0]
# TODO: lift this restriction
assert batch_size == prev_batch_size or prev_batch_size is None, \
"non-constant batch size (got {}, expected {})".format(batch_size, prev_batch_size)
fmt = "non-constant batch size (got {}, expected {})"
assert (batch_size == prev_batch_size
or prev_batch_size is None), \
fmt.format(batch_size, prev_batch_size)
else:
bi = b * batch_size
batch_inputs = inputs[ bi:bi+batch_size]
batch_inputs = inputs[bi:bi+batch_size]
batch_outputs = outputs[bi:bi+batch_size]
if clear_grad:
self.model.clear_grad()
self._train_batch(batch_inputs, batch_outputs, b, batch_count,
test_only, return_losses=='both', return_losses)
test_only, return_losses == 'both',
return_losses)
prev_batch_size = batch_size

View file

@ -1,17 +1,23 @@
import sys
def lament(*args, **kwargs):
print(*args, file=sys.stderr, **kwargs)
def lower_priority():
"""Set the priority of the process to below-normal."""
# via https://stackoverflow.com/a/1023269
if sys.platform == 'win32':
try:
import win32api, win32process, win32con
import win32api
import win32process
import win32con
pid = win32api.GetCurrentProcessId()
handle = win32api.OpenProcess(win32con.PROCESS_ALL_ACCESS, True, pid)
win32process.SetPriorityClass(handle, win32process.BELOW_NORMAL_PRIORITY_CLASS)
handle = win32api.OpenProcess(
win32con.PROCESS_ALL_ACCESS, True, pid)
win32process.SetPriorityClass(
handle, win32process.BELOW_NORMAL_PRIORITY_CLASS)
except ImportError:
lament("you do not have pywin32 installed.")
lament("the process priority could not be lowered.")
@ -21,9 +27,12 @@ def lower_priority():
import os
os.nice(1)
# more
_log_was_update = False
def log(left, right, update=False):
s = "\x1B[1m {:>20}:\x1B[0m {}".format(left, right)
global _log_was_update
@ -33,5 +42,6 @@ def log(left, right, update=False):
lament(s)
_log_was_update = update
class Dummy:
pass

View file

@ -1,5 +1,6 @@
import numpy as np
class Weights:
# we may or may not contain weights -- or any information, for that matter.