basic PEP 8 compliance

rip readability
Connor Olding 2018-01-22 19:40:36 +00:00
parent c81ce0afbb
commit 169303813d
19 changed files with 282 additions and 150 deletions

View file

@@ -1,5 +1,5 @@
# external packages required for full functionality:
-# numpy scipy h5py sklearn dotmap
+# numpy scipy h5py sklearn
# BIG TODO: ensure numpy isn't upcasting to float64 *anywhere*.
# this is gonna take some work.

View file

@@ -6,6 +6,7 @@ from scipy.special import expit as sigmoid
from .float import *
from .layer_base import *

class Identity(Layer):
    def forward(self, X):
        return X

@@ -13,7 +14,8 @@ class Identity(Layer):
    def backward(self, dY):
        return dY

-class Sigmoid(Layer): # aka Logistic, Expit (inverse of Logit)
+class Sigmoid(Layer):  # aka Logistic, Expit (inverse of Logit)
    def forward(self, X):
        self.sig = sigmoid(X)
        return self.sig

@@ -21,6 +23,7 @@ class Sigmoid(Layer): # aka Logistic, Expit (inverse of Logit)
    def backward(self, dY):
        return dY * self.sig * (1 - self.sig)

class Softplus(Layer):
    # integral of Sigmoid.

@@ -31,6 +34,7 @@ class Softplus(Layer):
    def backward(self, dY):
        return dY * sigmoid(self.X)

class Tanh(Layer):
    def forward(self, X):
        self.sig = np.tanh(X)

@@ -39,6 +43,7 @@ class Tanh(Layer):
    def backward(self, dY):
        return dY * (1 - self.sig * self.sig)

class LeCunTanh(Layer):
    # paper: http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf
    # paper: http://yann.lecun.com/exdb/publis/pdf/lecun-89.pdf

@@ -53,6 +58,7 @@ class LeCunTanh(Layer):
    def backward(self, dY):
        return dY * (2 / 3 * 1.7159) * (1 - self.sig * self.sig)

class Relu(Layer):
    def forward(self, X):
        self.cond = X >= 0

@@ -61,12 +67,13 @@ class Relu(Layer):
    def backward(self, dY):
        return np.where(self.cond, dY, 0)

class Elu(Layer):
    # paper: https://arxiv.org/abs/1511.07289

    def __init__(self, alpha=1):
        super().__init__()
        self.alpha = _f(alpha)  # FIXME: unused

    def forward(self, X):
        self.cond = X >= 0

@@ -76,6 +83,7 @@ class Elu(Layer):
    def backward(self, dY):
        return dY * np.where(self.cond, 1, self.neg + 1)

class GeluApprox(Layer):
    # paper: https://arxiv.org/abs/1606.08415
    # plot: https://www.desmos.com/calculator/ydzgtccsld

@@ -88,6 +96,7 @@ class GeluApprox(Layer):
    def backward(self, dY):
        return dY * self.sig * (1 + self.a * (1 - self.sig))

class Softmax(Layer):
    def forward(self, X):
        alpha = np.max(X, axis=-1, keepdims=True)

@@ -99,6 +108,7 @@ class Softmax(Layer):
    def backward(self, dY):
        return (dY - np.sum(dY * self.sm, axis=-1, keepdims=True)) * self.sm

class LogSoftmax(Softmax):
    def __init__(self, eps=1e-6):
        super().__init__()

@@ -110,6 +120,7 @@ class LogSoftmax(Softmax):
    def backward(self, dY):
        return dY - np.sum(dY, axis=-1, keepdims=True) * self.sm

class Cos(Layer):
    # performs well on MNIST for some strange reason.

@@ -120,6 +131,7 @@ class Cos(Layer):
    def backward(self, dY):
        return dY * -np.sin(self.X)

class Selu(Layer):
    # paper: https://arxiv.org/abs/1706.02515

@@ -136,6 +148,7 @@ class Selu(Layer):
    def backward(self, dY):
        return dY * self.lamb * np.where(self.cond, 1, self.neg)

# more

class TanhTest(Layer):

@@ -146,6 +159,7 @@ class TanhTest(Layer):
    def backward(self, dY):
        return dY * (1 / 2 * 2.4004) * (1 - self.sig * self.sig)

class ExpGB(Layer):
    # an output layer for one-hot classification problems.
    # use with MSE (SquaredHalved), not CategoricalCrossentropy!

@@ -163,6 +177,7 @@ class ExpGB(Layer):
        # this gradient is intentionally incorrect.
        return dY

class CubicGB(Layer):
    # an output layer for one-hot classification problems.
    # use with MSE (SquaredHalved), not CategoricalCrossentropy!

@@ -182,4 +197,3 @@ class CubicGB(Layer):
    def backward(self, dY):
        # this gradient is intentionally incorrect.
        return dY
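
Aside (not part of the commit): the activation pairs above all follow the same contract; forward() caches what backward() needs, and backward() multiplies dY by the local derivative. A minimal standalone sketch, numpy/scipy only, checking the Sigmoid derivative shown above against a finite difference:

import numpy as np
from scipy.special import expit as sigmoid

def sigmoid_backward(X, dY):
    sig = sigmoid(X)  # the same quantity Sigmoid.forward caches as self.sig
    return dY * sig * (1 - sig)

X = np.random.randn(5)
eps = 1e-5
numeric = (sigmoid(X + eps) - sigmoid(X - eps)) / (2 * eps)
analytic = sigmoid_backward(X, np.ones_like(X))
assert np.allclose(numeric, analytic, atol=1e-6)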

View file

@@ -2,11 +2,13 @@ import numpy as np
_f = np.float32

def _check(a):
    assert isinstance(a, np.ndarray) or type(a) == _f, type(a)
    assert a.dtype == _f, a.dtype
    return a

_0 = _f(0)
_1 = _f(1)
_2 = _f(2)
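
Aside (not part of the commit): _check above is the tool for the "no float64 upcasting" TODO in the first hunk; any operation that quietly promotes a float32 array trips the dtype assertion. A small sketch of the kind of promotion it catches:

import numpy as np
_f = np.float32

x = np.ones(3, dtype=_f)
y = x * np.ones(3)     # np.ones defaults to float64, so the product upcasts
assert x.dtype == _f
assert y.dtype != _f   # this is what _check's dtype assertion would flag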

View file

@@ -2,28 +2,35 @@ import numpy as np
# note: these are currently only implemented for 2D shapes.

def init_zeros(size, ins=None, outs=None):
    return np.zeros(size)

def init_ones(size, ins=None, outs=None):
    return np.ones(size)

def init_he_normal(size, ins, outs):
    s = np.sqrt(2 / ins)
    return np.random.normal(0, s, size=size)

def init_he_uniform(size, ins, outs):
    s = np.sqrt(6 / ins)
    return np.random.uniform(-s, s, size=size)

def init_glorot_normal(size, ins, outs):
    s = np.sqrt(2 / (ins + outs))
    return np.random.normal(0, s, size=size)

def init_glorot_uniform(size, ins, outs):
    s = np.sqrt(6 / (ins + outs))
    return np.random.uniform(-s, s, size=size)

# more

def init_gaussian_unit(size, ins, outs):
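
Aside (not part of the commit): a quick numerical check of init_he_uniform above. uniform(-s, s) has variance s**2 / 3, so s = sqrt(6 / ins) yields the intended He variance of 2 / ins:

import numpy as np

def init_he_uniform(size, ins, outs):
    s = np.sqrt(6 / ins)
    return np.random.uniform(-s, s, size=size)

W = init_he_uniform((256, 256), ins=256, outs=256)
print(W.var())  # should be close to 2 / 256 = 0.0078125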

View file

@@ -2,6 +2,7 @@ from .layer_base import *
from .initialization import *
from .float import *

# Nonparametric Layers {{{1

class Input(Layer):

@@ -16,9 +17,10 @@ class Input(Layer):
        return X

    def backward(self, dY):
-        #self.dY = dY
+        # self.dY = dY
        return np.zeros_like(dY)

class Reshape(Layer):
    def __init__(self, new_shape):
        super().__init__()

@@ -33,6 +35,7 @@ class Reshape(Layer):
        assert dY.shape[0] == self.batch_size
        return dY.reshape(self.batch_size, *self.input_shape)

class Flatten(Layer):
    def make_shape(self, parent):
        shape = parent.output_shape

@@ -47,6 +50,7 @@ class Flatten(Layer):
        assert dY.shape[0] == self.batch_size
        return dY.reshape(self.batch_size, *self.input_shape)

class ConstAffine(Layer):
    def __init__(self, a=1, b=0):
        super().__init__()

@@ -59,13 +63,15 @@ class ConstAffine(Layer):
    def backward(self, dY):
        return dY * self.a

class Sum(Layer):
    def _propagate(self, edges, deterministic):
        return np.sum(edges, axis=0)

    def _backpropagate(self, edges):
-        #assert len(edges) == 1, "unimplemented"
+        # assert len(edges) == 1, "unimplemented"
        return edges[0]  # TODO: does this always work?

class ActivityRegularizer(Layer):
    def __init__(self, reg):

@@ -81,6 +87,7 @@ class ActivityRegularizer(Layer):
    def backward(self, dY):
        return dY + self.reg.backward(self.X)

class Dropout(Layer):
    def __init__(self, dropout=0.0):
        super().__init__()

@@ -92,12 +99,13 @@ class Dropout(Layer):
        return X * self.mask

    def forward_deterministic(self, X):
-        #self.mask = _1
+        # self.mask = _1
        return X

    def backward(self, dY):
        return dY * self.mask

# more

class AlphaDropout(Layer):

@@ -136,6 +144,7 @@ class AlphaDropout(Layer):
    def backward(self, dY):
        return dY * self.a * self.mask

class Decimate(Layer):
    # simple decimaton layer that drops every other sample from the last axis.

@@ -168,6 +177,7 @@ class Decimate(Layer):
        dX.ravel()[1::2] = dY.ravel()
        return dX

class Undecimate(Layer):
    # inverse operation of Decimate. not quite interpolation.
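
Aside (not part of the commit): Dropout.forward_deterministic above returns X unscaled, which implies the training-time mask carries the 1/(1-p) correction ("inverted dropout"). The hunk does not show how the library builds its mask, so this is only a hedged sketch of that idea:

import numpy as np

def dropout_mask(shape, dropout=0.5):
    keep = 1.0 - dropout
    # zero units with probability `dropout`; scale survivors by 1/keep so the
    # expected activation matches the deterministic pass.
    return (np.random.uniform(size=shape) < keep) / keep

X = np.random.randn(4, 8)
Y_train = X * dropout_mask(X.shape)   # stochastic pass
Y_test = X                            # deterministic pass: identity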

View file

@@ -4,26 +4,29 @@ from collections import defaultdict, OrderedDict
from .weight import *

# used for numbering layers like Keras:
_layer_counters = defaultdict(lambda: 0)

class LayerIncompatibility(Exception):
    pass

class Layer:
    def __init__(self):
        self.parents = []
        self.children = []
        self.weights = OrderedDict()
        self.loss = None  # for activity regularizers
        self.input_shape = None
        self.output_shape = None
        kind = self.__class__.__name__
        global _layer_counters
        _layer_counters[kind] += 1
        self.name = "{}_{}".format(kind, _layer_counters[kind])
        self.unsafe = False  # disables assertions for better performance
        self.shared = False  # as in weight sharing

    def __str__(self):
        return self.name

@@ -40,9 +43,9 @@ class Layer:
        raise NotImplementedError("unimplemented", self)

    def make_shape(self, parent):
-        if self.input_shape == None:
+        if self.input_shape is None:
            self.input_shape = parent.output_shape
-        if self.output_shape == None:
+        if self.output_shape is None:
            self.output_shape = self.input_shape

    def do_feed(self, child):

@@ -75,16 +78,19 @@ class Layer:
        child.make_shape(self)
        if not child.is_compatible(self):
            fmt = "{} is incompatible with {}: shape mismatch: {} vs. {}"
-            raise LayerIncompatibility(fmt.format(self, child, self.output_shape, child.input_shape))
+            raise LayerIncompatibility(fmt.format(
+                self, child, self.output_shape, child.input_shape))
        self.do_feed(child)
        child.be_fed(self)
        return child

    def validate_input(self, X):
-        assert X.shape[1:] == self.input_shape, (str(self), X.shape[1:], self.input_shape)
+        assert X.shape[1:] == self.input_shape, \
+            (str(self), X.shape[1:], self.input_shape)

    def validate_output(self, Y):
-        assert Y.shape[1:] == self.output_shape, (str(self), Y.shape[1:], self.output_shape)
+        assert Y.shape[1:] == self.output_shape, \
+            (str(self), Y.shape[1:], self.output_shape)

    def _new_weights(self, name, **kwargs):
        w = Weights(**kwargs)

@@ -93,9 +99,10 @@ class Layer:
        return w

    def share(self, node):
        self.weights = node.weights  # TODO: this should be all it takes.
        for k, v in self.weights.items():
-            vs = getattr(node, k) # hack: key isn't necessarily attribute name!
+            # hack: key isn't necessarily attribute name!
+            vs = getattr(node, k)
            setattr(self, k, vs)
        self.shared = True
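
Aside (not part of the commit): the defaultdict counter in Layer.__init__ above is what produces the Keras-style names; each class gets its own sequence. In isolation:

from collections import defaultdict

_layer_counters = defaultdict(lambda: 0)

def fresh_name(kind):
    _layer_counters[kind] += 1
    return "{}_{}".format(kind, _layer_counters[kind])

print(fresh_name("Dense"))    # Dense_1
print(fresh_name("Dense"))    # Dense_2
print(fresh_name("Sigmoid"))  # Sigmoid_1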

View file

@@ -1,13 +1,14 @@
from .float import *
from .optimizer_base import *

class Learner:
    per_batch = False

    def __init__(self, optim, epochs=100, rate=None):
        assert isinstance(optim, Optimizer)
        self.optim = optim
        self.start_rate = rate  # None is okay; it'll use optim.lr instead.
        self.epochs = int(epochs)
        self.reset()

@@ -49,7 +50,7 @@ class Learner:
            return False
        return True

    def batch(self, progress):  # TODO: rename
        # interpolates rates between epochs.
        # unlike epochs, we do not store batch number as a state.
        # i.e. calling next() will not respect progress.

@@ -60,6 +61,7 @@ class Learner:
    def final_rate(self):
        return self.rate_at(self.epochs - 1e-8)

class AnnealingLearner(Learner):
    def __init__(self, optim, epochs=100, rate=None, halve_every=10):
        self.halve_every = _f(halve_every)

@@ -69,10 +71,12 @@ class AnnealingLearner(Learner):
    def rate_at(self, epoch):
        return super().rate_at(epoch) * self.anneal**epoch

def cosmod(x):
    # plot: https://www.desmos.com/calculator/hlgqmyswy2
    return (_1 + np.cos((x % _1) * _pi)) * _inv2

class SGDR(Learner):
    # Stochastic Gradient Descent with Restarts
    # paper: https://arxiv.org/abs/1608.03983

@@ -112,7 +116,8 @@ class SGDR(Learner):
            raise Exception('this should never happen.')

    def rate_at(self, epoch):
-        base_rate = self.start_rate if self.start_rate is not None else self.optim.lr
+        sr = self.start_rate
+        base_rate = sr if sr is not None else self.optim.lr
        restart, sub_epoch, next_restart = self.split_num(max(1, epoch))
        x = _f(sub_epoch - 1) / _f(next_restart)
        return base_rate * self.decay**_f(restart) * cosmod(x)

@@ -126,6 +131,7 @@ class SGDR(Learner):
            self.restart_callback(restart)
        return True

class TriangularCLR(Learner):
    per_batch = True

@@ -141,11 +147,14 @@ class TriangularCLR(Learner):
    def _t(self, epoch):
        # NOTE: this could probably be simplified
        offset = self.frequency / 2
-        return np.abs(((epoch - 1 + offset) % self.frequency) - offset) / offset
+        return np.abs(((epoch - 1 + offset) % self.frequency) - offset) \
+            / offset

    def rate_at(self, epoch):
-        upper_rate = self.start_rate if self.start_rate is not None else self.optim.lr
-        return self._t(epoch) * (upper_rate - self.lower_rate) + self.lower_rate
+        sr = self.start_rate
+        lr = self.lower_rate
+        upper_rate = sr if sr is not None else self.optim.lr
+        return self._t(epoch) * (upper_rate - lr) + lr

    def next(self):
        if not super().next():

@@ -156,14 +165,17 @@ class TriangularCLR(Learner):
            self.callback(self.epoch // self.frequency)
        return True

class SineCLR(TriangularCLR):
    def _t(self, epoch):
        return np.sin(_pi * _inv2 * super()._t(epoch))

class WaveCLR(TriangularCLR):
    def _t(self, epoch):
        return _inv2 * (_1 - np.cos(_pi * super()._t(epoch)))

# more

class PolyLearner(Learner):

@@ -177,4 +189,3 @@ class PolyLearner(Learner):
        progress = (epoch - 1) / (self.epochs)
        ret = np.polyval(self.coeffs, progress)
        return np.abs(ret)
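
Aside (not part of the commit): cosmod() above is the half-cosine window that SGDR.rate_at multiplies into the base rate, and each restart shrinks the whole curve by decay**restart. A standalone sketch of the resulting schedule; base_rate and decay stand in for the SGDR fields:

import numpy as np

def cosmod(x):
    return (1 + np.cos((x % 1) * np.pi)) / 2

base_rate, decay = 0.1, 0.5
for restart in range(3):
    for x in (0.0, 0.5, 0.99):  # progress within the current restart period
        print(restart, x, base_rate * decay**restart * cosmod(x))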

View file

@@ -2,6 +2,7 @@ import numpy as np
from .float import *

class Loss:
    def forward(self, p, y):
        raise NotImplementedError("unimplemented", self)

@@ -9,7 +10,8 @@ class Loss:
    def backward(self, p, y):
        raise NotImplementedError("unimplemented", self)

-class NLL(Loss): # Negative Log Likelihood
+class NLL(Loss):  # Negative Log Likelihood
    def forward(self, p, y):
        correct = p * y
        return np.mean(-correct)

@@ -17,6 +19,7 @@ class NLL(Loss): # Negative Log Likelihood
    def backward(self, p, y):
        return -y / len(p)

class CategoricalCrossentropy(Loss):
    # lifted from theano

@@ -33,6 +36,7 @@ class CategoricalCrossentropy(Loss):
        df = (p - y) / (p * (1 - p))
        return df / len(y)

class Accuracy(Loss):
    # returns percentage of categories correctly predicted.
    # utilizes argmax(), so it cannot be used for gradient descent.

@@ -45,6 +49,7 @@ class Accuracy(Loss):
    def backward(self, p, y):
        raise NotImplementedError("cannot take the gradient of Accuracy")

class ResidualLoss(Loss):
    def forward(self, p, y):
        return np.mean(self.f(p - y))

@@ -53,6 +58,7 @@ class ResidualLoss(Loss):
        ret = self.df(p - y) / len(y)
        return ret

class SquaredHalved(ResidualLoss):
    def f(self, r):
        return np.square(r) / 2

@@ -60,6 +66,7 @@ class SquaredHalved(ResidualLoss):
    def df(self, r):
        return r

class Squared(ResidualLoss):
    def f(self, r):
        return np.square(r)

@@ -67,6 +74,7 @@ class Squared(ResidualLoss):
    def df(self, r):
        return 2 * r

class Absolute(ResidualLoss):
    def f(self, r):
        return np.abs(r)

@@ -74,6 +82,7 @@ class Absolute(ResidualLoss):
    def df(self, r):
        return np.sign(r)

class Huber(ResidualLoss):
    def __init__(self, delta=1.0):
        self.delta = _f(delta)

@@ -88,6 +97,7 @@ class Huber(ResidualLoss):
                        r,
                        self.delta * np.sign(r))

# more

class SomethingElse(ResidualLoss):

@@ -105,6 +115,7 @@ class SomethingElse(ResidualLoss):
    def df(self, r):
        return np.sign(r) * np.abs(r)**self.c

class Confidence(Loss):
    # this isn't "confidence" in any meaningful way; (e.g. Bayesian)
    # it's just a metric of how large the value is of the predicted class.

@@ -126,4 +137,3 @@ class Confidence(Loss):
        detc = p / categories / (1 - 1/categories)
        dmax = p == np.max(p, axis=-1, keepdims=True)
        return detc * dmax
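
Aside (not part of the commit): the Huber df() tail visible above is the derivative of the usual piecewise loss, quadratic inside +/- delta and linear outside. A standalone pair for reference:

import numpy as np

def huber_f(r, delta=1.0):
    return np.where(np.abs(r) <= delta,
                    np.square(r) / 2,
                    delta * (np.abs(r) - delta / 2))

def huber_df(r, delta=1.0):
    # matches the np.where(...) tail shown in the hunk above.
    return np.where(np.abs(r) <= delta, r, delta * np.sign(r))

r = np.array([-3.0, -0.5, 0.0, 0.5, 3.0])
print(huber_f(r))
print(huber_df(r))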

View file

@@ -1,14 +1,15 @@
import numpy as np

def rolling(a, window):
    # http://stackoverflow.com/a/4924433
    shape = (a.size - window + 1, window)
    strides = (a.itemsize, a.itemsize)
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)

def rolling_batch(a, window):
    # same as rolling, but acts on each batch (axis 0).
    shape = (a.shape[0], a.shape[-1] - window + 1, window)
    strides = (np.prod(a.shape[1:]) * a.itemsize, a.itemsize, a.itemsize)
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
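
Aside (not part of the commit): what the strided views above produce; each row of rolling() is a sliding window over the input, with no copying.

import numpy as np

def rolling(a, window):
    shape = (a.size - window + 1, window)
    strides = (a.itemsize, a.itemsize)
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)

print(rolling(np.arange(6), 3))
# [[0 1 2]
#  [1 2 3]
#  [2 3 4]
#  [3 4 5]]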

View file

@@ -5,14 +5,16 @@ from .nodal import *
from .layer_base import *
from .utility import *

class Model:
-    def __init__(self, nodes_in, nodes_out, loss=None, mloss=None, unsafe=False):
+    def __init__(self, nodes_in, nodes_out,
+                 loss=None, mloss=None, unsafe=False):
        self.loss = loss if loss is not None else SquaredHalved()
        self.mloss = mloss if mloss is not None else loss
        nodes_in = [nodes_in] if isinstance(nodes_in, Layer) else nodes_in
        nodes_out = [nodes_out] if isinstance(nodes_out, Layer) else nodes_out
        assert type(nodes_in) == list, type(nodes_in)
        assert type(nodes_out) == list, type(nodes_out)
        self.nodes_in = nodes_in
        self.nodes_out = nodes_out

@@ -29,8 +31,9 @@ class Model:
        return self.nodes

    def make_weights(self):
-        self.param_count = sum((node.size for node in self.nodes if not node.shared))
+        self.param_count = sum((node.size for node in self.nodes
+                                if not node.shared))
        self.W = np.zeros(self.param_count, dtype=_f)
        self.dW = np.zeros(self.param_count, dtype=_f)
        offset = 0

@@ -47,37 +50,42 @@ class Model:
            assert size == len(ret[0]), (size, len(ret[0]))
            return ret

+        fmt = "Layer {} allocated {} weights than it said it would"
        node.init(allocate)
-        assert inner_offset <= node.size, "Layer {} allocated more weights than it said it would".format(node)
+        assert inner_offset <= node.size, fmt.format("more", node)
        # i don't care if "less" is grammatically incorrect.
        # you're mom is grammatically incorrect.
-        assert inner_offset >= node.size, "Layer {} allocated less weights than it said it would".format(node)
+        assert inner_offset >= node.size, fmt.format("less", node)
        offset += node.size

    def evaluate(self, input_, deterministic=True):
-        assert len(self.nodes_in) == 1, "ambiguous input in multi-input network; use evaluate_multi() instead"
-        assert len(self.nodes_out) == 1, "ambiguous output in multi-output network; use evaluate_multi() instead"
+        fmt = "ambiguous input in multi-{} network; use {}() instead"
+        assert len(self.nodes_in) == 1, fmt.format("input", "evaluate_multi")
+        assert len(self.nodes_out) == 1, fmt.format("output", "evaluate_multi")
        node_in = self.nodes_in[0]
        node_out = self.nodes_out[0]
        outputs = self.evaluate_multi({node_in: input_}, deterministic)
        return outputs[node_out]

    def apply(self, error):  # TODO: better name?
-        assert len(self.nodes_in) == 1, "ambiguous input in multi-input network; use apply_multi() instead"
-        assert len(self.nodes_out) == 1, "ambiguous output in multi-output network; use apply_multi() instead"
+        fmt = "ambiguous input in multi-{} network; use {}() instead"
+        assert len(self.nodes_in) == 1, fmt.format("input", "apply_multi")
+        assert len(self.nodes_out) == 1, fmt.format("output", "apply_multi")
        node_in = self.nodes_in[0]
        node_out = self.nodes_out[0]
        inputs = self.apply_multi({node_out: error})
        return inputs[node_in]

    def evaluate_multi(self, inputs, deterministic=True):
+        fmt = "missing {} for node {}"
        values = dict()
        outputs = dict()
        for node in self.nodes:
            if node in self.nodes_in:
-                assert node in inputs, "missing input for node {}".format(node.name)
+                assert node in inputs, fmt.format("input", node.name)
                X = inputs[node]
-                values[node] = node._propagate(np.expand_dims(X, 0), deterministic)
+                values[node] = node._propagate(np.expand_dims(X, 0),
+                                               deterministic)
            else:
                values[node] = node.propagate(values, deterministic)
            if node in self.nodes_out:

@@ -85,11 +93,12 @@ class Model:
        return outputs

    def apply_multi(self, outputs):
+        fmt = "missing {} for node {}"
        values = dict()
        inputs = dict()
        for node in reversed(self.nodes):
            if node in self.nodes_out:
-                assert node in outputs, "missing output for node {}".format(node.name)
+                assert node in outputs, fmt.format("output", node.name)
                X = outputs[node]
                values[node] = node._backpropagate(np.expand_dims(X, 0))
            else:

@@ -135,13 +144,17 @@ class Model:
    def load_weights(self, fn):
        # seemingly compatible with keras' Dense layers.
-        import h5py
-        open(fn)  # just ensure the file exists (python's error is better)
-        f = h5py.File(fn, 'r')
        weights = {}
+        import h5py
+        open(fn)  # just ensure the file exists (python's error is better)
+        f = h5py.File(fn, 'r')

        def visitor(name, obj):
            if isinstance(obj, h5py.Dataset):
                weights[name.split('/')[-1]] = np.array(obj[:], dtype=_f)

        f.visititems(visitor)
        f.close()

@@ -194,5 +207,7 @@ class Model:
            children = [str(n) for n in node.children]
            if children:
                sep = '->'
-                print('\t' + str(node) + sep + (';\n\t' + str(node) + sep).join(children) + ';', file=file)
+                print('\t' + str(node) + sep +
+                      (';\n\t' + str(node) + sep).join(children) + ';',
+                      file=file)
        print('}', file=file)
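
Aside (not part of the commit): the idea behind make_weights() above is one flat parameter vector (and one flat gradient vector), with each layer holding views into it at increasing offsets; writing through a view updates the shared buffer the optimizer sees. A hedged sketch of just that mechanism, not the library's actual allocate():

import numpy as np

sizes = [6, 4]                               # e.g. two layers' parameter counts
W = np.zeros(sum(sizes), dtype=np.float32)   # plays the role of model.W
dW = np.zeros(sum(sizes), dtype=np.float32)  # plays the role of model.dW

views, offset = [], 0
for size in sizes:
    views.append((W[offset:offset + size], dW[offset:offset + size]))
    offset += size

views[0][0][:] = 1.0   # a "layer" writing its weights through its view...
print(W)               # ...shows up in the flat vector: [1. 1. 1. 1. 1. 1. 0. 0. 0. 0.]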

View file

@@ -3,7 +3,8 @@ class DummyNode:
    def __init__(self, children=None, parents=None):
        self.children = children if children is not None else []
        self.parents = parents if parents is not None else []

def traverse(node_in, node_out, nodes=None, dummy_mode=False):
    # i have no idea if this is any algorithm in particular.

@@ -27,7 +28,7 @@ def traverse(node_in, node_out, nodes=None, dummy_mode=False):
        if not seen_up[node]:
            continue
        parents_added = (parent in nodes for parent in node.parents)
-        if not node in nodes and all(parents_added):
+        if node not in nodes and all(parents_added):
            nodes.append(node)
        for child in node.children:
            q.append(child)

@@ -37,6 +38,7 @@ def traverse(node_in, node_out, nodes=None, dummy_mode=False):
    return nodes

def traverse_all(nodes_in, nodes_out, nodes=None):
    all_in = DummyNode(children=nodes_in)
    all_out = DummyNode(parents=nodes_out)

View file

@@ -7,9 +7,10 @@ from .utility import *
# some of the the following optimizers are blatantly lifted from tiny-dnn:
# https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/optimizers/optimizer.h

class Momentum(Optimizer):
    def __init__(self, lr=0.01, mu=0.9, nesterov=False):
        self.mu = _f(mu)  # momentum
        self.nesterov = bool(nesterov)
        super().__init__(lr)

@@ -28,6 +29,7 @@ class Momentum(Optimizer):
        return V

class Adagrad(Optimizer):
    def __init__(self, lr=0.01, eps=1e-8):
        self.eps = _f(eps)

@@ -44,6 +46,7 @@ class Adagrad(Optimizer):
        self.g += np.square(dW)
        return -self.lr * dW / (np.sqrt(self.g) + self.eps)

class RMSprop(Optimizer):
    # RMSprop generalizes* Adagrad, etc.

@@ -51,7 +54,7 @@ class RMSprop(Optimizer):
    # RMSprop.mu == 1
    def __init__(self, lr=1e-4, mu=0.99, eps=1e-8):
        self.mu = _f(mu)  # decay term
        self.eps = _f(eps)

        # one might consider the following equation when specifying mu:

@@ -70,12 +73,13 @@ class RMSprop(Optimizer):
        if self.g is None:
            self.g = np.zeros_like(dW)

-        # basically apply a first-order low-pass filter to delta squared
+        # basically apply a first-order low-pass filter to delta squared,
        self.g += (1 - self.mu) * (np.square(dW) - self.g)
-        # finally sqrt it to complete the running root-mean-square approximation
+        # and sqrt it to complete the running root-mean-square approximation.
        return -self.lr * dW / (np.sqrt(self.g) + self.eps)

class RMSpropCentered(Optimizer):
    # referenced TensorFlow/PyTorch.
    # paper: https://arxiv.org/pdf/1308.0850v5.pdf

@@ -115,10 +119,11 @@ class RMSpropCentered(Optimizer):
        self.delta[:] = self.momentum * self.delta + self.lr * temp
        return -self.delta
        # PyTorch does it this way.
-        #self.delta[:] = self.momentum * self.delta + temp
-        #return -self.lr * self.delta
+        # self.delta[:] = self.momentum * self.delta + temp
+        # return -self.lr * self.delta
        # they are equivalent only when LR is constant, which it might not be.

class Adam(Optimizer):
    # paper: https://arxiv.org/abs/1412.6980
    # Adam generalizes* RMSprop, and

@@ -130,10 +135,10 @@ class Adam(Optimizer):
    # Adam.b2 == RMSprop.mu
    def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8):
        self.b1 = _f(b1)  # decay term
        self.b2 = _f(b2)  # decay term
        self.b1_t_default = _f(b1)  # decay term power t
        self.b2_t_default = _f(b2)  # decay term power t
        self.eps = _f(eps)
        super().__init__(lr)

@@ -159,18 +164,20 @@ class Adam(Optimizer):
        self.vt += (1 - self.b2) * (np.square(dW) - self.vt)
        return -self.lr * (self.mt / (1 - self.b1_t)) \
            / (np.sqrt(self.vt / (1 - self.b2_t)) + self.eps)

class Nadam(Optimizer):
    # paper: https://arxiv.org/abs/1412.6980
    # paper: http://cs229.stanford.edu/proj2015/054_report.pdf
    # TODO: double-check this implementation. also read the damn paper.
-    # lifted from https://github.com/fchollet/keras/blob/5d38b04/keras/optimizers.py#L530
-    # lifted from https://github.com/jpilaul/IFT6266_project/blob/master/Models/Algo_Momentum.py
+    # lifted from:
+    # https://github.com/fchollet/keras/blob/5d38b04/keras/optimizers.py#L530
+    # https://github.com/jpilaul/IFT6266_project/blob/master/Models/Algo_Momentum.py
    def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8):
        self.b1 = _f(b1)  # decay term
        self.b2 = _f(b2)  # decay term
        self.eps = _f(eps)
        super().__init__(lr)

@@ -208,6 +215,7 @@ class Nadam(Optimizer):
        return -self.lr * mt_bar / (np.sqrt(vtp) + self.eps)

# more

class FTML(Optimizer):

@@ -216,8 +224,8 @@ class FTML(Optimizer):
    def __init__(self, lr=0.0025, b1=0.6, b2=0.999, eps=1e-8):
        self.iterations = _0
        self.b1 = _f(b1)  # decay term
        self.b2 = _f(b2)  # decay term
        self.eps = _f(eps)
        super().__init__(lr)

@@ -231,10 +239,14 @@ class FTML(Optimizer):
        self.b2_t = _1

    def compute(self, dW, W):
-        if self.dt1 is None: self.dt1 = np.zeros_like(dW)
-        if self.dt is None: self.dt = np.zeros_like(dW)
-        if self.vt is None: self.vt = np.zeros_like(dW)
-        if self.zt is None: self.zt = np.zeros_like(dW)
+        if self.dt1 is None:
+            self.dt1 = np.zeros_like(dW)
+        if self.dt is None:
+            self.dt = np.zeros_like(dW)
+        if self.vt is None:
+            self.vt = np.zeros_like(dW)
+        if self.zt is None:
+            self.zt = np.zeros_like(dW)

        # NOTE: we could probably rewrite these equations to avoid this copy.
        self.dt1[:] = self.dt[:]

@@ -260,6 +272,7 @@ class FTML(Optimizer):
        # subtract by weights to avoid having to override self.update.
        return -self.zt / self.dt - W

class MomentumClip(Optimizer):
    def __init__(self, lr=0.01, mu=0.9, nesterov=False, clip=1.0, debug=False):
        self.mu = _f(mu)

@@ -289,22 +302,25 @@ class MomentumClip(Optimizer):
        else:
            return -self.lr * self.accum

class YellowFin(Optimizer):
    # paper: https://arxiv.org/abs/1706.03471
    # knowyourmeme: http://cs.stanford.edu/~zjian/project/YellowFin/
-    # author's implementation: https://github.com/JianGoForIt/YellowFin/blob/master/tuner_utils/yellowfin.py
-    # code lifted: https://gist.github.com/botev/f8b32c00eafee222e47393f7f0747666
+    # author's implementation:
+    # https://github.com/JianGoForIt/YellowFin/blob/master/tuner_utils/yellowfin.py
+    # code lifted:
+    # https://gist.github.com/botev/f8b32c00eafee222e47393f7f0747666

    def __init__(self, lr=0.1, mu=0.0, beta=0.999, window_size=20,
                 debias=True, clip=1.0):
        self.lr_default = _f(lr)
        self.mu_default = _f(mu)
        self.beta = _f(beta)
        self.window_size = int(window_size)  # curv_win_width
        self.debias_enabled = bool(debias)
        self.clip = _f(clip)
        self.mu = _f(mu)  # momentum
        super().__init__(lr)

    def reset(self):

@@ -316,13 +332,13 @@ class YellowFin(Optimizer):
        self.step = 0
        self.beta_t = self.beta
-        self.curv_win = np.zeros([self.window_size,], dtype=np.float32)
+        self.curv_win = np.zeros([self.window_size, ], dtype=np.float32)
        self.h_min = None
        self.h_max = None
        self.g_lpf = 0
-        #self.g_squared_lpf = 0
+        # self.g_squared_lpf = 0
        self.g_norm_squared_lpf = 0
        self.g_norm_lpf = 0
        self.h_min_lpf = 0

@@ -332,7 +348,8 @@ class YellowFin(Optimizer):
        self.mu_lpf = 0

    def get_lr_mu(self):
-        p = (np.square(self.dist_avg) * np.square(self.h_min)) / (2 * self.g_var)
+        p = (np.square(self.dist_avg) * np.square(self.h_min)) \
+            / (2 * self.g_var)
        w3 = p * (np.sqrt(0.25 + p / 27.0) - 0.5)
        w = np.power(w3, 1/3)
        y = w - p / (3 * w)

@@ -360,11 +377,11 @@ class YellowFin(Optimizer):
        total_norm = np.linalg.norm(dW)
        clip_scale = self.clip / (total_norm + 1e-6)
        if clip_scale < 1:
-            #print("clipping gradients; norm: {:10.5f}".format(total_norm))
+            # print("clipping gradients; norm: {:10.5f}".format(total_norm))
            dW *= clip_scale
-        #fmt = 'W std: {:10.7f}e-3, dWstd: {:10.7f}e-3, V std: {:10.7f}e-3'
-        #print(fmt.format(np.std(W), np.std(dW) * 100, np.std(V) * 100))
+        # fmt = 'W std: {:10.7f}e-3, dWstd: {:10.7f}e-3, V std: {:10.7f}e-3'
+        # print(fmt.format(np.std(W), np.std(dW) * 100, np.std(V) * 100))
        b = self.beta
        m1b = 1 - self.beta

@@ -380,30 +397,31 @@ class YellowFin(Optimizer):
        h_min_t = np.min(valid_window)
        h_max_t = np.max(valid_window)
        self.g_lpf = b * self.g_lpf + m1b * g
-        #self.g_squared_lpf = b * self.g_squared_lpf + m1b * g_squared
-        self.g_norm_squared_lpf = b * self.g_norm_squared_lpf + m1b * g_norm_squared
+        # self.g_squared_lpf = b * self.g_squared_lpf + m1b * g_squared
+        self.g_norm_squared_lpf = b * self.g_norm_squared_lpf \
+            + m1b * g_norm_squared
        self.g_norm_lpf = b * self.g_norm_lpf + m1b * g_norm
        self.h_min_lpf = b * self.h_min_lpf + m1b * h_min_t
        self.h_max_lpf = b * self.h_max_lpf + m1b * h_max_t
        g_avg = debias * self.g_lpf
-        #g_squared_avg = debias * self.g_squared_lpf
+        # g_squared_avg = debias * self.g_squared_lpf
        g_norm_squared_avg = debias * self.g_norm_squared_lpf
        g_norm_avg = debias * self.g_norm_lpf
        self.h_min = debias * self.h_min_lpf
        self.h_max = debias * self.h_max_lpf
        assert self.h_max >= self.h_min
        dist = g_norm_avg / g_norm_squared_avg
        self.dist_lpf = b * self.dist_lpf + m1b * dist
        self.dist_avg = debias * self.dist_lpf
        self.g_var = g_norm_squared_avg - np.sum(np.square(g_avg))
        # equivalently:
-        #self.g_var = np.sum(np.abs(g_squared_avg - np.square(g_avg)))
+        # self.g_var = np.sum(np.abs(g_squared_avg - np.square(g_avg)))
        if self.step > 0:
            lr_for_real, mu_for_real = self.get_lr_mu()

@@ -419,6 +437,7 @@ class YellowFin(Optimizer):
        self.beta_t *= self.beta
        return V

class AddSign(Optimizer):
    # paper: https://arxiv.org/abs/1709.07417

@@ -438,10 +457,11 @@ class AddSign(Optimizer):
        self.accum[:] = self.accum * self.mu + dW
        signed = np.sign(dW) * np.sign(self.accum)
-        #signed *= decay
+        # signed *= decay
        return -self.lr * dW * (self.alpha + signed)

class PowerSign(Optimizer):
    # paper: https://arxiv.org/abs/1709.07417

@@ -462,13 +482,14 @@ class PowerSign(Optimizer):
        self.accum[:] = self.accum * self.mu + dW
        signed = np.sign(dW) * np.sign(self.accum)
-        #signed *= decay
+        # signed *= decay
        if self.use_exp:
            return -self.lr * dW * np.exp(signed)
        else:
            return -self.lr * dW * np.power(self.alpha, signed)

class Neumann(Optimizer):
    # paper: https://arxiv.org/abs/1712.03298
    # NOTE: this implementation is missing resetting as described in the paper.

@@ -478,20 +499,20 @@ class Neumann(Optimizer):
    # it seems like using a Learner like SineCLR makes this unnecessary.
    def __init__(self, lr=0.01):
        self.alpha = _f(1e-7)  # cubic.
        self.beta = _f(1e-5)  # repulsive. NOTE: multiplied by len(dW) later.
        self.gamma = _f(0.99)  # EMA, or 1-pole low-pass parameter. same thing.
        # momentum is ∝ (in the shape of) 1 - 1/(1 + t)
        self.mu_min = _f(0.5)
        self.mu_max = _f(0.9)
        self.reset_period = 0  # TODO
        super().__init__(lr)

    def reset(self):
        # NOTE: mt and vt are different than the pair in Adam-like optimizers.
        self.mt = None  # momentum accumulator.
        self.vt = None  # weight accumulator.
        self.t = 0

    def compute(self, dW, W):

@@ -510,7 +531,7 @@ class Neumann(Optimizer):
            return

        # momentum quantity:
        mu = _1 - _1/_f(self.t)  # the + 1 is implicit.
        mu = (mu + self.mu_min) * (self.mu_max - self.mu_min)

        # smoothed change in weights:

@@ -529,4 +550,3 @@ class Neumann(Optimizer):
        # weights and accumulator:
        W += mu * self.mt - self.lr * dt
        self.vt = W + self.gamma * (self.vt - W)
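
Aside (not part of the commit): the Adam hunk above keeps mt and vt as first-order low-pass filters of the gradient and its square, then divides by (1 - b**t) to debias them. The same step, written as a standalone function:

import numpy as np

def adam_step(dW, state, lr=0.002, b1=0.9, b2=0.999, eps=1e-8):
    if not state:
        state.update(mt=np.zeros_like(dW), vt=np.zeros_like(dW),
                     b1_t=1.0, b2_t=1.0)
    state['b1_t'] *= b1
    state['b2_t'] *= b2
    state['mt'] += (1 - b1) * (dW - state['mt'])              # low-pass of dW
    state['vt'] += (1 - b2) * (np.square(dW) - state['vt'])   # low-pass of dW**2
    mhat = state['mt'] / (1 - state['b1_t'])                  # bias correction
    vhat = state['vt'] / (1 - state['b2_t'])
    return -lr * mhat / (np.sqrt(vhat) + eps)

state = {}
W = np.zeros(3)
for _ in range(5):
    W += adam_step(np.array([1.0, -2.0, 0.5]), state)
print(W)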

View file

@@ -2,9 +2,10 @@ import numpy as np
from .float import *

class Optimizer:
    def __init__(self, lr=0.1):
        self.lr = _f(lr)  # learning rate
        self.reset()

    def reset(self):

@@ -15,5 +16,3 @@ class Optimizer:
    def update(self, dW, W):
        W += self.compute(dW, W)

View file

@@ -4,6 +4,7 @@ from .float import *
from .layer_base import *
from .initialization import *

class Bias(Layer):
    # TODO: support axes other than -1 and shapes other than 1D.

@@ -28,6 +29,7 @@ class Bias(Layer):
        self.biases.g += dY.sum(0)
        return dY

class Dense(Layer):
    serialized = {
        'W': 'coeffs',

@@ -38,8 +40,10 @@ class Dense(Layer):
        super().__init__()
        self.dim = int(dim)
        self.output_shape = (dim,)
-        self.coeffs = self._new_weights('coeffs', init=init, regularizer=reg_w)
-        self.biases = self._new_weights('biases', init=init_zeros, regularizer=reg_b)
+        self.coeffs = self._new_weights('coeffs', init=init,
+                                        regularizer=reg_w)
+        self.biases = self._new_weights('biases', init=init_zeros,
+                                        regularizer=reg_b)

    def make_shape(self, parent):
        shape = parent.output_shape

@@ -101,18 +105,20 @@ class Conv1Dper(Layer):
    def forward(self, X):
        if self.wrap0 == 0:
-            Xper = np.hstack((X,X[:,:self.wrap1]))
+            Xper = np.hstack((X, X[:, :self.wrap1]))
        elif self.wrap1 == 0:
-            Xper = np.hstack((X[:,-self.wrap0:],X))
+            Xper = np.hstack((X[:, -self.wrap0:], X))
        else:
-            Xper = np.hstack((X[:,-self.wrap0:],X,X[:,:self.wrap1]))
+            Xper = np.hstack((X[:, -self.wrap0:], X, X[:, :self.wrap1]))
        self.cols = rolling_batch(Xper, self.kernel_size)
-        convolved = (self.cols * self.coeffs.f[:,::-1]).sum(2)
+        convolved = (self.cols * self.coeffs.f[:, ::-1]).sum(2)
        return convolved

    def backward(self, dY):
-        self.coeffs.g += (dY[:,:,None] * self.cols).sum(0)[:,::-1].sum(0, keepdims=True)
-        return (dY[:,:,None] * self.coeffs.f[:,::-1]).sum(2)
+        self.coeffs.g += (dY[:, :, None] * self.cols).sum(0)[:, ::-1].sum(
+            0, keepdims=True)
+        return (dY[:, :, None] * self.coeffs.f[:, ::-1]).sum(2)

class LayerNorm(Layer):
    # paper: https://arxiv.org/abs/1607.06450

@@ -168,7 +174,8 @@ class LayerNorm(Layer):
        return dX

-class Denses(Layer): # TODO: rename?
+class Denses(Layer):  # TODO: rename?
    # acts as a separate Dense for each row or column. only for 2D arrays.
    serialized = {

@@ -176,13 +183,16 @@ class Denses(Layer): # TODO: rename?
        'b': 'biases',
    }

-    def __init__(self, dim, init=init_he_uniform, reg_w=None, reg_b=None, axis=-1):
+    def __init__(self, dim, init=init_he_uniform,
+                 reg_w=None, reg_b=None, axis=-1):
        super().__init__()
        self.dim = int(dim)
        self.weight_init = init
        self.axis = int(axis)
-        self.coeffs = self._new_weights('coeffs', init=init, regularizer=reg_w)
-        self.biases = self._new_weights('biases', init=init_zeros, regularizer=reg_b)
+        self.coeffs = self._new_weights('coeffs', init=init,
+                                        regularizer=reg_w)
+        self.biases = self._new_weights('biases', init=init_zeros,
+                                        regularizer=reg_b)

    def make_shape(self, parent):
        shape = parent.output_shape

@@ -220,9 +230,11 @@ class Denses(Layer): # TODO: rename?
        self.coeffs.g += np.einsum('ijx,ijk->jxk', self.X, dY)
        return np.einsum('ijk,jxk->ijx', dY, self.coeffs.f)

class CosineDense(Dense):
    # paper: https://arxiv.org/abs/1702.05870
-    # another implementation: https://github.com/farizrahman4u/keras-contrib/pull/36
+    # another implementation:
+    # https://github.com/farizrahman4u/keras-contrib/pull/36
    # the paper doesn't mention bias,
    # so we treat bias as an additional weight with a constant input of 1.
    # this is correct in Dense layers, so i hope it's correct here too.

@@ -231,24 +243,25 @@ class CosineDense(Dense):
    def forward(self, X):
        self.X = X
-        self.X_norm = np.sqrt(np.square(X).sum(-1, keepdims=True) \
+        self.X_norm = np.sqrt(np.square(X).sum(-1, keepdims=True)
                              + 1 + self.eps)
-        self.W_norm = np.sqrt(np.square(self.coeffs.f).sum(0, keepdims=True) \
+        self.W_norm = np.sqrt(np.square(self.coeffs.f).sum(0, keepdims=True)
                              + np.square(self.biases.f) + self.eps)
        self.dot = X @ self.coeffs.f + self.biases.f
        Y = self.dot / (self.X_norm * self.W_norm)
        return Y

    def backward(self, dY):
        ddot = dY / self.X_norm / self.W_norm
-        dX_norm = -(dY * self.dot / self.W_norm).sum(-1, keepdims=True) / self.X_norm**2
-        dW_norm = -(dY * self.dot / self.X_norm).sum( 0, keepdims=True) / self.W_norm**2
+        dX_norm = -(dY * self.dot / self.W_norm).sum(-1, keepdims=True) \
+            / self.X_norm**2
+        dW_norm = -(dY * self.dot / self.X_norm).sum(0, keepdims=True) \
+            / self.W_norm**2
        self.coeffs.g += self.X.T @ ddot \
            + dW_norm / self.W_norm * self.coeffs.f
        self.biases.g += ddot.sum(0, keepdims=True) \
            + dW_norm / self.W_norm * self.biases.f
        dX = ddot @ self.coeffs.f.T + dX_norm / self.X_norm * self.X
        return dX

View file

@@ -2,9 +2,11 @@ import numpy as np
from .float import *

class Regularizer:
    pass

class L1L2(Regularizer):
    def __init__(self, l1=0.0, l2=0.0):
        self.l1 = _f(l1)

@@ -26,6 +28,7 @@ class L1L2(Regularizer):
        df += self.l2 * 2 * X
        return df

# more

class SaturateRelu(Regularizer):
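
Aside (not part of the commit): the L1L2 backward() tail above (df += self.l2 * 2 * X) is the derivative of the usual elastic-net penalty. A standalone pair for reference:

import numpy as np

def l1l2_forward(X, l1=0.0, l2=0.0):
    return l1 * np.sum(np.abs(X)) + l2 * np.sum(np.square(X))

def l1l2_backward(X, l1=0.0, l2=0.0):
    return l1 * np.sign(X) + l2 * 2 * X

X = np.array([-1.5, 0.0, 2.0])
print(l1l2_forward(X, l1=0.01, l2=0.001))
print(l1l2_backward(X, l1=0.01, l2=0.001))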

View file

@@ -4,6 +4,7 @@ from .float import *
from .initialization import *
from .ritual_base import *

def stochastic_multiply(W, gamma=0.5, allow_negation=False):
    # paper: https://arxiv.org/abs/1606.01981

@@ -23,6 +24,7 @@ def stochastic_multiply(W, gamma=0.5, allow_negation=False):
        mult *= np.where(samples < prob, 1, -1)
    np.multiply(W, mult, out=W)

class StochMRitual(Ritual):
    # paper: https://arxiv.org/abs/1606.01981
    # this probably doesn't make sense for regression problems,

@@ -38,8 +40,8 @@ class StochMRitual(Ritual):
    def learn(self, inputs, outputs):
        # an experiment:
-        #assert self.learner.rate < 10, self.learner.rate
-        #self.gamma = 1 - 1/2**(1 - np.log10(self.learner.rate))
+        # assert self.learner.rate < 10, self.learner.rate
+        # self.gamma = 1 - 1/2**(1 - np.log10(self.learner.rate))

        self.W[:] = self.model.W
        for layer in self.model.ordered_nodes:

@@ -57,6 +59,7 @@ class StochMRitual(Ritual):
            np.clip(layer.W, -layer.std * f, layer.std * f, out=layer.W)
        # np.clip(layer.W, -1, 1, out=layer.W)

class NoisyRitual(Ritual):
    def __init__(self, learner=None,
                 input_noise=0, output_noise=0, gradient_noise=0):

@@ -69,7 +72,7 @@ class NoisyRitual(Ritual):
        # this is pretty crude
        if self.input_noise > 0:
            s = self.input_noise
            inputs = inputs + np.random.normal(0, s, size=inputs.shape)
        if self.output_noise > 0:
            s = self.output_noise
            outputs = outputs + np.random.normal(0, s, size=outputs.shape)

@@ -80,11 +83,10 @@ class NoisyRitual(Ritual):
        if self.gradient_noise > 0:
            size = len(self.model.dW)
            gamma = 0.55
-            #s = self.gradient_noise / (1 + self.bn) ** gamma
+            # s = self.gradient_noise / (1 + self.bn) ** gamma

            # experiments:
            s = self.gradient_noise * np.sqrt(self.learner.rate)
-            #s = np.square(self.learner.rate)
-            #s = self.learner.rate / self.en
+            # s = np.square(self.learner.rate)
+            # s = self.learner.rate / self.en

            self.model.dW += np.random.normal(0, max(s, 1e-8), size=size)
        super().update()

View file

@@ -3,7 +3,8 @@ import numpy as np
from .float import *

-class Ritual: # i'm just making up names at this point.
+class Ritual:  # i'm just making up names at this point.
    def __init__(self, learner=None):
        self.learner = learner if learner is not None else Learner(Optimizer())
        self.model = None

@@ -77,7 +78,8 @@ class Ritual: # i'm just making up names at this point.
        if shuffle:
            if gen:
-                raise Exception("shuffling is incompatibile with using a generator.")
+                raise Exception(
+                    "shuffling is incompatibile with using a generator.")
            indices = np.arange(inputs.shape[0])
            np.random.shuffle(indices)
            inputs = inputs[indices]

@@ -90,7 +92,7 @@ class Ritual: # i'm just making up names at this point.
            batch_count = inputs.shape[0] // batch_size
            # TODO: lift this restriction
            assert inputs.shape[0] % batch_size == 0, \
                "inputs is not evenly divisible by batch_size"

        prev_batch_size = None
        for b in range(batch_count):

@@ -101,17 +103,20 @@ class Ritual: # i'm just making up names at this point.
                batch_inputs, batch_outputs = next(generator)
                batch_size = batch_inputs.shape[0]
                # TODO: lift this restriction
-                assert batch_size == prev_batch_size or prev_batch_size is None, \
-                    "non-constant batch size (got {}, expected {})".format(batch_size, prev_batch_size)
+                fmt = "non-constant batch size (got {}, expected {})"
+                assert (batch_size == prev_batch_size
+                        or prev_batch_size is None), \
+                    fmt.format(batch_size, prev_batch_size)
            else:
                bi = b * batch_size
-                batch_inputs = inputs[ bi:bi+batch_size]
+                batch_inputs = inputs[bi:bi+batch_size]
                batch_outputs = outputs[bi:bi+batch_size]

            if clear_grad:
                self.model.clear_grad()
            self._train_batch(batch_inputs, batch_outputs, b, batch_count,
-                              test_only, return_losses=='both', return_losses)
+                              test_only, return_losses == 'both',
+                              return_losses)

            prev_batch_size = batch_size

View file

@@ -1,17 +1,23 @@
import sys

def lament(*args, **kwargs):
    print(*args, file=sys.stderr, **kwargs)

def lower_priority():
    """Set the priority of the process to below-normal."""
    # via https://stackoverflow.com/a/1023269
    if sys.platform == 'win32':
        try:
-            import win32api, win32process, win32con
+            import win32api
+            import win32process
+            import win32con
            pid = win32api.GetCurrentProcessId()
-            handle = win32api.OpenProcess(win32con.PROCESS_ALL_ACCESS, True, pid)
-            win32process.SetPriorityClass(handle, win32process.BELOW_NORMAL_PRIORITY_CLASS)
+            handle = win32api.OpenProcess(
+                win32con.PROCESS_ALL_ACCESS, True, pid)
+            win32process.SetPriorityClass(
+                handle, win32process.BELOW_NORMAL_PRIORITY_CLASS)
        except ImportError:
            lament("you do not have pywin32 installed.")
            lament("the process priority could not be lowered.")

@@ -21,9 +27,12 @@ def lower_priority():
        import os
        os.nice(1)

# more

_log_was_update = False

def log(left, right, update=False):
    s = "\x1B[1m {:>20}:\x1B[0m {}".format(left, right)
    global _log_was_update

@@ -33,5 +42,6 @@ def log(left, right, update=False):
        lament(s)
    _log_was_update = update

class Dummy:
    pass

View file

@@ -1,11 +1,12 @@
import numpy as np

class Weights:
    # we may or may not contain weights -- or any information, for that matter.
    def __init__(self, **kwargs):
        self.f = None  # forward weights
        self.g = None  # backward weights (gradients)
        self.shape = None
        self.init = None
        self.allocator = None

@@ -16,7 +17,7 @@ class Weights:
    def configure(self, **kwargs):
        for k, v in kwargs.items():
            getattr(self, k)  # ensures the key already exists
            setattr(self, k, v)

    @property