basic PEP 8 compliance
rip readability
This commit is contained in:
parent c81ce0afbb
commit 169303813d
19 changed files with 282 additions and 150 deletions
@@ -1,5 +1,5 @@
# external packages required for full functionality:
# numpy scipy h5py sklearn dotmap
# numpy scipy h5py sklearn
# BIG TODO: ensure numpy isn't upcasting to float64 *anywhere*.
# this is gonna take some work.

@@ -6,6 +6,7 @@ from scipy.special import expit as sigmoid
from .float import *
from .layer_base import *
class Identity(Layer):
def forward(self, X):
return X

@@ -13,7 +14,8 @@ class Identity(Layer):
def backward(self, dY):
return dY
class Sigmoid(Layer): # aka Logistic, Expit (inverse of Logit)
class Sigmoid(Layer): # aka Logistic, Expit (inverse of Logit)
def forward(self, X):
self.sig = sigmoid(X)
return self.sig

@@ -21,6 +23,7 @@ class Sigmoid(Layer): # aka Logistic, Expit (inverse of Logit)
def backward(self, dY):
return dY * self.sig * (1 - self.sig)
class Softplus(Layer):
# integral of Sigmoid.

@@ -31,6 +34,7 @@ class Softplus(Layer):
def backward(self, dY):
return dY * sigmoid(self.X)
class Tanh(Layer):
def forward(self, X):
self.sig = np.tanh(X)

@@ -39,6 +43,7 @@ class Tanh(Layer):
def backward(self, dY):
return dY * (1 - self.sig * self.sig)
class LeCunTanh(Layer):
# paper: http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf
# paper: http://yann.lecun.com/exdb/publis/pdf/lecun-89.pdf

@@ -53,6 +58,7 @@ class LeCunTanh(Layer):
def backward(self, dY):
return dY * (2 / 3 * 1.7159) * (1 - self.sig * self.sig)
class Relu(Layer):
def forward(self, X):
self.cond = X >= 0

@@ -61,12 +67,13 @@ class Relu(Layer):
def backward(self, dY):
return np.where(self.cond, dY, 0)
class Elu(Layer):
# paper: https://arxiv.org/abs/1511.07289
def __init__(self, alpha=1):
super().__init__()
self.alpha = _f(alpha) # FIXME: unused
self.alpha = _f(alpha) # FIXME: unused
def forward(self, X):
self.cond = X >= 0

@@ -76,6 +83,7 @@ class Elu(Layer):
def backward(self, dY):
return dY * np.where(self.cond, 1, self.neg + 1)
class GeluApprox(Layer):
# paper: https://arxiv.org/abs/1606.08415
# plot: https://www.desmos.com/calculator/ydzgtccsld

@@ -88,6 +96,7 @@ class GeluApprox(Layer):
def backward(self, dY):
return dY * self.sig * (1 + self.a * (1 - self.sig))
class Softmax(Layer):
def forward(self, X):
alpha = np.max(X, axis=-1, keepdims=True)

@@ -99,6 +108,7 @@ class Softmax(Layer):
def backward(self, dY):
return (dY - np.sum(dY * self.sm, axis=-1, keepdims=True)) * self.sm
class LogSoftmax(Softmax):
def __init__(self, eps=1e-6):
super().__init__()

@@ -110,6 +120,7 @@ class LogSoftmax(Softmax):
def backward(self, dY):
return dY - np.sum(dY, axis=-1, keepdims=True) * self.sm
class Cos(Layer):
# performs well on MNIST for some strange reason.

@@ -120,6 +131,7 @@ class Cos(Layer):
def backward(self, dY):
return dY * -np.sin(self.X)
class Selu(Layer):
# paper: https://arxiv.org/abs/1706.02515

@@ -136,6 +148,7 @@ class Selu(Layer):
def backward(self, dY):
return dY * self.lamb * np.where(self.cond, 1, self.neg)
# more
class TanhTest(Layer):

@@ -146,6 +159,7 @@ class TanhTest(Layer):
def backward(self, dY):
return dY * (1 / 2 * 2.4004) * (1 - self.sig * self.sig)
class ExpGB(Layer):
# an output layer for one-hot classification problems.
# use with MSE (SquaredHalved), not CategoricalCrossentropy!

@@ -163,6 +177,7 @@ class ExpGB(Layer):
# this gradient is intentionally incorrect.
return dY
class CubicGB(Layer):
# an output layer for one-hot classification problems.
# use with MSE (SquaredHalved), not CategoricalCrossentropy!

@@ -182,4 +197,3 @@ class CubicGB(Layer):
def backward(self, dY):
# this gradient is intentionally incorrect.
return dY
@@ -2,11 +2,13 @@ import numpy as np
_f = np.float32
def _check(a):
assert isinstance(a, np.ndarray) or type(a) == _f, type(a)
assert a.dtype == _f, a.dtype
return a
_0 = _f(0)
_1 = _f(1)
_2 = _f(2)
@@ -2,28 +2,35 @@ import numpy as np
# note: these are currently only implemented for 2D shapes.
def init_zeros(size, ins=None, outs=None):
return np.zeros(size)
def init_ones(size, ins=None, outs=None):
return np.ones(size)
def init_he_normal(size, ins, outs):
s = np.sqrt(2 / ins)
return np.random.normal(0, s, size=size)
def init_he_uniform(size, ins, outs):
s = np.sqrt(6 / ins)
return np.random.uniform(-s, s, size=size)
def init_glorot_normal(size, ins, outs):
s = np.sqrt(2 / (ins + outs))
return np.random.normal(0, s, size=size)
def init_glorot_uniform(size, ins, outs):
s = np.sqrt(6 / (ins + outs))
return np.random.uniform(-s, s, size=size)
# more
def init_gaussian_unit(size, ins, outs):
18 onn/layer.py

@@ -2,6 +2,7 @@ from .layer_base import *
from .initialization import *
from .float import *
# Nonparametric Layers {{{1
class Input(Layer):

@@ -16,9 +17,10 @@ class Input(Layer):
return X
def backward(self, dY):
#self.dY = dY
# self.dY = dY
return np.zeros_like(dY)
class Reshape(Layer):
def __init__(self, new_shape):
super().__init__()

@@ -33,6 +35,7 @@ class Reshape(Layer):
assert dY.shape[0] == self.batch_size
return dY.reshape(self.batch_size, *self.input_shape)
class Flatten(Layer):
def make_shape(self, parent):
shape = parent.output_shape

@@ -47,6 +50,7 @@ class Flatten(Layer):
assert dY.shape[0] == self.batch_size
return dY.reshape(self.batch_size, *self.input_shape)
class ConstAffine(Layer):
def __init__(self, a=1, b=0):
super().__init__()

@@ -59,13 +63,15 @@ class ConstAffine(Layer):
def backward(self, dY):
return dY * self.a
class Sum(Layer):
def _propagate(self, edges, deterministic):
return np.sum(edges, axis=0)
def _backpropagate(self, edges):
#assert len(edges) == 1, "unimplemented"
return edges[0] # TODO: does this always work?
# assert len(edges) == 1, "unimplemented"
return edges[0] # TODO: does this always work?
class ActivityRegularizer(Layer):
def __init__(self, reg):

@@ -81,6 +87,7 @@ class ActivityRegularizer(Layer):
def backward(self, dY):
return dY + self.reg.backward(self.X)
class Dropout(Layer):
def __init__(self, dropout=0.0):
super().__init__()

@@ -92,12 +99,13 @@ class Dropout(Layer):
return X * self.mask
def forward_deterministic(self, X):
#self.mask = _1
# self.mask = _1
return X
def backward(self, dY):
return dY * self.mask
# more
class AlphaDropout(Layer):

@@ -136,6 +144,7 @@ class AlphaDropout(Layer):
def backward(self, dY):
return dY * self.a * self.mask
class Decimate(Layer):
# simple decimaton layer that drops every other sample from the last axis.

@@ -168,6 +177,7 @@ class Decimate(Layer):
dX.ravel()[1::2] = dY.ravel()
return dX
class Undecimate(Layer):
# inverse operation of Decimate. not quite interpolation.
@@ -4,26 +4,29 @@ from collections import defaultdict, OrderedDict
from .weight import *
# used for numbering layers like Keras:
_layer_counters = defaultdict(lambda: 0)
class LayerIncompatibility(Exception):
pass
class Layer:
def __init__(self):
self.parents = []
self.children = []
self.weights = OrderedDict()
self.loss = None # for activity regularizers
self.loss = None # for activity regularizers
self.input_shape = None
self.output_shape = None
kind = self.__class__.__name__
global _layer_counters
_layer_counters[kind] += 1
self.name = "{}_{}".format(kind, _layer_counters[kind])
self.unsafe = False # disables assertions for better performance
self.shared = False # as in weight sharing
self.unsafe = False # disables assertions for better performance
self.shared = False # as in weight sharing
def __str__(self):
return self.name

@@ -40,9 +43,9 @@ class Layer:
raise NotImplementedError("unimplemented", self)
def make_shape(self, parent):
if self.input_shape == None:
if self.input_shape is None:
self.input_shape = parent.output_shape
if self.output_shape == None:
if self.output_shape is None:
self.output_shape = self.input_shape
def do_feed(self, child):
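The `== None` to `is None` changes in this hunk follow PEP 8 (E711): `None` is a singleton, so an identity test is the idiomatic check, and with numpy values in play it also avoids an accidental elementwise comparison. A standalone sketch (the array here is a hypothetical stand-in, not the actual shape type used above):

import numpy as np
shape = np.array([3, 4])
print(shape == None)   # elementwise: [False False], ambiguous inside an `if`
print(shape is None)   # plain identity test: False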
@@ -75,16 +78,19 @@ class Layer:
child.make_shape(self)
if not child.is_compatible(self):
fmt = "{} is incompatible with {}: shape mismatch: {} vs. {}"
raise LayerIncompatibility(fmt.format(self, child, self.output_shape, child.input_shape))
raise LayerIncompatibility(fmt.format(
self, child, self.output_shape, child.input_shape))
self.do_feed(child)
child.be_fed(self)
return child
def validate_input(self, X):
assert X.shape[1:] == self.input_shape, (str(self), X.shape[1:], self.input_shape)
assert X.shape[1:] == self.input_shape, \
(str(self), X.shape[1:], self.input_shape)
def validate_output(self, Y):
assert Y.shape[1:] == self.output_shape, (str(self), Y.shape[1:], self.output_shape)
assert Y.shape[1:] == self.output_shape, \
(str(self), Y.shape[1:], self.output_shape)
def _new_weights(self, name, **kwargs):
w = Weights(**kwargs)

@@ -93,9 +99,10 @@ class Layer:
return w
def share(self, node):
self.weights = node.weights # TODO: this should be all it takes.
self.weights = node.weights # TODO: this should be all it takes.
for k, v in self.weights.items():
vs = getattr(node, k) # hack: key isn't necessarily attribute name!
# hack: key isn't necessarily attribute name!
vs = getattr(node, k)
setattr(self, k, vs)
self.shared = True
@@ -1,13 +1,14 @@
from .float import *
from .optimizer_base import *
class Learner:
per_batch = False
def __init__(self, optim, epochs=100, rate=None):
assert isinstance(optim, Optimizer)
self.optim = optim
self.start_rate = rate # None is okay; it'll use optim.lr instead.
self.start_rate = rate # None is okay; it'll use optim.lr instead.
self.epochs = int(epochs)
self.reset()

@@ -49,7 +50,7 @@ class Learner:
return False
return True
def batch(self, progress): # TODO: rename
def batch(self, progress): # TODO: rename
# interpolates rates between epochs.
# unlike epochs, we do not store batch number as a state.
# i.e. calling next() will not respect progress.

@@ -60,6 +61,7 @@ class Learner:
def final_rate(self):
return self.rate_at(self.epochs - 1e-8)
class AnnealingLearner(Learner):
def __init__(self, optim, epochs=100, rate=None, halve_every=10):
self.halve_every = _f(halve_every)

@@ -69,10 +71,12 @@ class AnnealingLearner(Learner):
def rate_at(self, epoch):
return super().rate_at(epoch) * self.anneal**epoch
def cosmod(x):
# plot: https://www.desmos.com/calculator/hlgqmyswy2
return (_1 + np.cos((x % _1) * _pi)) * _inv2
class SGDR(Learner):
# Stochastic Gradient Descent with Restarts
# paper: https://arxiv.org/abs/1608.03983

@@ -112,7 +116,8 @@ class SGDR(Learner):
raise Exception('this should never happen.')
def rate_at(self, epoch):
base_rate = self.start_rate if self.start_rate is not None else self.optim.lr
sr = self.start_rate
base_rate = sr if sr is not None else self.optim.lr
restart, sub_epoch, next_restart = self.split_num(max(1, epoch))
x = _f(sub_epoch - 1) / _f(next_restart)
return base_rate * self.decay**_f(restart) * cosmod(x)
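For reference, the cosine modulation that SGDR's rate_at() uses above decays smoothly from 1 to 0 within each restart period. A standalone sketch with plain numpy, dropping the float32 constants (_1, _pi, _inv2) used in the module:

import numpy as np
def cosmod(x):
    # same shape as the cosmod above: 1 at the start of a period, 0 at the end
    return (1 + np.cos((x % 1) * np.pi)) / 2
print([round(float(cosmod(x)), 3) for x in (0.0, 0.25, 0.5, 0.75, 0.999)])
# -> [1.0, 0.854, 0.5, 0.146, 0.0]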
@@ -126,6 +131,7 @@ class SGDR(Learner):
self.restart_callback(restart)
return True
class TriangularCLR(Learner):
per_batch = True

@@ -141,11 +147,14 @@ class TriangularCLR(Learner):
def _t(self, epoch):
# NOTE: this could probably be simplified
offset = self.frequency / 2
return np.abs(((epoch - 1 + offset) % self.frequency) - offset) / offset
return np.abs(((epoch - 1 + offset) % self.frequency) - offset) \
/ offset
def rate_at(self, epoch):
upper_rate = self.start_rate if self.start_rate is not None else self.optim.lr
return self._t(epoch) * (upper_rate - self.lower_rate) + self.lower_rate
sr = self.start_rate
lr = self.lower_rate
upper_rate = sr if sr is not None else self.optim.lr
return self._t(epoch) * (upper_rate - lr) + lr
def next(self):
if not super().next():

@@ -156,14 +165,17 @@ class TriangularCLR(Learner):
self.callback(self.epoch // self.frequency)
return True
class SineCLR(TriangularCLR):
def _t(self, epoch):
return np.sin(_pi * _inv2 * super()._t(epoch))
class WaveCLR(TriangularCLR):
def _t(self, epoch):
return _inv2 * (_1 - np.cos(_pi * super()._t(epoch)))
# more
class PolyLearner(Learner):

@@ -177,4 +189,3 @@ class PolyLearner(Learner):
progress = (epoch - 1) / (self.epochs)
ret = np.polyval(self.coeffs, progress)
return np.abs(ret)
14 onn/loss.py

@@ -2,6 +2,7 @@ import numpy as np
from .float import *
class Loss:
def forward(self, p, y):
raise NotImplementedError("unimplemented", self)

@@ -9,7 +10,8 @@ class Loss:
def backward(self, p, y):
raise NotImplementedError("unimplemented", self)
class NLL(Loss): # Negative Log Likelihood
class NLL(Loss): # Negative Log Likelihood
def forward(self, p, y):
correct = p * y
return np.mean(-correct)

@@ -17,6 +19,7 @@ class NLL(Loss): # Negative Log Likelihood
def backward(self, p, y):
return -y / len(p)
class CategoricalCrossentropy(Loss):
# lifted from theano

@@ -33,6 +36,7 @@ class CategoricalCrossentropy(Loss):
df = (p - y) / (p * (1 - p))
return df / len(y)
class Accuracy(Loss):
# returns percentage of categories correctly predicted.
# utilizes argmax(), so it cannot be used for gradient descent.

@@ -45,6 +49,7 @@ class Accuracy(Loss):
def backward(self, p, y):
raise NotImplementedError("cannot take the gradient of Accuracy")
class ResidualLoss(Loss):
def forward(self, p, y):
return np.mean(self.f(p - y))

@@ -53,6 +58,7 @@ class ResidualLoss(Loss):
ret = self.df(p - y) / len(y)
return ret
class SquaredHalved(ResidualLoss):
def f(self, r):
return np.square(r) / 2

@@ -60,6 +66,7 @@ class SquaredHalved(ResidualLoss):
def df(self, r):
return r
class Squared(ResidualLoss):
def f(self, r):
return np.square(r)

@@ -67,6 +74,7 @@ class Squared(ResidualLoss):
def df(self, r):
return 2 * r
class Absolute(ResidualLoss):
def f(self, r):
return np.abs(r)

@@ -74,6 +82,7 @@ class Absolute(ResidualLoss):
def df(self, r):
return np.sign(r)
class Huber(ResidualLoss):
def __init__(self, delta=1.0):
self.delta = _f(delta)

@@ -88,6 +97,7 @@ class Huber(ResidualLoss):
r,
self.delta * np.sign(r))
# more
class SomethingElse(ResidualLoss):

@@ -105,6 +115,7 @@ class SomethingElse(ResidualLoss):
def df(self, r):
return np.sign(r) * np.abs(r)**self.c
class Confidence(Loss):
# this isn't "confidence" in any meaningful way; (e.g. Bayesian)
# it's just a metric of how large the value is of the predicted class.

@@ -126,4 +137,3 @@ class Confidence(Loss):
detc = p / categories / (1 - 1/categories)
dmax = p == np.max(p, axis=-1, keepdims=True)
return detc * dmax
@@ -1,14 +1,15 @@
import numpy as np
def rolling(a, window):
# http://stackoverflow.com/a/4924433
shape = (a.size - window + 1, window)
strides = (a.itemsize, a.itemsize)
return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
def rolling_batch(a, window):
# same as rolling, but acts on each batch (axis 0).
shape = (a.shape[0], a.shape[-1] - window + 1, window)
strides = (np.prod(a.shape[1:]) * a.itemsize, a.itemsize, a.itemsize)
return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
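The stride-tricks helpers above build overlapping windows as views, without copying the data. A standalone sketch of what rolling() produces (the function is repeated here only so the snippet runs on its own):

import numpy as np
def rolling(a, window):
    shape = (a.size - window + 1, window)
    strides = (a.itemsize, a.itemsize)
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
print(rolling(np.arange(5), 3))
# -> [[0 1 2]
#     [1 2 3]
#     [2 3 4]]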
53 onn/model.py

@@ -5,14 +5,16 @@ from .nodal import *
from .layer_base import *
from .utility import *
class Model:
def __init__(self, nodes_in, nodes_out, loss=None, mloss=None, unsafe=False):
def __init__(self, nodes_in, nodes_out,
loss=None, mloss=None, unsafe=False):
self.loss = loss if loss is not None else SquaredHalved()
self.mloss = mloss if mloss is not None else loss
nodes_in = [nodes_in] if isinstance(nodes_in, Layer) else nodes_in
nodes_in = [nodes_in] if isinstance(nodes_in, Layer) else nodes_in
nodes_out = [nodes_out] if isinstance(nodes_out, Layer) else nodes_out
assert type(nodes_in) == list, type(nodes_in)
assert type(nodes_in) == list, type(nodes_in)
assert type(nodes_out) == list, type(nodes_out)
self.nodes_in = nodes_in
self.nodes_out = nodes_out

@@ -29,8 +31,9 @@ class Model:
return self.nodes
def make_weights(self):
self.param_count = sum((node.size for node in self.nodes if not node.shared))
self.W = np.zeros(self.param_count, dtype=_f)
self.param_count = sum((node.size for node in self.nodes
if not node.shared))
self.W = np.zeros(self.param_count, dtype=_f)
self.dW = np.zeros(self.param_count, dtype=_f)
offset = 0

@@ -47,37 +50,42 @@ class Model:
assert size == len(ret[0]), (size, len(ret[0]))
return ret
fmt = "Layer {} allocated {} weights than it said it would"
node.init(allocate)
assert inner_offset <= node.size, "Layer {} allocated more weights than it said it would".format(node)
assert inner_offset <= node.size, fmt.format("more", node)
# i don't care if "less" is grammatically incorrect.
# you're mom is grammatically incorrect.
assert inner_offset >= node.size, "Layer {} allocated less weights than it said it would".format(node)
assert inner_offset >= node.size, fmt.format("less", node)
offset += node.size
def evaluate(self, input_, deterministic=True):
assert len(self.nodes_in) == 1, "ambiguous input in multi-input network; use evaluate_multi() instead"
assert len(self.nodes_out) == 1, "ambiguous output in multi-output network; use evaluate_multi() instead"
fmt = "ambiguous input in multi-{} network; use {}() instead"
assert len(self.nodes_in) == 1, fmt.format("input", "evaluate_multi")
assert len(self.nodes_out) == 1, fmt.format("output", "evaluate_multi")
node_in = self.nodes_in[0]
node_out = self.nodes_out[0]
outputs = self.evaluate_multi({node_in: input_}, deterministic)
return outputs[node_out]
def apply(self, error): # TODO: better name?
assert len(self.nodes_in) == 1, "ambiguous input in multi-input network; use apply_multi() instead"
assert len(self.nodes_out) == 1, "ambiguous output in multi-output network; use apply_multi() instead"
def apply(self, error): # TODO: better name?
fmt = "ambiguous input in multi-{} network; use {}() instead"
assert len(self.nodes_in) == 1, fmt.format("input", "apply_multi")
assert len(self.nodes_out) == 1, fmt.format("output", "apply_multi")
node_in = self.nodes_in[0]
node_out = self.nodes_out[0]
inputs = self.apply_multi({node_out: error})
return inputs[node_in]
def evaluate_multi(self, inputs, deterministic=True):
fmt = "missing {} for node {}"
values = dict()
outputs = dict()
for node in self.nodes:
if node in self.nodes_in:
assert node in inputs, "missing input for node {}".format(node.name)
assert node in inputs, fmt.format("input", node.name)
X = inputs[node]
values[node] = node._propagate(np.expand_dims(X, 0), deterministic)
values[node] = node._propagate(np.expand_dims(X, 0),
deterministic)
else:
values[node] = node.propagate(values, deterministic)
if node in self.nodes_out:

@@ -85,11 +93,12 @@ class Model:
return outputs
def apply_multi(self, outputs):
fmt = "missing {} for node {}"
values = dict()
inputs = dict()
for node in reversed(self.nodes):
if node in self.nodes_out:
assert node in outputs, "missing output for node {}".format(node.name)
assert node in outputs, fmt.format("output", node.name)
X = outputs[node]
values[node] = node._backpropagate(np.expand_dims(X, 0))
else:

@@ -135,13 +144,17 @@ class Model:
def load_weights(self, fn):
# seemingly compatible with keras' Dense layers.
import h5py
open(fn) # just ensure the file exists (python's error is better)
f = h5py.File(fn, 'r')
weights = {}
import h5py
open(fn) # just ensure the file exists (python's error is better)
f = h5py.File(fn, 'r')
def visitor(name, obj):
if isinstance(obj, h5py.Dataset):
weights[name.split('/')[-1]] = np.array(obj[:], dtype=_f)
f.visititems(visitor)
f.close()

@@ -194,5 +207,7 @@ class Model:
children = [str(n) for n in node.children]
if children:
sep = '->'
print('\t' + str(node) + sep + (';\n\t' + str(node) + sep).join(children) + ';', file=file)
print('\t' + str(node) + sep +
(';\n\t' + str(node) + sep).join(children) + ';',
file=file)
print('}', file=file)
@@ -3,7 +3,8 @@ class DummyNode:
def __init__(self, children=None, parents=None):
self.children = children if children is not None else []
self.parents = parents if parents is not None else []
self.parents = parents if parents is not None else []
def traverse(node_in, node_out, nodes=None, dummy_mode=False):
# i have no idea if this is any algorithm in particular.

@@ -27,7 +28,7 @@ def traverse(node_in, node_out, nodes=None, dummy_mode=False):
if not seen_up[node]:
continue
parents_added = (parent in nodes for parent in node.parents)
if not node in nodes and all(parents_added):
if node not in nodes and all(parents_added):
nodes.append(node)
for child in node.children:
q.append(child)

@@ -37,6 +38,7 @@ def traverse(node_in, node_out, nodes=None, dummy_mode=False):
return nodes
def traverse_all(nodes_in, nodes_out, nodes=None):
all_in = DummyNode(children=nodes_in)
all_out = DummyNode(parents=nodes_out)
130 onn/optimizer.py

@@ -7,9 +7,10 @@ from .utility import *
# some of the the following optimizers are blatantly lifted from tiny-dnn:
# https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/optimizers/optimizer.h
class Momentum(Optimizer):
def __init__(self, lr=0.01, mu=0.9, nesterov=False):
self.mu = _f(mu) # momentum
self.mu = _f(mu) # momentum
self.nesterov = bool(nesterov)
super().__init__(lr)

@@ -28,6 +29,7 @@ class Momentum(Optimizer):
return V
class Adagrad(Optimizer):
def __init__(self, lr=0.01, eps=1e-8):
self.eps = _f(eps)

@@ -44,6 +46,7 @@ class Adagrad(Optimizer):
self.g += np.square(dW)
return -self.lr * dW / (np.sqrt(self.g) + self.eps)
class RMSprop(Optimizer):
# RMSprop generalizes* Adagrad, etc.

@@ -51,7 +54,7 @@ class RMSprop(Optimizer):
# RMSprop.mu == 1
def __init__(self, lr=1e-4, mu=0.99, eps=1e-8):
self.mu = _f(mu) # decay term
self.mu = _f(mu) # decay term
self.eps = _f(eps)
# one might consider the following equation when specifying mu:

@@ -70,12 +73,13 @@ class RMSprop(Optimizer):
if self.g is None:
self.g = np.zeros_like(dW)
# basically apply a first-order low-pass filter to delta squared
# basically apply a first-order low-pass filter to delta squared,
self.g += (1 - self.mu) * (np.square(dW) - self.g)
# finally sqrt it to complete the running root-mean-square approximation
# and sqrt it to complete the running root-mean-square approximation.
return -self.lr * dW / (np.sqrt(self.g) + self.eps)
class RMSpropCentered(Optimizer):
# referenced TensorFlow/PyTorch.
# paper: https://arxiv.org/pdf/1308.0850v5.pdf

@@ -115,10 +119,11 @@ class RMSpropCentered(Optimizer):
self.delta[:] = self.momentum * self.delta + self.lr * temp
return -self.delta
# PyTorch does it this way.
#self.delta[:] = self.momentum * self.delta + temp
#return -self.lr * self.delta
# self.delta[:] = self.momentum * self.delta + temp
# return -self.lr * self.delta
# they are equivalent only when LR is constant, which it might not be.
class Adam(Optimizer):
# paper: https://arxiv.org/abs/1412.6980
# Adam generalizes* RMSprop, and

@@ -130,10 +135,10 @@ class Adam(Optimizer):
# Adam.b2 == RMSprop.mu
def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8):
self.b1 = _f(b1) # decay term
self.b2 = _f(b2) # decay term
self.b1_t_default = _f(b1) # decay term power t
self.b2_t_default = _f(b2) # decay term power t
self.b1 = _f(b1) # decay term
self.b2 = _f(b2) # decay term
self.b1_t_default = _f(b1) # decay term power t
self.b2_t_default = _f(b2) # decay term power t
self.eps = _f(eps)
super().__init__(lr)

@@ -159,18 +164,20 @@ class Adam(Optimizer):
self.vt += (1 - self.b2) * (np.square(dW) - self.vt)
return -self.lr * (self.mt / (1 - self.b1_t)) \
/ (np.sqrt(self.vt / (1 - self.b2_t)) + self.eps)
/ (np.sqrt(self.vt / (1 - self.b2_t)) + self.eps)
class Nadam(Optimizer):
# paper: https://arxiv.org/abs/1412.6980
# paper: http://cs229.stanford.edu/proj2015/054_report.pdf
# TODO: double-check this implementation. also read the damn paper.
# lifted from https://github.com/fchollet/keras/blob/5d38b04/keras/optimizers.py#L530
# lifted from https://github.com/jpilaul/IFT6266_project/blob/master/Models/Algo_Momentum.py
# lifted from:
# https://github.com/fchollet/keras/blob/5d38b04/keras/optimizers.py#L530
# https://github.com/jpilaul/IFT6266_project/blob/master/Models/Algo_Momentum.py
def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8):
self.b1 = _f(b1) # decay term
self.b2 = _f(b2) # decay term
self.b1 = _f(b1) # decay term
self.b2 = _f(b2) # decay term
self.eps = _f(eps)
super().__init__(lr)

@@ -208,6 +215,7 @@ class Nadam(Optimizer):
return -self.lr * mt_bar / (np.sqrt(vtp) + self.eps)
# more
class FTML(Optimizer):

@@ -216,8 +224,8 @@ class FTML(Optimizer):
def __init__(self, lr=0.0025, b1=0.6, b2=0.999, eps=1e-8):
self.iterations = _0
self.b1 = _f(b1) # decay term
self.b2 = _f(b2) # decay term
self.b1 = _f(b1) # decay term
self.b2 = _f(b2) # decay term
self.eps = _f(eps)
super().__init__(lr)

@@ -231,10 +239,14 @@ class FTML(Optimizer):
self.b2_t = _1
def compute(self, dW, W):
if self.dt1 is None: self.dt1 = np.zeros_like(dW)
if self.dt is None: self.dt = np.zeros_like(dW)
if self.vt is None: self.vt = np.zeros_like(dW)
if self.zt is None: self.zt = np.zeros_like(dW)
if self.dt1 is None:
self.dt1 = np.zeros_like(dW)
if self.dt is None:
self.dt = np.zeros_like(dW)
if self.vt is None:
self.vt = np.zeros_like(dW)
if self.zt is None:
self.zt = np.zeros_like(dW)
# NOTE: we could probably rewrite these equations to avoid this copy.
self.dt1[:] = self.dt[:]

@@ -260,6 +272,7 @@ class FTML(Optimizer):
# subtract by weights to avoid having to override self.update.
return -self.zt / self.dt - W
class MomentumClip(Optimizer):
def __init__(self, lr=0.01, mu=0.9, nesterov=False, clip=1.0, debug=False):
self.mu = _f(mu)

@@ -289,22 +302,25 @@ class MomentumClip(Optimizer):
else:
return -self.lr * self.accum
class YellowFin(Optimizer):
# paper: https://arxiv.org/abs/1706.03471
# knowyourmeme: http://cs.stanford.edu/~zjian/project/YellowFin/
# author's implementation: https://github.com/JianGoForIt/YellowFin/blob/master/tuner_utils/yellowfin.py
# code lifted: https://gist.github.com/botev/f8b32c00eafee222e47393f7f0747666
# author's implementation:
# https://github.com/JianGoForIt/YellowFin/blob/master/tuner_utils/yellowfin.py
# code lifted:
# https://gist.github.com/botev/f8b32c00eafee222e47393f7f0747666
def __init__(self, lr=0.1, mu=0.0, beta=0.999, window_size=20,
debias=True, clip=1.0):
self.lr_default = _f(lr)
self.mu_default = _f(mu)
self.beta = _f(beta)
self.window_size = int(window_size) # curv_win_width
self.window_size = int(window_size) # curv_win_width
self.debias_enabled = bool(debias)
self.clip = _f(clip)
self.mu = _f(mu) # momentum
self.mu = _f(mu) # momentum
super().__init__(lr)
def reset(self):

@@ -316,13 +332,13 @@ class YellowFin(Optimizer):
self.step = 0
self.beta_t = self.beta
self.curv_win = np.zeros([self.window_size,], dtype=np.float32)
self.curv_win = np.zeros([self.window_size, ], dtype=np.float32)
self.h_min = None
self.h_max = None
self.g_lpf = 0
#self.g_squared_lpf = 0
# self.g_squared_lpf = 0
self.g_norm_squared_lpf = 0
self.g_norm_lpf = 0
self.h_min_lpf = 0

@@ -332,7 +348,8 @@ class YellowFin(Optimizer):
self.mu_lpf = 0
def get_lr_mu(self):
p = (np.square(self.dist_avg) * np.square(self.h_min)) / (2 * self.g_var)
p = (np.square(self.dist_avg) * np.square(self.h_min)) \
/ (2 * self.g_var)
w3 = p * (np.sqrt(0.25 + p / 27.0) - 0.5)
w = np.power(w3, 1/3)
y = w - p / (3 * w)

@@ -360,11 +377,11 @@ class YellowFin(Optimizer):
total_norm = np.linalg.norm(dW)
clip_scale = self.clip / (total_norm + 1e-6)
if clip_scale < 1:
#print("clipping gradients; norm: {:10.5f}".format(total_norm))
# print("clipping gradients; norm: {:10.5f}".format(total_norm))
dW *= clip_scale
#fmt = 'W std: {:10.7f}e-3, dWstd: {:10.7f}e-3, V std: {:10.7f}e-3'
#print(fmt.format(np.std(W), np.std(dW) * 100, np.std(V) * 100))
# fmt = 'W std: {:10.7f}e-3, dWstd: {:10.7f}e-3, V std: {:10.7f}e-3'
# print(fmt.format(np.std(W), np.std(dW) * 100, np.std(V) * 100))
b = self.beta
m1b = 1 - self.beta

@@ -380,30 +397,31 @@ class YellowFin(Optimizer):
h_min_t = np.min(valid_window)
h_max_t = np.max(valid_window)
self.g_lpf = b * self.g_lpf + m1b * g
#self.g_squared_lpf = b * self.g_squared_lpf + m1b * g_squared
self.g_norm_squared_lpf = b * self.g_norm_squared_lpf + m1b * g_norm_squared
self.g_norm_lpf = b * self.g_norm_lpf + m1b * g_norm
self.h_min_lpf = b * self.h_min_lpf + m1b * h_min_t
self.h_max_lpf = b * self.h_max_lpf + m1b * h_max_t
self.g_lpf = b * self.g_lpf + m1b * g
# self.g_squared_lpf = b * self.g_squared_lpf + m1b * g_squared
self.g_norm_squared_lpf = b * self.g_norm_squared_lpf \
+ m1b * g_norm_squared
self.g_norm_lpf = b * self.g_norm_lpf + m1b * g_norm
self.h_min_lpf = b * self.h_min_lpf + m1b * h_min_t
self.h_max_lpf = b * self.h_max_lpf + m1b * h_max_t
g_avg = debias * self.g_lpf
#g_squared_avg = debias * self.g_squared_lpf
g_avg = debias * self.g_lpf
# g_squared_avg = debias * self.g_squared_lpf
g_norm_squared_avg = debias * self.g_norm_squared_lpf
g_norm_avg = debias * self.g_norm_lpf
self.h_min = debias * self.h_min_lpf
self.h_max = debias * self.h_max_lpf
g_norm_avg = debias * self.g_norm_lpf
self.h_min = debias * self.h_min_lpf
self.h_max = debias * self.h_max_lpf
assert self.h_max >= self.h_min
dist = g_norm_avg / g_norm_squared_avg
self.dist_lpf = b * self.dist_lpf + m1b * dist
self.dist_lpf = b * self.dist_lpf + m1b * dist
self.dist_avg = debias * self.dist_lpf
self.dist_avg = debias * self.dist_lpf
self.g_var = g_norm_squared_avg - np.sum(np.square(g_avg))
# equivalently:
#self.g_var = np.sum(np.abs(g_squared_avg - np.square(g_avg)))
# self.g_var = np.sum(np.abs(g_squared_avg - np.square(g_avg)))
if self.step > 0:
lr_for_real, mu_for_real = self.get_lr_mu()

@@ -419,6 +437,7 @@ class YellowFin(Optimizer):
self.beta_t *= self.beta
return V
class AddSign(Optimizer):
# paper: https://arxiv.org/abs/1709.07417

@@ -438,10 +457,11 @@ class AddSign(Optimizer):
self.accum[:] = self.accum * self.mu + dW
signed = np.sign(dW) * np.sign(self.accum)
#signed *= decay
# signed *= decay
return -self.lr * dW * (self.alpha + signed)
class PowerSign(Optimizer):
# paper: https://arxiv.org/abs/1709.07417

@@ -462,13 +482,14 @@ class PowerSign(Optimizer):
self.accum[:] = self.accum * self.mu + dW
signed = np.sign(dW) * np.sign(self.accum)
#signed *= decay
# signed *= decay
if self.use_exp:
return -self.lr * dW * np.exp(signed)
else:
return -self.lr * dW * np.power(self.alpha, signed)
class Neumann(Optimizer):
# paper: https://arxiv.org/abs/1712.03298
# NOTE: this implementation is missing resetting as described in the paper.

@@ -478,20 +499,20 @@ class Neumann(Optimizer):
# it seems like using a Learner like SineCLR makes this unnecessary.
def __init__(self, lr=0.01):
self.alpha = _f(1e-7) # cubic.
self.beta = _f(1e-5) # repulsive. NOTE: multiplied by len(dW) later.
self.gamma = _f(0.99) # EMA, or 1-pole low-pass parameter. same thing.
self.alpha = _f(1e-7) # cubic.
self.beta = _f(1e-5) # repulsive. NOTE: multiplied by len(dW) later.
self.gamma = _f(0.99) # EMA, or 1-pole low-pass parameter. same thing.
# momentum is ∝ (in the shape of) 1 - 1/(1 + t)
self.mu_min = _f(0.5)
self.mu_max = _f(0.9)
self.reset_period = 0 # TODO
self.reset_period = 0 # TODO
super().__init__(lr)
def reset(self):
# NOTE: mt and vt are different than the pair in Adam-like optimizers.
self.mt = None # momentum accumulator.
self.vt = None # weight accumulator.
self.mt = None # momentum accumulator.
self.vt = None # weight accumulator.
self.t = 0
def compute(self, dW, W):

@@ -510,7 +531,7 @@ class Neumann(Optimizer):
return
# momentum quantity:
mu = _1 - _1/_f(self.t) # the + 1 is implicit.
mu = _1 - _1/_f(self.t) # the + 1 is implicit.
mu = (mu + self.mu_min) * (self.mu_max - self.mu_min)
# smoothed change in weights:

@@ -529,4 +550,3 @@ class Neumann(Optimizer):
# weights and accumulator:
W += mu * self.mt - self.lr * dt
self.vt = W + self.gamma * (self.vt - W)
@@ -2,9 +2,10 @@ import numpy as np
from .float import *
class Optimizer:
def __init__(self, lr=0.1):
self.lr = _f(lr) # learning rate
self.lr = _f(lr) # learning rate
self.reset()
def reset(self):

@@ -15,5 +16,3 @@ class Optimizer:
def update(self, dW, W):
W += self.compute(dW, W)
@@ -4,6 +4,7 @@ from .float import *
from .layer_base import *
from .initialization import *
class Bias(Layer):
# TODO: support axes other than -1 and shapes other than 1D.

@@ -28,6 +29,7 @@ class Bias(Layer):
self.biases.g += dY.sum(0)
return dY
class Dense(Layer):
serialized = {
'W': 'coeffs',

@@ -38,8 +40,10 @@ class Dense(Layer):
super().__init__()
self.dim = int(dim)
self.output_shape = (dim,)
self.coeffs = self._new_weights('coeffs', init=init, regularizer=reg_w)
self.biases = self._new_weights('biases', init=init_zeros, regularizer=reg_b)
self.coeffs = self._new_weights('coeffs', init=init,
regularizer=reg_w)
self.biases = self._new_weights('biases', init=init_zeros,
regularizer=reg_b)
def make_shape(self, parent):
shape = parent.output_shape

@@ -101,18 +105,20 @@ class Conv1Dper(Layer):
def forward(self, X):
if self.wrap0 == 0:
Xper = np.hstack((X,X[:,:self.wrap1]))
Xper = np.hstack((X, X[:, :self.wrap1]))
elif self.wrap1 == 0:
Xper = np.hstack((X[:,-self.wrap0:],X))
Xper = np.hstack((X[:, -self.wrap0:], X))
else:
Xper = np.hstack((X[:,-self.wrap0:],X,X[:,:self.wrap1]))
Xper = np.hstack((X[:, -self.wrap0:], X, X[:, :self.wrap1]))
self.cols = rolling_batch(Xper, self.kernel_size)
convolved = (self.cols * self.coeffs.f[:,::-1]).sum(2)
convolved = (self.cols * self.coeffs.f[:, ::-1]).sum(2)
return convolved
def backward(self, dY):
self.coeffs.g += (dY[:,:,None] * self.cols).sum(0)[:,::-1].sum(0, keepdims=True)
return (dY[:,:,None] * self.coeffs.f[:,::-1]).sum(2)
self.coeffs.g += (dY[:, :, None] * self.cols).sum(0)[:, ::-1].sum(
0, keepdims=True)
return (dY[:, :, None] * self.coeffs.f[:, ::-1]).sum(2)
class LayerNorm(Layer):
# paper: https://arxiv.org/abs/1607.06450
@@ -168,7 +174,8 @@ class LayerNorm(Layer):
return dX
class Denses(Layer): # TODO: rename?
class Denses(Layer): # TODO: rename?
# acts as a separate Dense for each row or column. only for 2D arrays.
serialized = {

@@ -176,13 +183,16 @@ class Denses(Layer): # TODO: rename?
'b': 'biases',
}
def __init__(self, dim, init=init_he_uniform, reg_w=None, reg_b=None, axis=-1):
def __init__(self, dim, init=init_he_uniform,
reg_w=None, reg_b=None, axis=-1):
super().__init__()
self.dim = int(dim)
self.weight_init = init
self.axis = int(axis)
self.coeffs = self._new_weights('coeffs', init=init, regularizer=reg_w)
self.biases = self._new_weights('biases', init=init_zeros, regularizer=reg_b)
self.coeffs = self._new_weights('coeffs', init=init,
regularizer=reg_w)
self.biases = self._new_weights('biases', init=init_zeros,
regularizer=reg_b)
def make_shape(self, parent):
shape = parent.output_shape

@@ -220,9 +230,11 @@ class Denses(Layer): # TODO: rename?
self.coeffs.g += np.einsum('ijx,ijk->jxk', self.X, dY)
return np.einsum('ijk,jxk->ijx', dY, self.coeffs.f)
class CosineDense(Dense):
# paper: https://arxiv.org/abs/1702.05870
# another implementation: https://github.com/farizrahman4u/keras-contrib/pull/36
# another implementation:
# https://github.com/farizrahman4u/keras-contrib/pull/36
# the paper doesn't mention bias,
# so we treat bias as an additional weight with a constant input of 1.
# this is correct in Dense layers, so i hope it's correct here too.

@@ -231,24 +243,25 @@ class CosineDense(Dense):
def forward(self, X):
self.X = X
self.X_norm = np.sqrt(np.square(X).sum(-1, keepdims=True) \
+ 1 + self.eps)
self.W_norm = np.sqrt(np.square(self.coeffs.f).sum(0, keepdims=True) \
+ np.square(self.biases.f) + self.eps)
self.X_norm = np.sqrt(np.square(X).sum(-1, keepdims=True)
+ 1 + self.eps)
self.W_norm = np.sqrt(np.square(self.coeffs.f).sum(0, keepdims=True)
+ np.square(self.biases.f) + self.eps)
self.dot = X @ self.coeffs.f + self.biases.f
Y = self.dot / (self.X_norm * self.W_norm)
return Y
def backward(self, dY):
ddot = dY / self.X_norm / self.W_norm
dX_norm = -(dY * self.dot / self.W_norm).sum(-1, keepdims=True) / self.X_norm**2
dW_norm = -(dY * self.dot / self.X_norm).sum( 0, keepdims=True) / self.W_norm**2
dX_norm = -(dY * self.dot / self.W_norm).sum(-1, keepdims=True) \
/ self.X_norm**2
dW_norm = -(dY * self.dot / self.X_norm).sum(0, keepdims=True) \
/ self.W_norm**2
self.coeffs.g += self.X.T @ ddot \
+ dW_norm / self.W_norm * self.coeffs.f
self.coeffs.g += self.X.T @ ddot \
+ dW_norm / self.W_norm * self.coeffs.f
self.biases.g += ddot.sum(0, keepdims=True) \
+ dW_norm / self.W_norm * self.biases.f
+ dW_norm / self.W_norm * self.biases.f
dX = ddot @ self.coeffs.f.T + dX_norm / self.X_norm * self.X
return dX
@@ -2,9 +2,11 @@ import numpy as np
from .float import *
class Regularizer:
pass
class L1L2(Regularizer):
def __init__(self, l1=0.0, l2=0.0):
self.l1 = _f(l1)

@@ -26,6 +28,7 @@ class L1L2(Regularizer):
df += self.l2 * 2 * X
return df
# more
class SaturateRelu(Regularizer):
@@ -4,6 +4,7 @@ from .float import *
from .initialization import *
from .ritual_base import *
def stochastic_multiply(W, gamma=0.5, allow_negation=False):
# paper: https://arxiv.org/abs/1606.01981

@@ -23,6 +24,7 @@ def stochastic_multiply(W, gamma=0.5, allow_negation=False):
mult *= np.where(samples < prob, 1, -1)
np.multiply(W, mult, out=W)
class StochMRitual(Ritual):
# paper: https://arxiv.org/abs/1606.01981
# this probably doesn't make sense for regression problems,

@@ -38,8 +40,8 @@ class StochMRitual(Ritual):
def learn(self, inputs, outputs):
# an experiment:
#assert self.learner.rate < 10, self.learner.rate
#self.gamma = 1 - 1/2**(1 - np.log10(self.learner.rate))
# assert self.learner.rate < 10, self.learner.rate
# self.gamma = 1 - 1/2**(1 - np.log10(self.learner.rate))
self.W[:] = self.model.W
for layer in self.model.ordered_nodes:

@@ -57,6 +59,7 @@ class StochMRitual(Ritual):
np.clip(layer.W, -layer.std * f, layer.std * f, out=layer.W)
# np.clip(layer.W, -1, 1, out=layer.W)
class NoisyRitual(Ritual):
def __init__(self, learner=None,
input_noise=0, output_noise=0, gradient_noise=0):

@@ -69,7 +72,7 @@ class NoisyRitual(Ritual):
# this is pretty crude
if self.input_noise > 0:
s = self.input_noise
inputs = inputs + np.random.normal(0, s, size=inputs.shape)
inputs = inputs + np.random.normal(0, s, size=inputs.shape)
if self.output_noise > 0:
s = self.output_noise
outputs = outputs + np.random.normal(0, s, size=outputs.shape)

@@ -80,11 +83,10 @@ class NoisyRitual(Ritual):
if self.gradient_noise > 0:
size = len(self.model.dW)
gamma = 0.55
#s = self.gradient_noise / (1 + self.bn) ** gamma
# s = self.gradient_noise / (1 + self.bn) ** gamma
# experiments:
s = self.gradient_noise * np.sqrt(self.learner.rate)
#s = np.square(self.learner.rate)
#s = self.learner.rate / self.en
# s = np.square(self.learner.rate)
# s = self.learner.rate / self.en
self.model.dW += np.random.normal(0, max(s, 1e-8), size=size)
super().update()
@@ -3,7 +3,8 @@ import numpy as np
from .float import *
class Ritual: # i'm just making up names at this point.
class Ritual: # i'm just making up names at this point.
def __init__(self, learner=None):
self.learner = learner if learner is not None else Learner(Optimizer())
self.model = None

@@ -77,7 +78,8 @@ class Ritual: # i'm just making up names at this point.
if shuffle:
if gen:
raise Exception("shuffling is incompatibile with using a generator.")
raise Exception(
"shuffling is incompatibile with using a generator.")
indices = np.arange(inputs.shape[0])
np.random.shuffle(indices)
inputs = inputs[indices]

@@ -90,7 +92,7 @@ class Ritual: # i'm just making up names at this point.
batch_count = inputs.shape[0] // batch_size
# TODO: lift this restriction
assert inputs.shape[0] % batch_size == 0, \
"inputs is not evenly divisible by batch_size"
"inputs is not evenly divisible by batch_size"
prev_batch_size = None
for b in range(batch_count):

@@ -101,17 +103,20 @@ class Ritual: # i'm just making up names at this point.
batch_inputs, batch_outputs = next(generator)
batch_size = batch_inputs.shape[0]
# TODO: lift this restriction
assert batch_size == prev_batch_size or prev_batch_size is None, \
"non-constant batch size (got {}, expected {})".format(batch_size, prev_batch_size)
fmt = "non-constant batch size (got {}, expected {})"
assert (batch_size == prev_batch_size
or prev_batch_size is None), \
fmt.format(batch_size, prev_batch_size)
else:
bi = b * batch_size
batch_inputs = inputs[ bi:bi+batch_size]
batch_inputs = inputs[bi:bi+batch_size]
batch_outputs = outputs[bi:bi+batch_size]
if clear_grad:
self.model.clear_grad()
self._train_batch(batch_inputs, batch_outputs, b, batch_count,
test_only, return_losses=='both', return_losses)
test_only, return_losses == 'both',
return_losses)
prev_batch_size = batch_size
@@ -1,17 +1,23 @@
import sys
def lament(*args, **kwargs):
print(*args, file=sys.stderr, **kwargs)
def lower_priority():
"""Set the priority of the process to below-normal."""
# via https://stackoverflow.com/a/1023269
if sys.platform == 'win32':
try:
import win32api, win32process, win32con
import win32api
import win32process
import win32con
pid = win32api.GetCurrentProcessId()
handle = win32api.OpenProcess(win32con.PROCESS_ALL_ACCESS, True, pid)
win32process.SetPriorityClass(handle, win32process.BELOW_NORMAL_PRIORITY_CLASS)
handle = win32api.OpenProcess(
win32con.PROCESS_ALL_ACCESS, True, pid)
win32process.SetPriorityClass(
handle, win32process.BELOW_NORMAL_PRIORITY_CLASS)
except ImportError:
lament("you do not have pywin32 installed.")
lament("the process priority could not be lowered.")

@@ -21,9 +27,12 @@ def lower_priority():
import os
os.nice(1)
# more
_log_was_update = False
def log(left, right, update=False):
s = "\x1B[1m {:>20}:\x1B[0m {}".format(left, right)
global _log_was_update

@@ -33,5 +42,6 @@ def log(left, right, update=False):
lament(s)
_log_was_update = update
class Dummy:
pass
@@ -1,11 +1,12 @@
import numpy as np
class Weights:
# we may or may not contain weights -- or any information, for that matter.
def __init__(self, **kwargs):
self.f = None # forward weights
self.g = None # backward weights (gradients)
self.f = None # forward weights
self.g = None # backward weights (gradients)
self.shape = None
self.init = None
self.allocator = None

@@ -16,7 +17,7 @@ class Weights:
def configure(self, **kwargs):
for k, v in kwargs.items():
getattr(self, k) # ensures the key already exists
getattr(self, k) # ensures the key already exists
setattr(self, k, v)
@property