merge and split modules into a package
parent db65fbdd62
commit bbdb91fcb1
24 changed files with 2343 additions and 2829 deletions
7  .gitignore  vendored  Normal file
@@ -0,0 +1,7 @@
__pycache__
*.pyc

# work in progress:
onn/component.py
onn/extra.py
onn/run.py
@@ -35,7 +35,7 @@ numpy scipy h5py sklearn dotmap

 ```python
 #!/usr/bin/env python3
-from onn_core import *
+from onn import *
 bs = 500
 lr = 0.01
 reg = L1L2(3.2e-5, 3.2e-4)
2  onn_mnist.py → mnist_example.py  Executable file → Normal file
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3

 from onn import *
-from onn_core import _f
+from onn.floats import *
 from dotmap import DotMap

 lower_priority()
28  onn/__init__.py  Normal file
@@ -0,0 +1,28 @@
# external packages required for full functionality:
# numpy scipy h5py sklearn dotmap

# BIG TODO: ensure numpy isn't upcasting to float64 *anywhere*.
# this is gonna take some work.

from .activation import *
from .floats import *
from .initialization import *
from .layer import *
from .learner import *
from .loss import *
from .math import *
from .model import *
from .nodes import *
from .optimizer import *
from .parametric import *
from .regularizer import *
from .ritual import *
from .util import *
from .weights import *

# this is similar to default behaviour of having no __all__ variable at all,
# but ours ignores modules as well. this allows for `import sys` and such
# without clobbering `from our_module import *`.
__all__ = [
    o for o in locals()
    if type(o) != 'module' and not o.startswith('_')]
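A note on the `__all__` filter above: iterating `locals()` directly yields *names* (strings), so `type(o) != 'module'` compares the `str` type against the string `'module'` and is always true; only the underscore check actually filters anything. A module-aware variant would have to look up the bound object. A minimal sketch, not part of this commit:

```python
# sketch only (not part of this commit): a module-aware __all__ filter.
# locals() yields name strings, so the object must be looked up before
# its type can be tested.
import types

__all__ = [
    name for name, obj in list(locals().items())
    if not isinstance(obj, types.ModuleType) and not name.startswith('_')]
```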
185  onn/activation.py  Normal file
@@ -0,0 +1,185 @@
import numpy as np

# just for speed, not strictly essential:
from scipy.special import expit as sigmoid

from .floats import *
from .layer_base import *

class Identity(Layer):
    def forward(self, X):
        return X

    def backward(self, dY):
        return dY

class Sigmoid(Layer):  # aka Logistic, Expit (inverse of Logit)
    def forward(self, X):
        self.sig = sigmoid(X)
        return self.sig

    def backward(self, dY):
        return dY * self.sig * (1 - self.sig)

class Softplus(Layer):
    # integral of Sigmoid.

    def forward(self, X):
        self.X = X
        return np.log(1 + np.exp(X))

    def backward(self, dY):
        return dY * sigmoid(self.X)

class Tanh(Layer):
    def forward(self, X):
        self.sig = np.tanh(X)
        return self.sig

    def backward(self, dY):
        return dY * (1 - self.sig * self.sig)

class LeCunTanh(Layer):
    # paper: http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf
    # paper: http://yann.lecun.com/exdb/publis/pdf/lecun-89.pdf
    # scaled such that f([-1, 1]) = [-1, 1].
    # helps preserve an input variance of 1.
    # second derivative peaks around an input of ±1.

    def forward(self, X):
        self.sig = np.tanh(2 / 3 * X)
        return 1.7159 * self.sig

    def backward(self, dY):
        return dY * (2 / 3 * 1.7159) * (1 - self.sig * self.sig)

class Relu(Layer):
    def forward(self, X):
        self.cond = X >= 0
        return np.where(self.cond, X, 0)

    def backward(self, dY):
        return np.where(self.cond, dY, 0)

class Elu(Layer):
    # paper: https://arxiv.org/abs/1511.07289

    def __init__(self, alpha=1):
        super().__init__()
        self.alpha = _f(alpha)  # FIXME: unused

    def forward(self, X):
        self.cond = X >= 0
        self.neg = np.exp(X) - 1
        return np.where(self.cond, X, self.neg)

    def backward(self, dY):
        return dY * np.where(self.cond, 1, self.neg + 1)

class GeluApprox(Layer):
    # paper: https://arxiv.org/abs/1606.08415
    # plot: https://www.desmos.com/calculator/ydzgtccsld

    def forward(self, X):
        self.a = 1.704 * X
        self.sig = sigmoid(self.a)
        return X * self.sig

    def backward(self, dY):
        return dY * self.sig * (1 + self.a * (1 - self.sig))

class Softmax(Layer):
    def forward(self, X):
        alpha = np.max(X, axis=-1, keepdims=True)
        num = np.exp(X - alpha)
        den = np.sum(num, axis=-1, keepdims=True)
        self.sm = num / den
        return self.sm

    def backward(self, dY):
        return (dY - np.sum(dY * self.sm, axis=-1, keepdims=True)) * self.sm

class LogSoftmax(Softmax):
    def __init__(self, eps=1e-6):
        super().__init__()
        self.eps = _f(eps)

    def forward(self, X):
        return np.log(super().forward(X) + self.eps)

    def backward(self, dY):
        return dY - np.sum(dY, axis=-1, keepdims=True) * self.sm

class Cos(Layer):
    # performs well on MNIST for some strange reason.

    def forward(self, X):
        self.X = X
        return np.cos(X)

    def backward(self, dY):
        return dY * -np.sin(self.X)

class Selu(Layer):
    # paper: https://arxiv.org/abs/1706.02515

    def __init__(self, alpha=1.67326324, lamb=1.05070099):
        super().__init__()
        self.alpha = _f(alpha)
        self.lamb = _f(lamb)

    def forward(self, X):
        self.cond = X >= 0
        self.neg = self.alpha * np.exp(X)
        return self.lamb * np.where(self.cond, X, self.neg - self.alpha)

    def backward(self, dY):
        return dY * self.lamb * np.where(self.cond, 1, self.neg)

# more

class TanhTest(Layer):
    def forward(self, X):
        self.sig = np.tanh(1 / 2 * X)
        return 2.4004 * self.sig

    def backward(self, dY):
        return dY * (1 / 2 * 2.4004) * (1 - self.sig * self.sig)

class ExpGB(Layer):
    # an output layer for one-hot classification problems.
    # use with MSE (SquaredHalved), not CategoricalCrossentropy!
    # paper: https://arxiv.org/abs/1707.04199

    def __init__(self, alpha=0.1, beta=0.0):
        super().__init__()
        self.alpha = _f(alpha)
        self.beta = _f(beta)

    def forward(self, X):
        return self.alpha * np.exp(X) + self.beta

    def backward(self, dY):
        # this gradient is intentionally incorrect.
        return dY

class CubicGB(Layer):
    # an output layer for one-hot classification problems.
    # use with MSE (SquaredHalved), not CategoricalCrossentropy!
    # paper: https://arxiv.org/abs/1707.04199
    # note: in the paper, it's called pow3GB, which is ugly.

    def __init__(self, alpha=0.1, beta=0.0):
        # note: the paper suggests defaults of 0.001 and 0.0,
        # but these didn't seem to work as well in my limited testing.
        super().__init__()
        self.alpha = _f(alpha)
        self.beta = _f(beta)

    def forward(self, X):
        return self.alpha * X**3 + self.beta

    def backward(self, dY):
        # this gradient is intentionally incorrect.
        return dY
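Each activation pairs a `forward()` with a `backward()` that multiplies the incoming delta by the elementwise derivative. A quick way to sanity-check such a pair is a finite-difference comparison; a minimal sketch, assuming the package layout of this commit is importable:

```python
# sketch only: finite-difference check of an activation's backward() pass.
import numpy as np
from onn.activation import GeluApprox

np.random.seed(0)
x = np.random.randn(3, 5).astype(np.float32)
layer = GeluApprox()

y = layer.forward(x)
analytic = layer.backward(np.ones_like(y))   # elementwise dY/dX for dL/dY = 1

eps = 1e-3
numeric = (layer.forward(x + eps) - layer.forward(x - eps)) / (2 * eps)
print(np.max(np.abs(analytic - numeric)))    # should be small (~1e-3 or less)
```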
18  onn/floats.py  Normal file
@@ -0,0 +1,18 @@
import numpy as np

_f = np.float32

def _check(a):
    assert isinstance(a, np.ndarray) or type(a) == _f, type(a)
    assert a.dtype == _f, a.dtype
    return a

_0 = _f(0)
_1 = _f(1)
_2 = _f(2)
_inv2 = _f(1/2)
_sqrt2 = _f(np.sqrt(2))
_invsqrt2 = _f(1/np.sqrt(2))
_pi = _f(np.pi)

__all__ = [o for o in locals()]
31  onn/initialization.py  Normal file
@@ -0,0 +1,31 @@
import numpy as np

# note: these are currently only implemented for 2D shapes.

def init_zeros(size, ins=None, outs=None):
    return np.zeros(size)

def init_ones(size, ins=None, outs=None):
    return np.ones(size)

def init_he_normal(size, ins, outs):
    s = np.sqrt(2 / ins)
    return np.random.normal(0, s, size=size)

def init_he_uniform(size, ins, outs):
    s = np.sqrt(6 / ins)
    return np.random.uniform(-s, s, size=size)

def init_glorot_normal(size, ins, outs):
    s = np.sqrt(2 / (ins + outs))
    return np.random.normal(0, s, size=size)

def init_glorot_uniform(size, ins, outs):
    s = np.sqrt(6 / (ins + outs))
    return np.random.uniform(-s, s, size=size)

# more

def init_gaussian_unit(size, ins, outs):
    s = np.sqrt(1 / ins)
    return np.random.normal(0, s, size=size)
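The uniform bound √(6/ins) is chosen so the variance matches the normal variant: a uniform distribution on (-s, s) has variance s²/3, which here works out to 2/ins. A minimal numeric check, assuming this package layout:

```python
# sketch only: init_he_uniform has the same variance as init_he_normal (2/ins).
import numpy as np
from onn.initialization import init_he_uniform

ins, outs = 256, 128
w = init_he_uniform((ins, outs), ins, outs)
print(w.var(), 2 / ins)   # both should be close to 0.0078
```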
200  onn/layer.py  Normal file
@@ -0,0 +1,200 @@
from .layer_base import *
from .initialization import *
from .floats import *

# Nonparametric Layers {{{1

class Input(Layer):
    def __init__(self, shape):
        assert shape is not None
        super().__init__()
        self.shape = tuple(shape)
        self.input_shape = self.shape
        self.output_shape = self.shape

    def forward(self, X):
        return X

    def backward(self, dY):
        #self.dY = dY
        return np.zeros_like(dY)

class Reshape(Layer):
    def __init__(self, new_shape):
        super().__init__()
        self.shape = tuple(new_shape)
        self.output_shape = self.shape

    def forward(self, X):
        self.batch_size = X.shape[0]
        return X.reshape(self.batch_size, *self.output_shape)

    def backward(self, dY):
        assert dY.shape[0] == self.batch_size
        return dY.reshape(self.batch_size, *self.input_shape)

class Flatten(Layer):
    def make_shape(self, parent):
        shape = parent.output_shape
        self.input_shape = shape
        self.output_shape = (np.prod(shape),)

    def forward(self, X):
        self.batch_size = X.shape[0]
        return X.reshape(self.batch_size, *self.output_shape)

    def backward(self, dY):
        assert dY.shape[0] == self.batch_size
        return dY.reshape(self.batch_size, *self.input_shape)

class ConstAffine(Layer):
    def __init__(self, a=1, b=0):
        super().__init__()
        self.a = _f(a)
        self.b = _f(b)

    def forward(self, X):
        return self.a * X + self.b

    def backward(self, dY):
        return dY * self.a

class Sum(Layer):
    def _propagate(self, edges, deterministic):
        return np.sum(edges, axis=0)

    def _backpropagate(self, edges):
        #assert len(edges) == 1, "unimplemented"
        return edges[0]  # TODO: does this always work?

class ActivityRegularizer(Layer):
    def __init__(self, reg):
        super().__init__()
        assert isinstance(reg, Regularizer), reg
        self.reg = reg

    def forward(self, X):
        self.X = X
        self.loss = np.sum(self.reg.forward(X))
        return X

    def backward(self, dY):
        return dY + self.reg.backward(self.X)

class Dropout(Layer):
    def __init__(self, dropout=0.0):
        super().__init__()
        self.p = _f(1 - dropout)
        assert 0 <= self.p <= 1

    def forward(self, X):
        self.mask = (np.random.rand(*X.shape) < self.p) / self.p
        return X * self.mask

    def forward_deterministic(self, X):
        #self.mask = _1
        return X

    def backward(self, dY):
        return dY * self.mask

# more

class AlphaDropout(Layer):
    # to be used alongside Selu activations.
    # paper: https://arxiv.org/abs/1706.02515

    def __init__(self, dropout=0.0, alpha=1.67326324, lamb=1.05070099):
        super().__init__()
        self.alpha = _f(alpha)
        self.lamb = _f(lamb)
        self.saturated = -self.lamb * self.alpha
        self.dropout = _f(dropout)

    @property
    def dropout(self):
        return self._dropout

    @dropout.setter
    def dropout(self, x):
        self._dropout = _f(x)
        self.q = 1 - self._dropout
        assert 0 <= self.q <= 1

        sat = self.saturated

        self.a = 1 / np.sqrt(self.q + sat * sat * self.q * self._dropout)
        self.b = -self.a * (self._dropout * sat)

    def forward(self, X):
        self.mask = np.random.rand(*X.shape) < self.q
        return self.a * np.where(self.mask, X, self.saturated) + self.b

    def forward_deterministic(self, X):
        return X

    def backward(self, dY):
        return dY * self.a * self.mask

class Decimate(Layer):
    # simple decimation layer that drops every other sample from the last axis.

    def __init__(self, phase='even'):
        super().__init__()
        # phase is the set of samples we keep in the forward pass.
        assert phase in ('even', 'odd'), phase
        self.phase = phase

    def make_shape(self, parent):
        shape = parent.output_shape
        self.input_shape = shape
        divy = (shape[-1] + 1) // 2 if self.phase == 'even' else shape[-1] // 2
        self.output_shape = tuple(list(shape[:-1]) + [divy])
        self.dX = np.zeros(self.input_shape, dtype=_f)

    def forward(self, X):
        self.batch_size = X.shape[0]
        if self.phase == 'even':
            return X.ravel()[0::2].reshape(self.batch_size, *self.output_shape)
        elif self.phase == 'odd':
            return X.ravel()[1::2].reshape(self.batch_size, *self.output_shape)

    def backward(self, dY):
        assert dY.shape[0] == self.batch_size
        dX = np.zeros((self.batch_size, *self.input_shape), dtype=_f)
        if self.phase == 'even':
            dX.ravel()[0::2] = dY.ravel()
        elif self.phase == 'odd':
            dX.ravel()[1::2] = dY.ravel()
        return dX

class Undecimate(Layer):
    # inverse operation of Decimate. not quite interpolation.

    def __init__(self, phase='even'):
        super().__init__()
        # phase is the set of samples we keep in the backward pass.
        assert phase in ('even', 'odd'), phase
        self.phase = phase

    def make_shape(self, parent):
        shape = parent.output_shape
        self.input_shape = shape
        mult = shape[-1] * 2
        self.output_shape = tuple(list(shape[:-1]) + [mult])

    def forward(self, X):
        self.batch_size = X.shape[0]
        Y = np.zeros((self.batch_size, *self.output_shape), dtype=_f)
        if self.phase == 'even':
            Y.ravel()[0::2] = X.ravel()
        elif self.phase == 'odd':
            Y.ravel()[1::2] = X.ravel()
        return Y

    def backward(self, dY):
        assert dY.shape[0] == self.batch_size
        if self.phase == 'even':
            return dY.ravel()[0::2].reshape(self.batch_size, *self.input_shape)
        elif self.phase == 'odd':
            return dY.ravel()[1::2].reshape(self.batch_size, *self.input_shape)
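`Dropout` above uses the "inverted" convention: the mask is divided by the keep-probability `p`, so the expected activation is the same in training and inference and `forward_deterministic()` can simply pass the input through. A minimal sketch, assuming this package layout:

```python
# sketch only: inverted dropout keeps the expected activation unchanged.
import numpy as np
from onn.layer import Dropout

np.random.seed(0)
x = np.ones((100000, 1), dtype=np.float32)
drop = Dropout(dropout=0.3)   # keep-probability p = 0.7

print(drop.forward(x).mean())                # ~1.0 despite 30% of units zeroed
print(drop.forward_deterministic(x).mean())  # exactly 1.0
```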
143  onn/layer_base.py  Normal file
@@ -0,0 +1,143 @@
import numpy as np

from collections import defaultdict, OrderedDict

from .weights import *

# used for numbering layers like Keras:
_layer_counters = defaultdict(lambda: 0)

class LayerIncompatibility(Exception):
    pass

class Layer:
    def __init__(self):
        self.parents = []
        self.children = []
        self.weights = OrderedDict()
        self.loss = None  # for activity regularizers
        self.input_shape = None
        self.output_shape = None
        kind = self.__class__.__name__
        global _layer_counters
        _layer_counters[kind] += 1
        self.name = "{}_{}".format(kind, _layer_counters[kind])
        self.unsafe = False  # disables assertions for better performance
        self.shared = False  # as in weight sharing

    def __str__(self):
        return self.name

    # methods we might want to override:

    def forward(self, X):
        raise NotImplementedError("unimplemented", self)

    def forward_deterministic(self, X):
        return self.forward(X)

    def backward(self, dY):
        raise NotImplementedError("unimplemented", self)

    def make_shape(self, parent):
        if self.input_shape == None:
            self.input_shape = parent.output_shape
        if self.output_shape == None:
            self.output_shape = self.input_shape

    def do_feed(self, child):
        self.children.append(child)

    def be_fed(self, parent):
        self.parents.append(parent)

    # TODO: better names for these (still)
    def _propagate(self, edges, deterministic):
        if not self.unsafe:
            assert len(edges) == 1, self
        if deterministic:
            return self.forward_deterministic(edges[0])
        else:
            return self.forward(edges[0])

    def _backpropagate(self, edges):
        if len(edges) == 1:
            return self.backward(edges[0])
        return sum((self.backward(dY) for dY in edges))

    # general utility methods:

    def is_compatible(self, parent):
        return np.all(self.input_shape == parent.output_shape)

    def feed(self, child):
        assert self.output_shape is not None, self
        child.make_shape(self)
        if not child.is_compatible(self):
            fmt = "{} is incompatible with {}: shape mismatch: {} vs. {}"
            raise LayerIncompatibility(fmt.format(self, child, self.output_shape, child.input_shape))
        self.do_feed(child)
        child.be_fed(self)
        return child

    def validate_input(self, X):
        assert X.shape[1:] == self.input_shape, (str(self), X.shape[1:], self.input_shape)

    def validate_output(self, Y):
        assert Y.shape[1:] == self.output_shape, (str(self), Y.shape[1:], self.output_shape)

    def _new_weights(self, name, **kwargs):
        w = Weights(**kwargs)
        assert name not in self.weights, name
        self.weights[name] = w
        return w

    def share(self, node):
        self.weights = node.weights  # TODO: this should be all it takes.
        for k, v in self.weights.items():
            vs = getattr(node, k)  # hack: key isn't necessarily attribute name!
            setattr(self, k, vs)
        self.shared = True

    def clear_grad(self):
        for name, w in self.weights.items():
            w.g[:] = 0

    @property
    def size(self):
        return sum((w.size for w in self.weights.values()))

    def init(self, allocator):
        ins, outs = self.input_shape[0], self.output_shape[0]
        for k, w in self.weights.items():
            w.allocate(ins, outs, allocator=allocator)

    def propagate(self, values, deterministic):
        if not self.unsafe:
            assert self.parents, self
        edges = []
        for parent in self.parents:
            if parent in values:
                X = values[parent]
                if not self.unsafe:
                    self.validate_input(X)
                edges.append(X)
        Y = self._propagate(edges, deterministic)
        if not self.unsafe:
            self.validate_output(Y)
        return Y

    def backpropagate(self, values):
        if not self.unsafe:
            assert self.children, self
        edges = []
        for child in self.children:
            if child in values:
                dY = values[child]
                if not self.unsafe:
                    self.validate_output(dY)
                edges.append(dY)
        dX = self._backpropagate(edges)
        if not self.unsafe:
            self.validate_input(dX)
        return dX
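`feed()` is the piece that builds the graph: it asks the child to derive its shapes from the parent (`make_shape`), checks compatibility, and records the parent/child edges. A minimal wiring sketch, assuming the package layout of this commit:

```python
# sketch only: feed() propagates shapes and records graph edges.
from onn.layer import Input, Flatten
from onn.activation import Relu

x = Input((4, 8))          # defines its own input/output shape
flat = x.feed(Flatten())   # make_shape() runs here; output becomes (32,)
act = flat.feed(Relu())    # Relu inherits the (32,) shape unchanged

print(flat.output_shape, act.output_shape)     # (32,) (32,)
print(x.children[0].name, flat.parents[0].name)  # Flatten_1 Input_1
```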
180  onn/learner.py  Normal file
@@ -0,0 +1,180 @@
from .floats import *
from .optimizer_base import *

class Learner:
    per_batch = False

    def __init__(self, optim, epochs=100, rate=None):
        assert isinstance(optim, Optimizer)
        self.optim = optim
        self.start_rate = rate  # None is okay; it'll use optim.lr instead.
        self.epochs = int(epochs)
        self.reset()

    def reset(self, optim=False):
        self.started = False
        self.epoch = 0
        if optim:
            self.optim.reset()

    @property
    def epoch(self):
        return self._epoch

    @epoch.setter
    def epoch(self, new_epoch):
        self._epoch = int(new_epoch)
        if 0 <= self.epoch <= self.epochs:
            self.rate = self.rate_at(self._epoch)

    @property
    def rate(self):
        return self.optim.lr

    @rate.setter
    def rate(self, new_rate):
        self.optim.lr = new_rate

    def rate_at(self, epoch):
        if self.start_rate is None:
            return self.optim.lr
        return self.start_rate

    def next(self):
        # prepares the next epoch. returns whether or not to continue training.
        if not self.started:
            self.started = True
        self.epoch += 1
        if self.epoch > self.epochs:
            return False
        return True

    def batch(self, progress):  # TODO: rename
        # interpolates rates between epochs.
        # unlike epochs, we do not store batch number as a state.
        # i.e. calling next() will not respect progress.
        assert 0 <= progress <= 1
        self.rate = self.rate_at(self._epoch + progress)

    @property
    def final_rate(self):
        return self.rate_at(self.epochs - 1e-8)

class AnnealingLearner(Learner):
    def __init__(self, optim, epochs=100, rate=None, halve_every=10):
        self.halve_every = _f(halve_every)
        self.anneal = _f(0.5**(1/self.halve_every))
        super().__init__(optim, epochs, rate)

    def rate_at(self, epoch):
        return super().rate_at(epoch) * self.anneal**epoch

def cosmod(x):
    # plot: https://www.desmos.com/calculator/hlgqmyswy2
    return (_1 + np.cos((x % _1) * _pi)) * _inv2

class SGDR(Learner):
    # Stochastic Gradient Descent with Restarts
    # paper: https://arxiv.org/abs/1608.03983
    # NOTE: this is missing a couple of the proposed features.

    per_batch = True

    def __init__(self, optim, epochs=100, rate=None,
                 restarts=0, restart_decay=0.5, callback=None,
                 expando=0):
        self.restart_epochs = int(epochs)
        self.decay = _f(restart_decay)
        self.restarts = int(restarts)
        self.restart_callback = callback

        # TODO: rename expando to something not insane
        self.expando = expando if expando is not None else lambda i: i
        if type(self.expando) == int:
            inc = self.expando
            self.expando = lambda i: i * inc

        self.splits = []
        epochs = 0
        for i in range(0, self.restarts + 1):
            split = epochs + self.restart_epochs + int(self.expando(i))
            self.splits.append(split)
            epochs = split
        super().__init__(optim, epochs, rate)

    def split_num(self, epoch):
        previous = [0] + self.splits
        for i, split in enumerate(self.splits):
            if epoch - 1 < split:
                sub_epoch = epoch - previous[i]
                next_restart = split - previous[i]
                return i, sub_epoch, next_restart
        raise Exception('this should never happen.')

    def rate_at(self, epoch):
        base_rate = self.start_rate if self.start_rate is not None else self.optim.lr
        restart, sub_epoch, next_restart = self.split_num(max(1, epoch))
        x = _f(sub_epoch - 1) / _f(next_restart)
        return base_rate * self.decay**_f(restart) * cosmod(x)

    def next(self):
        if not super().next():
            return False
        restart, sub_epoch, next_restart = self.split_num(self.epoch)
        if restart > 0 and sub_epoch == 1:
            if self.restart_callback is not None:
                self.restart_callback(restart)
        return True

class TriangularCLR(Learner):
    per_batch = True

    def __init__(self, optim, epochs=400, upper_rate=None, lower_rate=0,
                 frequency=100, callback=None):
        # NOTE: start_rate is treated as upper_rate
        self.frequency = int(frequency)
        assert self.frequency > 0
        self.callback = callback
        self.lower_rate = _f(lower_rate)
        super().__init__(optim, epochs, upper_rate)

    def _t(self, epoch):
        # NOTE: this could probably be simplified
        offset = self.frequency / 2
        return np.abs(((epoch - 1 + offset) % self.frequency) - offset) / offset

    def rate_at(self, epoch):
        upper_rate = self.start_rate if self.start_rate is not None else self.optim.lr
        return self._t(epoch) * (upper_rate - self.lower_rate) + self.lower_rate

    def next(self):
        if not super().next():
            return False
        e = self.epoch - 1
        if e > 0 and e % self.frequency == 0:
            if self.callback is not None:
                self.callback(self.epoch // self.frequency)
        return True

class SineCLR(TriangularCLR):
    def _t(self, epoch):
        return np.sin(_pi * _inv2 * super()._t(epoch))

class WaveCLR(TriangularCLR):
    def _t(self, epoch):
        return _inv2 * (_1 - np.cos(_pi * super()._t(epoch)))

# more

class PolyLearner(Learner):
    per_batch = True

    def __init__(self, optim, epochs=400, coeffs=(1,)):
        self.coeffs = tuple(coeffs)
        super().__init__(optim, epochs, np.polyval(self.coeffs, 0))

    def rate_at(self, epoch):
        progress = (epoch - 1) / (self.epochs)
        ret = np.polyval(self.coeffs, progress)
        return np.abs(ret)
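SGDR's per-epoch rate is `base_rate * decay**restart * cosmod(x)`, where `cosmod` sweeps from 1 down to 0 over each restart period. A minimal numeric sketch, assuming this package layout:

```python
# sketch only: the cosine curve SGDR uses within each restart period.
import numpy as np
from onn.learner import cosmod

for x in (0.0, 0.25, 0.5, 0.75, 0.999):
    print(x, cosmod(np.float32(x)))
# 0.0 -> 1.0, 0.25 -> ~0.854, 0.5 -> 0.5, 0.75 -> ~0.146, 0.999 -> ~0.0
```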
129  onn/loss.py  Normal file
@@ -0,0 +1,129 @@
import numpy as np

from .floats import *

class Loss:
    def forward(self, p, y):
        raise NotImplementedError("unimplemented", self)

    def backward(self, p, y):
        raise NotImplementedError("unimplemented", self)

class NLL(Loss):  # Negative Log Likelihood
    def forward(self, p, y):
        correct = p * y
        return np.mean(-correct)

    def backward(self, p, y):
        return -y / len(p)

class CategoricalCrossentropy(Loss):
    # lifted from theano

    def __init__(self, eps=1e-6):
        self.eps = _f(eps)

    def forward(self, p, y):
        p = np.clip(p, self.eps, 1 - self.eps)
        f = np.sum(-y * np.log(p) - (1 - y) * np.log(1 - p), axis=-1)
        return np.mean(f)

    def backward(self, p, y):
        p = np.clip(p, self.eps, 1 - self.eps)
        df = (p - y) / (p * (1 - p))
        return df / len(y)

class Accuracy(Loss):
    # returns percentage of categories correctly predicted.
    # utilizes argmax(), so it cannot be used for gradient descent.
    # use CategoricalCrossentropy or NLL for that instead.

    def forward(self, p, y):
        correct = np.argmax(p, axis=-1) == np.argmax(y, axis=-1)
        return np.mean(correct)

    def backward(self, p, y):
        raise NotImplementedError("cannot take the gradient of Accuracy")

class ResidualLoss(Loss):
    def forward(self, p, y):
        return np.mean(self.f(p - y))

    def backward(self, p, y):
        ret = self.df(p - y) / len(y)
        return ret

class SquaredHalved(ResidualLoss):
    def f(self, r):
        return np.square(r) / 2

    def df(self, r):
        return r

class Squared(ResidualLoss):
    def f(self, r):
        return np.square(r)

    def df(self, r):
        return 2 * r

class Absolute(ResidualLoss):
    def f(self, r):
        return np.abs(r)

    def df(self, r):
        return np.sign(r)

class Huber(ResidualLoss):
    def __init__(self, delta=1.0):
        self.delta = _f(delta)

    def f(self, r):
        return np.where(r <= self.delta,
                        np.square(r) / 2,
                        self.delta * (np.abs(r) - self.delta / 2))

    def df(self, r):
        return np.where(r <= self.delta,
                        r,
                        self.delta * np.sign(r))

# more

class SomethingElse(ResidualLoss):
    # generalizes Absolute and SquaredHalved.
    # plot: https://www.desmos.com/calculator/fagjg9vuz7
    def __init__(self, a=4/3):
        assert 1 <= a <= 2, "parameter out of range"
        self.a = _f(a / 2)
        self.b = _f(2 / a)
        self.c = _f(2 / a - 1)

    def f(self, r):
        return self.a * np.abs(r)**self.b

    def df(self, r):
        return np.sign(r) * np.abs(r)**self.c

class Confidence(Loss):
    # this isn't "confidence" in any meaningful way; (e.g. Bayesian)
    # it's just a metric of how large the value is of the predicted class.
    # when using it for loss, it acts like a crappy regularizer.
    # it really just measures how much of a hot-shot the network thinks it is.

    def forward(self, p, y=None):
        categories = p.shape[-1]
        confidence = (np.max(p, axis=-1) - 1/categories) / (1 - 1/categories)
        # the exponent in softmax puts a maximum on confidence,
        # but we don't compensate for that. if necessary,
        # it'd be better to use an activation that doesn't have this limit.
        return np.mean(confidence)

    def backward(self, p, y=None):
        # in order to agree with the forward pass,
        # using this backwards pass as-is will minimize confidence.
        categories = p.shape[-1]
        detc = p / categories / (1 - 1/categories)
        dmax = p == np.max(p, axis=-1, keepdims=True)
        return detc * dmax
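`SomethingElse` computes f(r) = (a/2)·|r|^(2/a), so its endpoints reproduce the two losses it generalizes: a = 1 gives r²/2 (SquaredHalved) and a = 2 gives |r| (Absolute). A minimal check, assuming this package layout:

```python
# sketch only: SomethingElse reduces to the two losses it generalizes.
import numpy as np
from onn.loss import SomethingElse, SquaredHalved, Absolute

r = np.linspace(-2, 2, 5).astype(np.float32)
print(SomethingElse(a=1).f(r), SquaredHalved().f(r))  # matches r**2 / 2
print(SomethingElse(a=2).f(r), Absolute().f(r))       # matches |r|
```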
14  onn/math.py  Normal file
@@ -0,0 +1,14 @@
import numpy as np

def rolling(a, window):
    # http://stackoverflow.com/a/4924433
    shape = (a.size - window + 1, window)
    strides = (a.itemsize, a.itemsize)
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)

def rolling_batch(a, window):
    # same as rolling, but acts on each batch (axis 0).
    shape = (a.shape[0], a.shape[-1] - window + 1, window)
    strides = (np.prod(a.shape[1:]) * a.itemsize, a.itemsize, a.itemsize)
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
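`rolling()` builds overlapping windows as a strided view over the original buffer, so no data is copied. A minimal sketch, assuming this package layout:

```python
# sketch only: rolling() yields overlapping windows of length `window`.
import numpy as np
from onn.math import rolling

a = np.arange(6, dtype=np.float32)
print(rolling(a, 3))
# [[0. 1. 2.]
#  [1. 2. 3.]
#  [2. 3. 4.]
#  [3. 4. 5.]]
```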
197  onn/model.py  Normal file
@@ -0,0 +1,197 @@
import sys

from .floats import *
from .nodes import *
from .layer_base import *

class Model:
    def __init__(self, nodes_in, nodes_out, loss=None, mloss=None, unsafe=False):
        self.loss = loss if loss is not None else SquaredHalved()
        self.mloss = mloss if mloss is not None else loss

        nodes_in = [nodes_in] if isinstance(nodes_in, Layer) else nodes_in
        nodes_out = [nodes_out] if isinstance(nodes_out, Layer) else nodes_out
        assert type(nodes_in) == list, type(nodes_in)
        assert type(nodes_out) == list, type(nodes_out)
        self.nodes_in = nodes_in
        self.nodes_out = nodes_out

        self.nodes = traverse_all(self.nodes_in, self.nodes_out)
        self.make_weights()
        for node in self.nodes:
            node.unsafe = unsafe
        # TODO: handle the same layer being in more than one node.

    @property
    def ordered_nodes(self):
        # deprecated? we don't guarantee an order like we did before.
        return self.nodes

    def make_weights(self):
        self.param_count = sum((node.size for node in self.nodes if not node.shared))
        self.W = np.zeros(self.param_count, dtype=_f)
        self.dW = np.zeros(self.param_count, dtype=_f)

        offset = 0
        for node in self.nodes:
            if node.size > 0 and not node.shared:
                inner_offset = 0

                def allocate(size):
                    nonlocal inner_offset
                    o = offset + inner_offset
                    ret = self.W[o:o+size], self.dW[o:o+size]
                    inner_offset += size
                    assert len(ret[0]) == len(ret[1])
                    assert size == len(ret[0]), (size, len(ret[0]))
                    return ret

                node.init(allocate)
                assert inner_offset <= node.size, "Layer {} allocated more weights than it said it would".format(node)
                # i don't care if "less" is grammatically incorrect.
                # you're mom is grammatically incorrect.
                assert inner_offset >= node.size, "Layer {} allocated less weights than it said it would".format(node)
                offset += node.size

    def evaluate(self, input_, deterministic=True):
        assert len(self.nodes_in) == 1, "ambiguous input in multi-input network; use evaluate_multi() instead"
        assert len(self.nodes_out) == 1, "ambiguous output in multi-output network; use evaluate_multi() instead"
        node_in = self.nodes_in[0]
        node_out = self.nodes_out[0]
        outputs = self.evaluate_multi({node_in: input_}, deterministic)
        return outputs[node_out]

    def apply(self, error):  # TODO: better name?
        assert len(self.nodes_in) == 1, "ambiguous input in multi-input network; use apply_multi() instead"
        assert len(self.nodes_out) == 1, "ambiguous output in multi-output network; use apply_multi() instead"
        node_in = self.nodes_in[0]
        node_out = self.nodes_out[0]
        inputs = self.apply_multi({node_out: error})
        return inputs[node_in]

    def evaluate_multi(self, inputs, deterministic=True):
        values = dict()
        outputs = dict()
        for node in self.nodes:
            if node in self.nodes_in:
                assert node in inputs, "missing input for node {}".format(node.name)
                X = inputs[node]
                values[node] = node._propagate(np.expand_dims(X, 0), deterministic)
            else:
                values[node] = node.propagate(values, deterministic)
            if node in self.nodes_out:
                outputs[node] = values[node]
        return outputs

    def apply_multi(self, outputs):
        values = dict()
        inputs = dict()
        for node in reversed(self.nodes):
            if node in self.nodes_out:
                assert node in outputs, "missing output for node {}".format(node.name)
                X = outputs[node]
                values[node] = node._backpropagate(np.expand_dims(X, 0))
            else:
                values[node] = node.backpropagate(values)
            if node in self.nodes_in:
                inputs[node] = values[node]
        return inputs

    def forward(self, inputs, outputs, measure=False, deterministic=False):
        predicted = self.evaluate(inputs, deterministic=deterministic)
        if measure:
            error = self.mloss.forward(predicted, outputs)
        else:
            error = self.loss.forward(predicted, outputs)
        return error, predicted

    def backward(self, predicted, outputs, measure=False):
        if measure:
            error = self.mloss.backward(predicted, outputs)
        else:
            error = self.loss.backward(predicted, outputs)
        # input_delta is rarely useful; it's just to match the forward pass.
        input_delta = self.apply(error)
        return self.dW, input_delta

    def clear_grad(self):
        for node in self.nodes:
            node.clear_grad()

    def regulate_forward(self):
        loss = _0
        for node in self.nodes:
            if node.loss is not None:
                loss += node.loss
            for k, w in node.weights.items():
                loss += w.forward()
        return loss

    def regulate(self):
        for node in self.nodes:
            for k, w in node.weights.items():
                w.update()

    def load_weights(self, fn):
        # seemingly compatible with keras' Dense layers.
        import h5py
        open(fn)  # just ensure the file exists (python's error is better)
        f = h5py.File(fn, 'r')
        weights = {}
        def visitor(name, obj):
            if isinstance(obj, h5py.Dataset):
                weights[name.split('/')[-1]] = np.array(obj[:], dtype=_f)
        f.visititems(visitor)
        f.close()

        used = {}
        for k in weights.keys():
            used[k] = False

        nodes = [node for node in self.nodes if node.size > 0]
        # TODO: support shared weights.
        for node in nodes:
            full_name = str(node).lower()
            for s_name, o_name in node.serialized.items():
                key = full_name + '_' + s_name
                data = weights[key]
                target = getattr(node, o_name)
                target.f[:] = data
                used[key] = True

        for k, v in used.items():
            if not v:
                lament("WARNING: unused weight", k)

    def save_weights(self, fn, overwrite=False):
        import h5py
        from collections import defaultdict

        f = h5py.File(fn, 'w')

        counts = defaultdict(lambda: 0)

        nodes = [node for node in self.nodes if node.size > 0]
        # TODO: support shared weights.
        for node in nodes:
            full_name = str(node).lower()
            grp = f.create_group(full_name)
            for s_name, o_name in node.serialized.items():
                key = full_name + '_' + s_name
                target = getattr(node, o_name)
                data = grp.create_dataset(key, target.shape, dtype=_f)
                data[:] = target.f
                counts[key] += 1
                if counts[key] > 1:
                    lament("WARNING: rewrote weight", key)

        f.close()

    def print_graph(self, file=sys.stdout):
        print('digraph G {', file=file)
        for node in self.nodes:
            children = [str(n) for n in node.children]
            if children:
                sep = '->'
                print('\t' + str(node) + sep + (';\n\t' + str(node) + sep).join(children) + ';', file=file)
        print('}', file=file)
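`Model` ties the pieces together: it traverses the layer graph, flattens all weights into one `W`/`dW` pair, and drives forward/backward passes through a chosen loss. A minimal end-to-end sketch using only parameter-free layers from this commit (import paths assumed from the package layout above):

```python
# sketch only: a tiny parameter-free graph pushed through Model.
import numpy as np
from onn.layer import Input
from onn.activation import Relu
from onn.loss import SquaredHalved
from onn.model import Model

x = Input((8,))
y = x.feed(Relu())
model = Model(x, y, loss=SquaredHalved(), mloss=SquaredHalved())

data = np.random.randn(4, 8).astype(np.float32)
target = np.zeros_like(data)
err, pred = model.forward(data, target)
dW, _ = model.backward(pred, target)   # dW is empty here; Relu has no weights
print(err, pred.shape, dW.shape)
```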
43  onn/nodes.py  Normal file
@@ -0,0 +1,43 @@
class DummyNode:
    name = "Dummy"

    def __init__(self, children=None, parents=None):
        self.children = children if children is not None else []
        self.parents = parents if parents is not None else []

def traverse(node_in, node_out, nodes=None, dummy_mode=False):
    # i have no idea if this is any algorithm in particular.
    nodes = nodes if nodes is not None else []

    seen_up = {}
    q = [node_out]
    while len(q) > 0:
        node = q.pop(0)
        seen_up[node] = True
        for parent in node.parents:
            q.append(parent)

    if dummy_mode:
        seen_up[node_in] = True

    nodes = []
    q = [node_in]
    while len(q) > 0:
        node = q.pop(0)
        if not seen_up[node]:
            continue
        parents_added = (parent in nodes for parent in node.parents)
        if not node in nodes and all(parents_added):
            nodes.append(node)
        for child in node.children:
            q.append(child)

    if dummy_mode:
        nodes.remove(node_in)

    return nodes

def traverse_all(nodes_in, nodes_out, nodes=None):
    all_in = DummyNode(children=nodes_in)
    all_out = DummyNode(parents=nodes_out)
    return traverse(all_in, all_out, nodes, dummy_mode=True)
531
onn/optimizer.py
Normal file
531
onn/optimizer.py
Normal file
|
@ -0,0 +1,531 @@
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from .floats import *
|
||||||
|
from .optimizer_base import *
|
||||||
|
|
||||||
|
# some of the the following optimizers are blatantly lifted from tiny-dnn:
|
||||||
|
# https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/optimizers/optimizer.h
|
||||||
|
|
||||||
|
class Momentum(Optimizer):
|
||||||
|
def __init__(self, lr=0.01, mu=0.9, nesterov=False):
|
||||||
|
self.mu = _f(mu) # momentum
|
||||||
|
self.nesterov = bool(nesterov)
|
||||||
|
|
||||||
|
super().__init__(lr)
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
self.Vprev = None
|
||||||
|
|
||||||
|
def compute(self, dW, W):
|
||||||
|
if self.Vprev is None:
|
||||||
|
self.Vprev = np.copy(dW)
|
||||||
|
|
||||||
|
V = self.mu * self.Vprev - self.lr * dW
|
||||||
|
self.Vprev[:] = V
|
||||||
|
if self.nesterov:
|
||||||
|
return self.mu * V - self.lr * dW
|
||||||
|
|
||||||
|
return V
|
||||||
|
|
||||||
|
class Adagrad(Optimizer):
|
||||||
|
def __init__(self, lr=0.01, eps=1e-8):
|
||||||
|
self.eps = _f(eps)
|
||||||
|
|
||||||
|
super().__init__(lr)
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
self.g = None
|
||||||
|
|
||||||
|
def compute(self, dW, W):
|
||||||
|
if self.g is None:
|
||||||
|
self.g = np.zeros_like(dW)
|
||||||
|
|
||||||
|
self.g += np.square(dW)
|
||||||
|
return -self.lr * dW / (np.sqrt(self.g) + self.eps)
|
||||||
|
|
||||||
|
class RMSprop(Optimizer):
|
||||||
|
# RMSprop generalizes* Adagrad, etc.
|
||||||
|
|
||||||
|
# * RMSprop == Adagrad when
|
||||||
|
# RMSprop.mu == 1
|
||||||
|
|
||||||
|
def __init__(self, lr=1e-4, mu=0.99, eps=1e-8):
|
||||||
|
self.mu = _f(mu) # decay term
|
||||||
|
self.eps = _f(eps)
|
||||||
|
|
||||||
|
# one might consider the following equation when specifying mu:
|
||||||
|
# mu = e**(-1/t)
|
||||||
|
# default: t = -1/ln(0.99) = ~99.5
|
||||||
|
# therefore the default of mu=0.99 means
|
||||||
|
# an input decays to 1/e its original amplitude over 99.5 batches.
|
||||||
|
# (this is from DSP, so how relevant it is in SGD is debatable)
|
||||||
|
|
||||||
|
super().__init__(lr)
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
self.g = None
|
||||||
|
|
||||||
|
def compute(self, dW, W):
|
||||||
|
if self.g is None:
|
||||||
|
self.g = np.zeros_like(dW)
|
||||||
|
|
||||||
|
# basically apply a first-order low-pass filter to delta squared
|
||||||
|
self.g += (1 - self.mu) * (np.square(dW) - self.g)
|
||||||
|
|
||||||
|
# finally sqrt it to complete the running root-mean-square approximation
|
||||||
|
return -self.lr * dW / (np.sqrt(self.g) + self.eps)
|
||||||
|
|
||||||
|
class RMSpropCentered(Optimizer):
|
||||||
|
# referenced TensorFlow/PyTorch.
|
||||||
|
# paper: https://arxiv.org/pdf/1308.0850v5.pdf
|
||||||
|
|
||||||
|
def __init__(self, lr=1e-4, aleph=0.95, momentum=0.9, eps=1e-8):
|
||||||
|
self.aleph = _f(aleph)
|
||||||
|
self.momentum = _f(momentum)
|
||||||
|
self.eps = _f(eps)
|
||||||
|
|
||||||
|
super().__init__(lr)
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
self.g = None
|
||||||
|
self.mt = None
|
||||||
|
self.vt = None
|
||||||
|
self.delta = None
|
||||||
|
|
||||||
|
def compute(self, dW, W):
|
||||||
|
if self.g is None:
|
||||||
|
self.g = np.zeros_like(dW)
|
||||||
|
if self.mt is None:
|
||||||
|
self.mt = np.zeros_like(dW)
|
||||||
|
if self.vt is None:
|
||||||
|
self.vt = np.zeros_like(dW)
|
||||||
|
if self.delta is None:
|
||||||
|
self.delta = np.zeros_like(dW)
|
||||||
|
|
||||||
|
self.mt += (1 - self.aleph) * (dW - self.mt)
|
||||||
|
self.vt += (1 - self.aleph) * (np.square(dW) - self.vt)
|
||||||
|
|
||||||
|
# PyTorch has the epsilon outside of the sqrt,
|
||||||
|
# TensorFlow and the paper have it within.
|
||||||
|
# in onn, we generally do it outside, as this seems to work better.
|
||||||
|
temp = dW / (np.sqrt(self.vt - np.square(self.mt)) + self.eps)
|
||||||
|
|
||||||
|
# TensorFlow does it this way.
|
||||||
|
self.delta[:] = self.momentum * self.delta + self.lr * temp
|
||||||
|
return -self.delta
|
||||||
|
# PyTorch does it this way.
|
||||||
|
#self.delta[:] = self.momentum * self.delta + temp
|
||||||
|
#return -self.lr * self.delta
|
||||||
|
# they are equivalent only when LR is constant, which it might not be.
|
||||||
|
|
||||||
|
class Adam(Optimizer):
|
||||||
|
# paper: https://arxiv.org/abs/1412.6980
|
||||||
|
# Adam generalizes* RMSprop, and
|
||||||
|
# adds a decay term to the regular (non-squared) delta, and performs
|
||||||
|
# debiasing to compensate for the filtered deltas starting from zero.
|
||||||
|
|
||||||
|
# * Adam == RMSprop when
|
||||||
|
# Adam.b1 == 0
|
||||||
|
# Adam.b2 == RMSprop.mu
|
||||||
|
|
||||||
|
def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8):
|
||||||
|
self.b1 = _f(b1) # decay term
|
||||||
|
self.b2 = _f(b2) # decay term
|
||||||
|
self.b1_t_default = _f(b1) # decay term power t
|
||||||
|
self.b2_t_default = _f(b2) # decay term power t
|
||||||
|
self.eps = _f(eps)
|
||||||
|
|
||||||
|
super().__init__(lr)
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
self.mt = None
|
||||||
|
self.vt = None
|
||||||
|
self.b1_t = self.b1_t_default
|
||||||
|
self.b2_t = self.b2_t_default
|
||||||
|
|
||||||
|
def compute(self, dW, W):
|
||||||
|
if self.mt is None:
|
||||||
|
self.mt = np.zeros_like(dW)
|
||||||
|
if self.vt is None:
|
||||||
|
self.vt = np.zeros_like(dW)
|
||||||
|
|
||||||
|
# decay gain
|
||||||
|
self.b1_t *= self.b1
|
||||||
|
self.b2_t *= self.b2
|
||||||
|
|
||||||
|
# filter
|
||||||
|
self.mt += (1 - self.b1) * (dW - self.mt)
|
||||||
|
self.vt += (1 - self.b2) * (np.square(dW) - self.vt)
|
||||||
|
|
||||||
|
return -self.lr * (self.mt / (1 - self.b1_t)) \
|
||||||
|
/ (np.sqrt(self.vt / (1 - self.b2_t)) + self.eps)
|
||||||
|
|
||||||
|
class Nadam(Optimizer):
|
||||||
|
# paper: https://arxiv.org/abs/1412.6980
|
||||||
|
# paper: http://cs229.stanford.edu/proj2015/054_report.pdf
|
||||||
|
# TODO: double-check this implementation. also read the damn paper.
|
||||||
|
# lifted from https://github.com/fchollet/keras/blob/5d38b04/keras/optimizers.py#L530
|
||||||
|
# lifted from https://github.com/jpilaul/IFT6266_project/blob/master/Models/Algo_Momentum.py
|
||||||
|
|
||||||
|
def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8):
|
||||||
|
self.b1 = _f(b1) # decay term
|
||||||
|
self.b2 = _f(b2) # decay term
|
||||||
|
self.eps = _f(eps)
|
||||||
|
|
||||||
|
super().__init__(lr)
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
self.mt = None
|
||||||
|
self.vt = None
|
||||||
|
self.t = 0
|
||||||
|
self.sched = 1
|
||||||
|
|
||||||
|
def compute(self, dW, W):
|
||||||
|
self.t += 1
|
||||||
|
|
||||||
|
if self.mt is None:
|
||||||
|
self.mt = np.zeros_like(dW)
|
||||||
|
if self.vt is None:
|
||||||
|
self.vt = np.zeros_like(dW)
|
||||||
|
|
||||||
|
ut0 = self.b1 * (1 - 0.5 * 0.96**(self.t + 0))
|
||||||
|
ut1 = self.b1 * (1 - 0.5 * 0.96**(self.t + 1))
|
||||||
|
|
||||||
|
sched0 = self.sched * ut0
|
||||||
|
sched1 = self.sched * ut0 * ut1
|
||||||
|
self.sched = sched0
|
||||||
|
|
||||||
|
gp = dW / (1 - sched0)
|
||||||
|
|
||||||
|
self.mt += (1 - self.b1) * (dW - self.mt)
|
||||||
|
self.vt += (1 - self.b2) * (np.square(dW) - self.vt)
|
||||||
|
|
||||||
|
mtp = self.mt / (1 - sched1)
|
||||||
|
vtp = self.vt / (1 - self.b2**self.t)
|
||||||
|
|
||||||
|
mt_bar = (1 - ut0) * gp + ut1 * mtp
|
||||||
|
|
||||||
|
return -self.lr * mt_bar / (np.sqrt(vtp) + self.eps)
|
||||||
|
|
||||||
|
# more
|
||||||
|
|
||||||
|
class FTML(Optimizer):
|
||||||
|
# paper: http://www.cse.ust.hk/~szhengac/papers/icml17.pdf
|
||||||
|
# author's implementation: https://github.com/szhengac/optim/commit/923555e
|
||||||
|
|
||||||
|
def __init__(self, lr=0.0025, b1=0.6, b2=0.999, eps=1e-8):
|
||||||
|
self.iterations = _0
|
||||||
|
self.b1 = _f(b1) # decay term
|
||||||
|
self.b2 = _f(b2) # decay term
|
||||||
|
self.eps = _f(eps)
|
||||||
|
|
||||||
|
super().__init__(lr)
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
self.dt1 = None
|
||||||
|
self.dt = None
|
||||||
|
self.vt = None
|
||||||
|
self.zt = None
|
||||||
|
self.b1_t = _1
|
||||||
|
self.b2_t = _1
|
||||||
|
|
||||||
|
def compute(self, dW, W):
|
||||||
|
if self.dt1 is None: self.dt1 = np.zeros_like(dW)
|
||||||
|
if self.dt is None: self.dt = np.zeros_like(dW)
|
||||||
|
if self.vt is None: self.vt = np.zeros_like(dW)
|
||||||
|
if self.zt is None: self.zt = np.zeros_like(dW)
|
||||||
|
|
||||||
|
# NOTE: we could probably rewrite these equations to avoid this copy.
|
||||||
|
self.dt1[:] = self.dt[:]
|
||||||
|
|
||||||
|
self.b1_t *= self.b1
|
||||||
|
self.b2_t *= self.b2
|
||||||
|
|
||||||
|
# hardly an elegant solution.
|
||||||
|
lr = max(self.lr, self.eps)
|
||||||
|
|
||||||
|
# same as Adam's vt.
|
||||||
|
self.vt[:] = self.b2 * self.vt + (1 - self.b2) * dW * dW
|
||||||
|
|
||||||
|
# you can factor "inner" out of Adam as well.
|
||||||
|
inner = np.sqrt(self.vt / (1 - self.b2_t)) + self.eps
|
||||||
|
self.dt[:] = (1 - self.b1_t) / lr * inner
|
||||||
|
|
||||||
|
sigma_t = self.dt - self.b1 * self.dt1
|
||||||
|
|
||||||
|
# Adam's mt minus the sigma term.
|
||||||
|
self.zt[:] = self.b1 * self.zt + (1 - self.b1) * dW - sigma_t * W
|
||||||
|
|
||||||
|
# subtract by weights to avoid having to override self.update.
|
||||||
|
return -self.zt / self.dt - W
|
||||||
|
|
||||||
|
class MomentumClip(Optimizer):
|
||||||
|
def __init__(self, lr=0.01, mu=0.9, nesterov=False, clip=1.0, debug=False):
|
||||||
|
self.mu = _f(mu)
|
||||||
|
self.clip = _f(clip)
|
||||||
|
self.nesterov = bool(nesterov)
|
||||||
|
self.debug = bool(debug)
|
||||||
|
|
||||||
|
super().__init__(lr)
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
self.accum = None
|
||||||
|
|
||||||
|
def compute(self, dW, W):
|
||||||
|
if self.accum is None:
|
||||||
|
self.accum = np.zeros_like(dW)
|
||||||
|
|
||||||
|
total_norm = np.linalg.norm(dW)
|
||||||
|
clip_scale = self.clip / (total_norm + 1e-6)
|
||||||
|
if clip_scale < 1:
|
||||||
|
if self.debug:
|
||||||
|
lament("clipping gradients; norm: {:10.5f}".format(total_norm))
|
||||||
|
dW *= clip_scale
|
||||||
|
|
||||||
|
self.accum[:] = self.accum * self.mu + dW
|
||||||
|
if self.nesterov:
|
||||||
|
return -self.lr * (self.accum * self.mu + dW)
|
||||||
|
else:
|
||||||
|
return -self.lr * self.accum
|
||||||
|
|
||||||
|
class YellowFin(Optimizer):
    # paper: https://arxiv.org/abs/1706.03471
    # knowyourmeme: http://cs.stanford.edu/~zjian/project/YellowFin/
    # author's implementation: https://github.com/JianGoForIt/YellowFin/blob/master/tuner_utils/yellowfin.py
    # code lifted: https://gist.github.com/botev/f8b32c00eafee222e47393f7f0747666

    def __init__(self, lr=0.1, mu=0.0, beta=0.999, window_size=20,
                 debias=True, clip=1.0):
        self.lr_default = _f(lr)
        self.mu_default = _f(mu)
        self.beta = _f(beta)
        self.window_size = int(window_size)  # curv_win_width
        self.debias_enabled = bool(debias)
        self.clip = _f(clip)

        self.mu = _f(mu)  # momentum
        super().__init__(lr)

    def reset(self):
        self.accum = None

        self.lr = self.lr_default
        self.mu = self.mu_default

        self.step = 0
        self.beta_t = self.beta

        self.curv_win = np.zeros([self.window_size,], dtype=np.float32)

        self.h_min = None
        self.h_max = None

        self.g_lpf = 0
        #self.g_squared_lpf = 0
        self.g_norm_squared_lpf = 0
        self.g_norm_lpf = 0
        self.h_min_lpf = 0
        self.h_max_lpf = 0
        self.dist_lpf = 0
        self.lr_lpf = 0
        self.mu_lpf = 0

    def get_lr_mu(self):
        p = (np.square(self.dist_avg) * np.square(self.h_min)) / (2 * self.g_var)
        w3 = p * (np.sqrt(0.25 + p / 27.0) - 0.5)
        w = np.power(w3, 1/3)
        y = w - p / (3 * w)
        sqrt_mu1 = y + 1

        sqrt_h_min = np.sqrt(self.h_min)
        sqrt_h_max = np.sqrt(self.h_max)
        sqrt_mu2 = (sqrt_h_max - sqrt_h_min) / (sqrt_h_max + sqrt_h_min)

        sqrt_mu = max(sqrt_mu1, sqrt_mu2)
        if sqrt_mu2 > sqrt_mu1:
            print('note: taking dr calculation. something may have exploded.')

        lr = np.square(1 - sqrt_mu) / self.h_min
        mu = np.square(sqrt_mu)
        return lr, mu

    def compute(self, dW, W):
        if self.accum is None:
            self.accum = np.zeros_like(dW)

        # TODO: prevent allocations everywhere by using [:].
        #       assuming that really works. i haven't actually checked.

        total_norm = np.linalg.norm(dW)
        clip_scale = self.clip / (total_norm + 1e-6)
        if clip_scale < 1:
            #print("clipping gradients; norm: {:10.5f}".format(total_norm))
            dW *= clip_scale

        #fmt = 'W std: {:10.7f}e-3, dW std: {:10.7f}e-3, V std: {:10.7f}e-3'
        #print(fmt.format(np.std(W), np.std(dW) * 100, np.std(V) * 100))

        b = self.beta
        m1b = 1 - self.beta
        debias = 1 / (1 - self.beta_t) if self.debias_enabled else 1

        g = dW
        g_squared = np.square(g)
        g_norm_squared = np.sum(g_squared)
        g_norm = np.sqrt(g_norm_squared)

        self.curv_win[self.step % self.window_size] = g_norm_squared
        valid_window = self.curv_win[:min(self.window_size, self.step + 1)]
        h_min_t = np.min(valid_window)
        h_max_t = np.max(valid_window)

        self.g_lpf = b * self.g_lpf + m1b * g
        #self.g_squared_lpf = b * self.g_squared_lpf + m1b * g_squared
        self.g_norm_squared_lpf = b * self.g_norm_squared_lpf + m1b * g_norm_squared
        self.g_norm_lpf = b * self.g_norm_lpf + m1b * g_norm
        self.h_min_lpf = b * self.h_min_lpf + m1b * h_min_t
        self.h_max_lpf = b * self.h_max_lpf + m1b * h_max_t

        g_avg = debias * self.g_lpf
        #g_squared_avg = debias * self.g_squared_lpf
        g_norm_squared_avg = debias * self.g_norm_squared_lpf
        g_norm_avg = debias * self.g_norm_lpf
        self.h_min = debias * self.h_min_lpf
        self.h_max = debias * self.h_max_lpf
        assert self.h_max >= self.h_min

        dist = g_norm_avg / g_norm_squared_avg

        self.dist_lpf = b * self.dist_lpf + m1b * dist

        self.dist_avg = debias * self.dist_lpf

        self.g_var = g_norm_squared_avg - np.sum(np.square(g_avg))
        # equivalently:
        #self.g_var = np.sum(np.abs(g_squared_avg - np.square(g_avg)))

        if self.step > 0:
            lr_for_real, mu_for_real = self.get_lr_mu()
            self.mu_lpf = b * self.mu_lpf + m1b * mu_for_real
            self.lr_lpf = b * self.lr_lpf + m1b * lr_for_real
            self.mu = debias * self.mu_lpf
            self.lr = debias * self.lr_lpf

        self.accum[:] = self.accum * self.mu - self.lr * dW
        V = self.accum

        self.step += 1
        self.beta_t *= self.beta
        return V

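The `debias` factor above is the same early-step correction Adam uses: an exponential moving average that starts at zero underestimates at first, and dividing by `1 - beta**t` compensates exactly. A quick numeric sketch (not library code):

```python
import numpy as np

beta = 0.999
lpf, beta_t = 0.0, 1.0
signal = 3.0  # pretend the quantity being averaged is constant
for t in range(1, 6):
    beta_t *= beta
    lpf = beta * lpf + (1 - beta) * signal
    print(t, lpf, lpf / (1 - beta_t))  # raw lpf creeps toward 3.0; debiased is 3.0 immediately
```
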
class AddSign(Optimizer):
    # paper: https://arxiv.org/abs/1709.07417

    def __init__(self, lr=0.01, mu=0.9, alpha=1):
        self.mu = _f(mu)
        self.alpha = _f(alpha)

        super().__init__(lr)

    def reset(self):
        self.accum = None

    def compute(self, dW, W):
        if self.accum is None:
            self.accum = np.zeros_like(dW)

        self.accum[:] = self.accum * self.mu + dW

        signed = np.sign(dW) * np.sign(self.accum)
        #signed *= decay

        return -self.lr * dW * (self.alpha + signed)

class PowerSign(Optimizer):
    # paper: https://arxiv.org/abs/1709.07417

    def __init__(self, lr=0.01, mu=0.9, alpha=np.e):
        self.mu = _f(mu)
        self.alpha = _f(alpha)
        self.use_exp = np.isclose(self.alpha, _f(np.e))

        super().__init__(lr)

    def reset(self):
        self.accum = None

    def compute(self, dW, W):
        if self.accum is None:
            self.accum = np.zeros_like(dW)

        self.accum[:] = self.accum * self.mu + dW

        signed = np.sign(dW) * np.sign(self.accum)
        #signed *= decay

        if self.use_exp:
            return -self.lr * dW * np.exp(signed)
        else:
            return -self.lr * dW * np.power(self.alpha, signed)

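Both AddSign and PowerSign scale the step by whether the gradient agrees in sign with the momentum accumulator. A standalone sketch (not library code) of that term:

```python
import numpy as np

dW    = np.array([ 0.5, -0.2,  0.1])
accum = np.array([ 0.4,  0.3,  0.2])   # pretend momentum so far

signed = np.sign(dW) * np.sign(accum)  # -> [ 1., -1.,  1.]
print(-0.01 * dW * (1 + signed))       # AddSign, alpha=1: the disagreeing entry gets zero step
print(-0.01 * dW * np.exp(signed))     # PowerSign: agreeing entries scaled by e, disagreeing by 1/e
```
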
class Neumann(Optimizer):
    # paper: https://arxiv.org/abs/1712.03298
    # NOTE: this implementation is missing resetting as described in the paper.
    #       resetting is totally disabled for now.
    # NOTE: this implementation does not use vanilla SGD for its first epochs.
    #       you should do this yourself if you need it.
    #       it seems like using a Learner like SineCLR makes this unnecessary.

    def __init__(self, lr=0.01):
        self.alpha = _f(1e-7)  # cubic.
        self.beta = _f(1e-5)   # repulsive. NOTE: multiplied by len(dW) later.
        self.gamma = _f(0.99)  # EMA, or 1-pole low-pass parameter. same thing.
        # momentum is ∝ (in the shape of) 1 - 1/(1 + t)
        self.mu_min = _f(0.5)
        self.mu_max = _f(0.9)
        self.reset_period = 0  # TODO

        super().__init__(lr)

    def reset(self):
        # NOTE: mt and vt are different than the pair in Adam-like optimizers.
        self.mt = None  # momentum accumulator.
        self.vt = None  # weight accumulator.
        self.t = 0

    def compute(self, dW, W):
        raise Exception("compute() is not available for this Optimizer.")

    def update(self, dW, W):
        self.t += 1

        if self.mt is None:
            self.mt = np.zeros_like(dW)
        if self.vt is None:
            self.vt = np.zeros_like(dW)

        if self.reset_period > 0 and (self.t - 1) % self.reset_period == 0:
            self.mt = -self.lr * dW
            return

        # momentum quantity:
        mu = _1 - _1/_f(self.t)  # the + 1 is implicit.
        mu = (mu + self.mu_min) * (self.mu_max - self.mu_min)

        # smoothed change in weights:
        delta = W - self.vt
        delta_norm_squared = np.square(delta).sum()
        delta_norm = np.sqrt(delta_norm_squared)

        # regularization terms: (push and pull)
        cubic_reg = self.alpha * delta_norm_squared
        repulsive_reg = self.beta * dW.size / delta_norm_squared
        dt = dW + (cubic_reg - repulsive_reg) * (delta / delta_norm)

        # plain momentum:
        self.mt = mu * self.mt - self.lr * dt

        # weights and accumulator:
        W += mu * self.mt - self.lr * dt
        self.vt = W + self.gamma * (self.vt - W)

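A toy run (not from the repo) of the Optimizer contract every class above shares: `update(dW, W)` adds `compute()`'s step to the flat weight array in place. The import assumes these classes are re-exported by onn/__init__.py as in this commit; `clip` is set high enough that clipping never triggers on this tiny problem.

```python
import numpy as np
from onn import MomentumClip

W = np.array([3.0, -2.0], dtype=np.float32)
opt = MomentumClip(lr=0.1, mu=0.9, clip=100.0)
for _ in range(200):
    dW = 2 * W              # gradient of the toy objective sum(W**2)
    opt.update(dW, W)
print(W)                    # should be near the minimum at [0, 0]
```
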
19
onn/optimizer_base.py
Normal file
@@ -0,0 +1,19 @@
import numpy as np

from .floats import *

class Optimizer:
    def __init__(self, lr=0.1):
        self.lr = _f(lr)  # learning rate
        self.reset()

    def reset(self):
        pass

    def compute(self, dW, W):
        return -self.lr * dW

    def update(self, dW, W):
        W += self.compute(dW, W)

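The base class is small enough that subclassing is the whole extension story: override `reset()` for state and `compute()` for the step. A minimal sketch of a subclass, assuming it were added to this same module (the name `SGDDecay` and the `decay` parameter are made up for illustration):

```python
# hypothetical subclass, not part of the package
class SGDDecay(Optimizer):
    def __init__(self, lr=0.1, decay=1e-4):
        self.decay = _f(decay)
        super().__init__(lr)

    def compute(self, dW, W):
        # plain SGD step plus a simple weight-decay term.
        return -self.lr * (dW + self.decay * W)
```
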
254
onn/parametric.py
Normal file
@@ -0,0 +1,254 @@
import numpy as np

from .floats import *
from .layer_base import *
from .initialization import *

class Bias(Layer):
    # TODO: support axes other than -1 and shapes other than 1D.

    serialized = {
        'b': 'biases',
    }

    def __init__(self, init=init_zeros, reg_b=None):
        super().__init__()
        self.biases = self._new_weights('biases', init=init, regularizer=reg_b)

    def make_shape(self, parent):
        shape = parent.output_shape
        self.input_shape = shape
        self.output_shape = shape
        self.biases.shape = (shape[-1],)

    def forward(self, X):
        return X + self.biases.f

    def backward(self, dY):
        self.biases.g += dY.sum(0)
        return dY

class Dense(Layer):
    serialized = {
        'W': 'coeffs',
        'b': 'biases',
    }

    def __init__(self, dim, init=init_he_uniform, reg_w=None, reg_b=None):
        super().__init__()
        self.dim = int(dim)
        self.output_shape = (dim,)
        self.coeffs = self._new_weights('coeffs', init=init, regularizer=reg_w)
        self.biases = self._new_weights('biases', init=init_zeros, regularizer=reg_b)

    def make_shape(self, parent):
        shape = parent.output_shape
        self.input_shape = shape
        assert len(shape) == 1, shape
        self.coeffs.shape = (shape[0], self.dim)
        self.biases.shape = (1, self.dim)

    def forward(self, X):
        self.X = X
        return X @ self.coeffs.f + self.biases.f

    def backward(self, dY):
        self.coeffs.g += self.X.T @ dY
        self.biases.g += dY.sum(0, keepdims=True)
        return dY @ self.coeffs.f.T

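A shape sketch (plain numpy, outside the library) of Dense's forward and backward passes for a batch of 4 inputs with 3 features and `dim=2`:

```python
import numpy as np

X  = np.random.randn(4, 3).astype(np.float32)
W  = np.random.randn(3, 2).astype(np.float32)   # plays the role of coeffs.f
b  = np.zeros((1, 2), dtype=np.float32)         # plays the role of biases.f
Y  = X @ W + b                                  # (4, 2)
dY = np.ones_like(Y)
dW = X.T @ dY                                   # (3, 2), what coeffs.g accumulates
db = dY.sum(0, keepdims=True)                   # (1, 2), what biases.g accumulates
dX = dY @ W.T                                   # (4, 3), returned to the parent layer
print(Y.shape, dW.shape, db.shape, dX.shape)
```
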
# more

class Conv1Dper(Layer):
    # periodic (circular) convolution.
    # currently only supports one channel I/O.
    # some notes:
    # we could use FFTs for larger convolutions.
    # i think storing the coefficients backwards would
    # eliminate reversal in the critical code.

    serialized = {
        'W': 'coeffs',
    }

    def __init__(self, kernel_size, pos=None,
                 init=init_glorot_uniform, reg_w=None):
        super().__init__()
        self.kernel_size = int(kernel_size)
        self.coeffs = self._new_weights('coeffs', init=init, regularizer=reg_w)
        if pos is None:
            self.wrap0 = (self.kernel_size - 0) // 2
            self.wrap1 = (self.kernel_size - 1) // 2
        elif pos == 'alt':
            self.wrap0 = (self.kernel_size - 1) // 2
            self.wrap1 = (self.kernel_size - 0) // 2
        elif pos == 'left':
            self.wrap0 = 0
            self.wrap1 = self.kernel_size - 1
        elif pos == 'right':
            self.wrap0 = self.kernel_size - 1
            self.wrap1 = 0
        else:
            raise Exception("pos parameter not understood: {}".format(pos))

    def make_shape(self, parent):
        shape = parent.output_shape
        self.input_shape = shape
        assert len(shape) == 1, shape
        self.output_shape = shape
        self.coeffs.shape = (1, self.kernel_size)

    def forward(self, X):
        if self.wrap0 == 0:
            Xper = np.hstack((X, X[:, :self.wrap1]))
        elif self.wrap1 == 0:
            Xper = np.hstack((X[:, -self.wrap0:], X))
        else:
            Xper = np.hstack((X[:, -self.wrap0:], X, X[:, :self.wrap1]))
        self.cols = rolling_batch(Xper, self.kernel_size)
        convolved = (self.cols * self.coeffs.f[:, ::-1]).sum(2)
        return convolved

    def backward(self, dY):
        self.coeffs.g += (dY[:, :, None] * self.cols).sum(0)[:, ::-1].sum(0, keepdims=True)
        return (dY[:, :, None] * self.coeffs.f[:, ::-1]).sum(2)

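A quick sketch (outside the library) of the periodic padding Conv1Dper builds before convolving, for `kernel_size=3` with the default centering (`wrap0=1`, `wrap1=1`); `rolling_batch()` is assumed to come from the package's math helpers and produce one sliding window per original sample:

```python
import numpy as np

X = np.arange(5, dtype=np.float32)[None, :]   # one row: [0 1 2 3 4]
Xper = np.hstack((X[:, -1:], X, X[:, :1]))    # -> [4 0 1 2 3 4 0]
print(Xper)
# sliding windows of width 3 over Xper give one window per original sample,
# which is what gets dotted with the (reversed) kernel.
```
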
class LayerNorm(Layer):
    # paper: https://arxiv.org/abs/1607.06450
    # note: nonparametric when affine == False

    def __init__(self, eps=1e-5, affine=True):
        super().__init__()
        self.eps = _f(eps)
        self.affine = bool(affine)

        if self.affine:
            self.gamma = self._new_weights('gamma', init=init_ones)
            self.beta = self._new_weights('beta', init=init_zeros)
            self.serialized = {
                'gamma': 'gamma',
                'beta': 'beta',
            }

    def make_shape(self, parent):
        shape = parent.output_shape
        self.input_shape = shape
        self.output_shape = shape
        assert len(shape) == 1, shape
        if self.affine:
            self.gamma.shape = (shape[0],)
            self.beta.shape = (shape[0],)

    def forward(self, X):
        self.mean = X.mean(0)
        self.center = X - self.mean
        self.var = self.center.var(0) + self.eps
        self.std = np.sqrt(self.var)

        self.Xnorm = self.center / self.std
        if self.affine:
            return self.gamma.f * self.Xnorm + self.beta.f
        return self.Xnorm

    def backward(self, dY):
        length = dY.shape[0]

        if self.affine:
            dXnorm = dY * self.gamma.f
            self.gamma.g += (dY * self.Xnorm).sum(0)
            self.beta.g += dY.sum(0)
        else:
            dXnorm = dY

        dstd = (dXnorm * self.center).sum(0) / -self.var
        dcenter = dXnorm / self.std + dstd / self.std * self.center / length
        dmean = -dcenter.sum(0)
        dX = dcenter + dmean / length

        return dX

class Denses(Layer):  # TODO: rename?
    # acts as a separate Dense for each row or column. only for 2D arrays.

    serialized = {
        'W': 'coeffs',
        'b': 'biases',
    }

    def __init__(self, dim, init=init_he_uniform, reg_w=None, reg_b=None, axis=-1):
        super().__init__()
        self.dim = int(dim)
        self.weight_init = init
        self.axis = int(axis)
        self.coeffs = self._new_weights('coeffs', init=init, regularizer=reg_w)
        self.biases = self._new_weights('biases', init=init_zeros, regularizer=reg_b)

    def make_shape(self, parent):
        shape = parent.output_shape
        self.input_shape = shape
        assert len(shape) == 2, shape

        assert -len(shape) <= self.axis < len(shape)
        self.axis = self.axis % len(shape)

        self.output_shape = list(shape)
        self.output_shape[self.axis] = self.dim
        self.output_shape = tuple(self.output_shape)

        in_rows = self.input_shape[0]
        in_cols = self.input_shape[1]
        out_rows = self.output_shape[0]
        out_cols = self.output_shape[1]

        self.coeffs.shape = (in_rows, in_cols, self.dim)
        self.biases.shape = (1, out_rows, out_cols)

    def forward(self, X):
        self.X = X
        if self.axis == 0:
            return np.einsum('ixj,xjk->ikj', X, self.coeffs.f) + self.biases.f
        elif self.axis == 1:
            return np.einsum('ijx,jxk->ijk', X, self.coeffs.f) + self.biases.f

    def backward(self, dY):
        self.biases.g += dY.sum(0, keepdims=True)
        if self.axis == 0:
            self.coeffs.g += np.einsum('ixj,ikj->xjk', self.X, dY)
            return np.einsum('ikj,xjk->ixj', dY, self.coeffs.f)
        elif self.axis == 1:
            self.coeffs.g += np.einsum('ijx,ijk->jxk', self.X, dY)
            return np.einsum('ijk,jxk->ijx', dY, self.coeffs.f)

class CosineDense(Dense):
    # paper: https://arxiv.org/abs/1702.05870
    # another implementation: https://github.com/farizrahman4u/keras-contrib/pull/36
    # the paper doesn't mention bias,
    # so we treat bias as an additional weight with a constant input of 1.
    # this is correct in Dense layers, so i hope it's correct here too.

    eps = 1e-4

    def forward(self, X):
        self.X = X
        self.X_norm = np.sqrt(np.square(X).sum(-1, keepdims=True) \
                              + 1 + self.eps)
        self.W_norm = np.sqrt(np.square(self.coeffs.f).sum(0, keepdims=True) \
                              + np.square(self.biases.f) + self.eps)
        self.dot = X @ self.coeffs.f + self.biases.f
        Y = self.dot / (self.X_norm * self.W_norm)
        return Y

    def backward(self, dY):
        ddot = dY / self.X_norm / self.W_norm
        dX_norm = -(dY * self.dot / self.W_norm).sum(-1, keepdims=True) / self.X_norm**2
        dW_norm = -(dY * self.dot / self.X_norm).sum( 0, keepdims=True) / self.W_norm**2

        self.coeffs.g += self.X.T @ ddot \
            + dW_norm / self.W_norm * self.coeffs.f
        self.biases.g += ddot.sum(0, keepdims=True) \
            + dW_norm / self.W_norm * self.biases.f
        dX = ddot @ self.coeffs.f.T + dX_norm / self.X_norm * self.X

        return dX
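A plain-numpy sketch of the cosine-similarity output above: appending a constant 1 to the input and the bias to the weight column makes the layer compute the cosine of the angle between the augmented vectors (up to the eps terms).

```python
import numpy as np

x = np.array([3.0, 4.0])
w = np.array([1.0, 0.0])
b = 2.0
x_aug = np.append(x, 1.0)
w_aug = np.append(w, b)
cos = x_aug @ w_aug / (np.linalg.norm(x_aug) * np.linalg.norm(w_aug))
print(cos)  # ~0.44, bounded to [-1, 1], which is what CosineDense feeds forward
```
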
45
onn/regularizer.py
Normal file
@@ -0,0 +1,45 @@
import numpy as np

from .floats import *

class Regularizer:
    pass

class L1L2(Regularizer):
    def __init__(self, l1=0.0, l2=0.0):
        self.l1 = _f(l1)
        self.l2 = _f(l2)

    def forward(self, X):
        f = _0
        if self.l1:
            f += np.sum(self.l1 * np.abs(X))
        if self.l2:
            f += np.sum(self.l2 * np.square(X))
        return f

    def backward(self, X):
        df = np.zeros_like(X)
        if self.l1:
            df += self.l1 * np.sign(X)
        if self.l2:
            df += self.l2 * 2 * X
        return df

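A quick numeric check (not library code) of the L1L2 penalty and its gradient as defined above, with `l1=0.01` and `l2=0.1`:

```python
import numpy as np

X = np.array([-2.0, 0.5, 3.0])
l1, l2 = 0.01, 0.1
penalty = np.sum(l1 * np.abs(X)) + np.sum(l2 * np.square(X))   # 0.055 + 1.325 = 1.38
grad = l1 * np.sign(X) + l2 * 2 * X                            # [-0.41, 0.11, 0.61]
print(penalty, grad)
```
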
# more

class SaturateRelu(Regularizer):
    # paper: https://arxiv.org/abs/1703.09202
    # TODO: test this (and ActivityRegularizer) more thoroughly.
    #       i've looked at the histogram of the resulting weights.
    #       it seems like only the layers after this are affected
    #       the way they should be.

    def __init__(self, lamb=0.0):
        self.lamb = _f(lamb)

    def forward(self, X):
        return self.lamb * np.where(X >= 0, X, 0)

    def backward(self, X):
        return self.lamb * np.where(X >= 0, 1, 0)
90
onn/ritual.py
Normal file
@@ -0,0 +1,90 @@
import numpy as np

from .floats import *
from .initialization import *
from .parametric import *  # for Dense, used by StochMRitual below.
from .ritual_base import *

def stochastic_multiply(W, gamma=0.5, allow_negation=False):
    # paper: https://arxiv.org/abs/1606.01981

    assert W.ndim == 1, W.ndim
    assert 0 < gamma < 1, gamma
    size = len(W)
    alpha = np.max(np.abs(W))
    # NOTE: numpy gives [low, high) but the paper advocates [low, high]
    mult = np.random.uniform(gamma, 1/gamma, size=size)
    if allow_negation:
        # NOTE: i have yet to see this do anything but cause divergence.
        #       i've referenced the paper several times yet still don't understand
        #       what i'm doing wrong, so i'm disabling it by default in my code.
        #       maybe i just need *a lot* more weights to compensate.
        prob = (W / alpha + 1) / 2
        samples = np.random.random_sample(size=size)
        mult *= np.where(samples < prob, 1, -1)
    np.multiply(W, mult, out=W)

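A standalone demonstration (not library code) of the multiplicative noise stochastic_multiply applies: with `gamma=0.5` every weight is scaled by an independent factor drawn from [0.5, 2.0).

```python
import numpy as np

W = np.ones(5, dtype=np.float32)
mult = np.random.uniform(0.5, 1/0.5, size=W.size)
np.multiply(W, mult, out=W)
print(W)   # each entry now lies in [0.5, 2.0)
```
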
class StochMRitual(Ritual):
    # paper: https://arxiv.org/abs/1606.01981
    # this probably doesn't make sense for regression problems,
    # let alone small models, but here it is anyway!

    def __init__(self, learner=None, gamma=0.5):
        super().__init__(learner)
        self.gamma = _f(gamma)

    def prepare(self, model):
        self.W = np.copy(model.W)
        super().prepare(model)

    def learn(self, inputs, outputs):
        # an experiment:
        #assert self.learner.rate < 10, self.learner.rate
        #self.gamma = 1 - 1/2**(1 - np.log10(self.learner.rate))

        self.W[:] = self.model.W
        for layer in self.model.ordered_nodes:
            if isinstance(layer, Dense):
                # perturb the Dense layer's forward weights in place.
                stochastic_multiply(layer.coeffs.f.ravel(), gamma=self.gamma)
        residual = super().learn(inputs, outputs)
        self.model.W[:] = self.W
        return residual

    def update(self):
        super().update()
        f = 0.5
        for layer in self.model.ordered_nodes:
            if isinstance(layer, Dense):
                np.clip(layer.W, -layer.std * f, layer.std * f, out=layer.W)
                # np.clip(layer.W, -1, 1, out=layer.W)

class NoisyRitual(Ritual):
    def __init__(self, learner=None,
                 input_noise=0, output_noise=0, gradient_noise=0):
        self.input_noise = _f(input_noise)
        self.output_noise = _f(output_noise)
        self.gradient_noise = _f(gradient_noise)
        super().__init__(learner)

    def learn(self, inputs, outputs):
        # this is pretty crude
        if self.input_noise > 0:
            s = self.input_noise
            inputs = inputs + np.random.normal(0, s, size=inputs.shape)
        if self.output_noise > 0:
            s = self.output_noise
            outputs = outputs + np.random.normal(0, s, size=outputs.shape)
        return super().learn(inputs, outputs)

    def update(self):
        # gradient noise paper: https://arxiv.org/abs/1511.06807
        if self.gradient_noise > 0:
            size = len(self.model.dW)
            gamma = 0.55
            #s = self.gradient_noise / (1 + self.bn) ** gamma
            # experiments:
            s = self.gradient_noise * np.sqrt(self.learner.rate)
            #s = np.square(self.learner.rate)
            #s = self.learner.rate / self.en
            self.model.dW += np.random.normal(0, max(s, 1e-8), size=size)
        super().update()
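A side calculation (not library code) of the commented-out batch-annealed schedule above, `s = gradient_noise / (1 + bn) ** gamma`, assuming `gradient_noise = 0.3`; the noise scale decays slowly with the batch counter:

```python
gamma = 0.55
for bn in range(5):
    print(bn, 0.3 / (1 + bn) ** gamma)   # 0.300, 0.205, 0.164, 0.140, 0.124
```
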
132
onn/ritual_base.py
Normal file
@@ -0,0 +1,132 @@
import types
import numpy as np

from .floats import *
from .learner import *         # for the Learner default below.
from .optimizer_base import *  # for the Optimizer default below.

class Ritual:  # i'm just making up names at this point.
    def __init__(self, learner=None):
        self.learner = learner if learner is not None else Learner(Optimizer())
        self.model = None

    def reset(self):
        self.learner.reset(optim=True)
        self.en = 0
        self.bn = 0

    def learn(self, inputs, outputs):
        error, predicted = self.model.forward(inputs, outputs)
        self.model.backward(predicted, outputs)
        self.model.regulate()
        return error, predicted

    def update(self):
        optim = self.learner.optim
        optim.model = self.model
        optim.update(self.model.dW, self.model.W)

    def prepare(self, model):
        self.en = 0
        self.bn = 0
        self.model = model

    def _train_batch(self, batch_inputs, batch_outputs, b, batch_count,
                     test_only=False, loss_logging=False, mloss_logging=True):
        if not test_only and self.learner.per_batch:
            self.learner.batch(b / batch_count)

        if test_only:
            predicted = self.model.evaluate(batch_inputs, deterministic=True)
        else:
            error, predicted = self.learn(batch_inputs, batch_outputs)
            self.model.regulate_forward()
            self.update()

        if loss_logging:
            batch_loss = self.model.loss.forward(predicted, batch_outputs)
            if np.isnan(batch_loss):
                raise Exception("nan")
            self.losses.append(batch_loss)
            self.cumsum_loss += batch_loss

        if mloss_logging:
            # NOTE: this can use the non-deterministic predictions. fixme?
            batch_mloss = self.model.mloss.forward(predicted, batch_outputs)
            if np.isnan(batch_mloss):
                raise Exception("nan")
            self.mlosses.append(batch_mloss)
            self.cumsum_mloss += batch_mloss

    def train_batched(self, inputs_or_generator, outputs_or_batch_count,
                      batch_size=None,
                      return_losses=False, test_only=False, shuffle=True,
                      clear_grad=True):
        assert isinstance(return_losses, bool) or return_losses == 'both'
        assert self.model is not None

        gen = isinstance(inputs_or_generator, types.GeneratorType)
        if gen:
            generator = inputs_or_generator
            batch_count = outputs_or_batch_count
            assert isinstance(batch_count, int), type(batch_count)
        else:
            inputs = inputs_or_generator
            outputs = outputs_or_batch_count

        if not test_only:
            self.en += 1

        if shuffle:
            if gen:
                raise Exception("shuffling is incompatible with using a generator.")
            indices = np.arange(inputs.shape[0])
            np.random.shuffle(indices)
            inputs = inputs[indices]
            outputs = outputs[indices]

        self.cumsum_loss, self.cumsum_mloss = _0, _0
        self.losses, self.mlosses = [], []

        if not gen:
            batch_count = inputs.shape[0] // batch_size
            # TODO: lift this restriction
            assert inputs.shape[0] % batch_size == 0, \
                "inputs is not evenly divisible by batch_size"

        prev_batch_size = None
        for b in range(batch_count):
            if not test_only:
                self.bn += 1

            if gen:
                batch_inputs, batch_outputs = next(generator)
                batch_size = batch_inputs.shape[0]
                # TODO: lift this restriction
                assert batch_size == prev_batch_size or prev_batch_size is None, \
                    "non-constant batch size (got {}, expected {})".format(batch_size, prev_batch_size)
            else:
                bi = b * batch_size
                batch_inputs = inputs[bi:bi+batch_size]
                batch_outputs = outputs[bi:bi+batch_size]

            if clear_grad:
                self.model.clear_grad()
            self._train_batch(batch_inputs, batch_outputs, b, batch_count,
                              test_only, return_losses == 'both', return_losses)

            prev_batch_size = batch_size

        avg_mloss = self.cumsum_mloss / _f(batch_count)
        if return_losses == 'both':
            avg_loss = self.cumsum_loss / _f(batch_count)
            return avg_loss, avg_mloss, self.losses, self.mlosses
        elif return_losses:
            return avg_mloss, self.mlosses
        return avg_mloss

    def test_batched(self, inputs, outputs, *args, **kwargs):
        return self.train_batched(inputs, outputs, *args,
                                  test_only=True, **kwargs)

    def train_batched_gen(self, generator, batch_count, *args, **kwargs):
        return self.train_batched(generator, batch_count, *args,
                                  shuffle=False, **kwargs)
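A sketch of the generator contract `train_batched_gen()` expects: it yields `(inputs, outputs)` pairs with a constant batch size, and the caller passes the number of batches per epoch. The `ritual`, `train_X`, and `train_Y` names below are assumptions standing in for an already-prepared setup.

```python
import numpy as np

def batches(X, Y, batch_size):
    # endlessly yield random mini-batches of a fixed size.
    while True:
        idx = np.random.randint(0, len(X), size=batch_size)
        yield X[idx], Y[idx]

# gen = batches(train_X, train_Y, 500)
# avg_mloss = ritual.train_batched_gen(gen, batch_count=len(train_X) // 500)
```
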
37
onn/util.py
Normal file
@@ -0,0 +1,37 @@
import sys

def lament(*args, **kwargs):
    print(*args, file=sys.stderr, **kwargs)

def lower_priority():
    """Set the priority of the process to below-normal."""
    # via https://stackoverflow.com/a/1023269
    if sys.platform == 'win32':
        try:
            import win32api, win32process, win32con
            pid = win32api.GetCurrentProcessId()
            handle = win32api.OpenProcess(win32con.PROCESS_ALL_ACCESS, True, pid)
            win32process.SetPriorityClass(handle, win32process.BELOW_NORMAL_PRIORITY_CLASS)
        except ImportError:
            lament("you do not have pywin32 installed.")
            lament("the process priority could not be lowered.")
            lament("consider: python -m pip install pypiwin32")
            lament("consider: conda install pywin32")
    else:
        import os
        os.nice(1)

# more

_log_was_update = False
def log(left, right, update=False):
    s = "\x1B[1m {:>20}:\x1B[0m {}".format(left, right)
    global _log_was_update
    if update and _log_was_update:
        lament('\x1B[F' + s)
    else:
        lament(s)
    _log_was_update = update

class Dummy:
    pass
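A quick illustration of `log()`'s `update` flag: consecutive `update=True` calls move the cursor up one line (the `\x1B[F` escape) and overwrite the previous message, which keeps per-batch progress from scrolling the terminal.

```python
from onn.util import log  # as defined above

log("epoch", 1)
log("batch", "1/100", update=True)
log("batch", "2/100", update=True)   # replaces the "1/100" line in place
```
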
58
onn/weights.py
Normal file
@@ -0,0 +1,58 @@
import numpy as np

class Weights:
    # we may or may not contain weights -- or any information, for that matter.

    def __init__(self, **kwargs):
        self.f = None  # forward weights
        self.g = None  # backward weights (gradients)
        self.shape = None
        self.init = None
        self.allocator = None
        self.regularizer = None
        self._allocated = False

        self.configure(**kwargs)

    def configure(self, **kwargs):
        for k, v in kwargs.items():
            getattr(self, k)  # ensures the key already exists
            setattr(self, k, v)

    @property
    def size(self):
        assert self.shape is not None
        return np.prod(self.shape)

    def allocate(self, *args, **kwargs):
        if self._allocated:
            raise Exception("attempted to allocate existing weights")
        self.configure(**kwargs)

        # intentionally not using isinstance
        assert type(self.shape) == tuple, self.shape

        f, g = self.allocator(self.size)
        assert len(f) == self.size, "{} != {}".format(f.shape, self.size)
        assert len(g) == self.size, "{} != {}".format(g.shape, self.size)
        f[:] = self.init(self.size, *args)
        g[:] = self.init(self.size, *args)
        self.f = f.reshape(self.shape)
        self.g = g.reshape(self.shape)

        self._allocated = True

    def forward(self):
        if self.regularizer is None:
            return 0.0
        return self.regularizer.forward(self.f)

    def backward(self):
        if self.regularizer is None:
            return 0.0
        return self.regularizer.backward(self.f)

    def update(self):
        if self.regularizer is None:
            return
        self.g += self.regularizer.backward(self.f)
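A minimal sketch of the allocator contract `allocate()` assumes: given a flat size, return two flat float arrays (values and gradients) that the Weights object fills via `init()` and reshapes. The `toy_allocator` helper and the constant init lambda below are made up for illustration, not part of the package.

```python
import numpy as np
from onn.weights import Weights  # the class above

def toy_allocator(size):
    # hypothetical stand-in for the model's shared flat-buffer allocator.
    return (np.zeros(size, dtype=np.float32),
            np.zeros(size, dtype=np.float32))

w = Weights(init=lambda size: np.full(size, 0.1, dtype=np.float32),
            allocator=toy_allocator,
            shape=(2, 3))
w.allocate()
print(w.f.shape, w.g.shape)   # (2, 3) (2, 3)
```
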
1398
onn_core.py
File diff suppressed because it is too large