#!/usr/bin/env python3

# external packages required for full functionality:
# numpy scipy h5py sklearn dotmap

# BIG TODO: ensure numpy isn't upcasting to float64 *anywhere*.
# this is gonna take some work.

from optim_nn_core import *
from optim_nn_core import _check, _f, _0, _1

import sys

def lament(*args, **kwargs):
    print(*args, file=sys.stderr, **kwargs)

_log_was_update = False
def log(left, right, update=False):
    s = "\x1B[1m {:>20}:\x1B[0m {}".format(left, right)
    global _log_was_update
    if update and _log_was_update:
        lament('\x1B[F' + s)
    else:
        lament(s)
    _log_was_update = update

class Dummy:
    pass

# Initializations {{{1

def init_gaussian_unit(size, ins, outs):
    s = np.sqrt(1 / ins)
    return np.random.normal(0, s, size=size)

# Loss functions {{{1

class SquaredHalved(ResidualLoss):
    def f(self, r):
        return np.square(r) / 2

    def df(self, r):
        return r

class SomethingElse(ResidualLoss):
    # generalizes Absolute and SquaredHalved
    # plot: https://www.desmos.com/calculator/fagjg9vuz7

    def __init__(self, a=4/3):
        assert 1 <= a <= 2, "parameter out of range"
        self.a = _f(a / 2)
        self.b = _f(2 / a)
        self.c = _f(2 / a - 1)

    def f(self, r):
        return self.a * np.abs(r)**self.b

    def df(self, r):
        return np.sign(r) * np.abs(r)**self.c
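
# a quick sanity sketch of the endpoints above (my own addition; never called):
# at a=2 SomethingElse reduces to Absolute, and at a=1 to SquaredHalved.
def _check_something_else_endpoints():
    r = np.linspace(-2, 2, 9, dtype=_f)
    assert np.allclose(SomethingElse(a=2).f(r), np.abs(r))
    assert np.allclose(SomethingElse(a=1).f(r), np.square(r) / 2)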

class Confidence(Loss):
    # this isn't "confidence" in any meaningful (e.g. Bayesian) sense;
    # it's just a metric of how large the predicted class's value is.
    # when using it for loss, it acts like a crappy regularizer.
    # it really just measures how much of a hot-shot the network thinks it is.

    def forward(self, p, y=None):
        categories = p.shape[-1]
        confidence = (np.max(p, axis=-1) - 1/categories) / (1 - 1/categories)
        # the exponent in softmax puts a maximum on confidence,
        # but we don't compensate for that. if necessary,
        # it'd be better to use an activation that doesn't have this limit.
        return np.mean(confidence)

    def backward(self, p, y=None):
        # in order to agree with the forward pass,
        # using this backwards pass as-is will minimize confidence.
        categories = p.shape[-1]
        detc = p / categories / (1 - 1/categories)
        dmax = p == np.max(p, axis=-1, keepdims=True)
        return detc * dmax
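
# a worked example of the scaling in forward() (my own note): with c categories,
# a uniform prediction has max(p) == 1/c, so (1/c - 1/c)/(1 - 1/c) == 0,
# while a one-hot prediction has max(p) == 1, so (1 - 1/c)/(1 - 1/c) == 1;
# confidence is therefore a remapping of max(p) from [1/c, 1] onto [0, 1].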

class NLL(Loss): # Negative Log Likelihood
    def forward(self, p, y):
        correct = p * y
        return np.mean(-correct)

    def backward(self, p, y):
        return -y / len(p)

# Regularizers {{{1

class SaturateRelu(Regularizer):
    # paper: https://arxiv.org/abs/1703.09202
    # TODO: test this (and ActivityRegularizer) more thoroughly.
    # i've looked at the histogram of the resulting weights.
    # it seems like only the layers after this are affected
    # the way they should be.

    def __init__(self, lamb=0.0):
        self.lamb = _f(lamb)

    def forward(self, X):
        return self.lamb * np.where(X >= 0, X, 0)

    def backward(self, X):
        return self.lamb * np.where(X >= 0, 1, 0)

# Optimizers {{{1

class FTML(Optimizer):
    # paper: http://www.cse.ust.hk/~szhengac/papers/icml17.pdf
    # author's implementation: https://github.com/szhengac/optim/commit/923555e

    def __init__(self, alpha=0.0025, b1=0.6, b2=0.999, eps=1e-8):
        self.iterations = _0
        self.b1 = _f(b1) # decay term
        self.b2 = _f(b2) # decay term
        self.eps = _f(eps)

        super().__init__(alpha)

    def reset(self):
        self.dt1 = None
        self.dt = None
        self.vt = None
        self.zt = None
        self.b1_t = _1
        self.b2_t = _1

    def compute(self, dW, W):
        if self.dt1 is None: self.dt1 = np.zeros_like(dW)
        if self.dt is None: self.dt = np.zeros_like(dW)
        if self.vt is None: self.vt = np.zeros_like(dW)
        if self.zt is None: self.zt = np.zeros_like(dW)

        # NOTE: we could probably rewrite these equations to avoid this copy.
        self.dt1[:] = self.dt[:]

        self.b1_t *= self.b1
        self.b2_t *= self.b2

        # hardly an elegant solution.
        alpha = max(self.alpha, self.eps)

        # same as Adam's vt.
        self.vt[:] = self.b2 * self.vt + (1 - self.b2) * dW * dW

        # you can factor "inner" out of Adam as well.
        inner = np.sqrt(self.vt / (1 - self.b2_t)) + self.eps
        self.dt[:] = (1 - self.b1_t) / alpha * inner

        sigma_t = self.dt - self.b1 * self.dt1

        # Adam's mt minus the sigma term.
        self.zt[:] = self.b1 * self.zt + (1 - self.b1) * dW - sigma_t * W

        # subtract by weights to avoid having to override self.update.
        return -self.zt / self.dt - W
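
# in symbols, compute() above follows the per-step FTML update as i read the
# paper (b1, b2 are the betas, g_t is dW, theta is W, eta is alpha):
#   v_t     = b2 * v_{t-1} + (1 - b2) * g_t**2
#   d_t     = (1 - b1**t) / eta * (sqrt(v_t / (1 - b2**t)) + eps)
#   sigma_t = d_t - b1 * d_{t-1}
#   z_t     = b1 * z_{t-1} + (1 - b1) * g_t - sigma_t * theta_{t-1}
#   theta_t = -z_t / d_t
# the value returned is theta_t - theta_{t-1}, i.e. the delta that the generic
# update step adds onto W.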

# Nonparametric Layers {{{1

class AlphaDropout(Layer):
    # to be used alongside Selu activations.
    # paper: https://arxiv.org/abs/1706.02515

    def __init__(self, dropout=0.0, alpha=1.67326324, lamb=1.05070099):
        super().__init__()
        self.alpha = _f(alpha)
        self.lamb = _f(lamb)
        self.saturated = -self.lamb * self.alpha
        self.dropout = _f(dropout)

    @property
    def dropout(self):
        return self._dropout

    @dropout.setter
    def dropout(self, x):
        self._dropout = _f(x)
        self.q = 1 - self._dropout
        assert 0 <= self.q <= 1

        sat = self.saturated

        self.a = 1 / np.sqrt(self.q + sat * sat * self.q * self._dropout)
        self.b = -self.a * (self._dropout * sat)

    def forward(self, X):
        self.mask = np.random.rand(*X.shape) < self.q
        return self.a * np.where(self.mask, X, self.saturated) + self.b

    def forward_deterministic(self, X):
        return X

    def backward(self, dY):
        return dY * self.a * self.mask
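
# the affine correction in the setter above comes from the alpha-dropout paper:
# dropped units are pinned to the saturation value sat = -lamb * alpha, and
# (assuming the input is already roughly zero-mean, unit-variance, which is
# what Selu aims for) the coefficients
#   a = (q + sat**2 * q * (1 - q)) ** -0.5,   b = -a * (1 - q) * sat
# restore zero mean and unit variance after dropout, with q the keep rate.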

# Activations {{{2

class Selu(Layer):
    # paper: https://arxiv.org/abs/1706.02515

    def __init__(self, alpha=1.67326324, lamb=1.05070099):
        super().__init__()
        self.alpha = _f(alpha)
        self.lamb = _f(lamb)

    def forward(self, X):
        self.cond = X >= 0
        self.neg = self.alpha * np.exp(X)
        return self.lamb * np.where(self.cond, X, self.neg - self.alpha)

    def backward(self, dY):
        return dY * self.lamb * np.where(self.cond, 1, self.neg)

class TanhTest(Layer):
    def forward(self, X):
        self.sig = np.tanh(1 / 2 * X)
        return 2.4004 * self.sig

    def backward(self, dY):
        return dY * (1 / 2 * 2.4004) * (1 - self.sig * self.sig)

# Parametric Layers {{{1

class LayerNorm(Layer):
    # paper: https://arxiv.org/abs/1607.06450
    # note: nonparametric when affine == False

    def __init__(self, eps=1e-5, affine=True):
        super().__init__()
        self.eps = _f(eps)
        self.affine = bool(affine)

        if self.affine:
            self.gamma = self._new_weights('gamma', init=init_ones)
            self.beta = self._new_weights('beta', init=init_zeros)
            self.serialized = {
                'gamma': 'gamma',
                'beta': 'beta',
            }

    def make_shape(self, parent):
        shape = parent.output_shape
        self.input_shape = shape
        self.output_shape = shape
        assert len(shape) == 1, shape
        if self.affine:
            self.gamma.shape = (shape[0],)
            self.beta.shape = (shape[0],)

    def forward(self, X):
        self.mean = X.mean(0)
        self.center = X - self.mean
        self.var = self.center.var(0) + self.eps
        self.std = np.sqrt(self.var)

        self.Xnorm = self.center / self.std
        if self.affine:
            return self.gamma.f * self.Xnorm + self.beta.f
        return self.Xnorm

    def backward(self, dY):
        length = dY.shape[0]

        if self.affine:
            dXnorm = dY * self.gamma.f
            self.gamma.g[:] = (dY * self.Xnorm).sum(0)
            self.beta.g[:] = dY.sum(0)
        else:
            dXnorm = dY

        dstd = (dXnorm * self.center).sum(0) / -self.var
        dcenter = dXnorm / self.std + dstd / self.std * self.center / length
        dmean = -dcenter.sum(0)
        dX = dcenter + dmean / length

        return dX
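
# as implemented above, the affine forward pass computes (statistics taken
# over axis 0, exactly as the code does):
#   Xnorm = (X - X.mean(0)) / sqrt(X.var(0) + eps)
#   Y     = gamma * Xnorm + beta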

class Denses(Layer): # TODO: rename?
    # acts as a separate Dense for each row or column. only for 2D arrays.

    serialized = {
        'W': 'coeffs',
        'b': 'biases',
    }

    def __init__(self, dim, init=init_he_uniform, reg_w=None, reg_b=None, axis=-1):
        super().__init__()
        self.dim = int(dim)
        self.weight_init = init
        self.axis = int(axis)
        self.coeffs = self._new_weights('coeffs', init=init, regularizer=reg_w)
        self.biases = self._new_weights('biases', init=init_zeros, regularizer=reg_b)

    def make_shape(self, parent):
        shape = parent.output_shape
        self.input_shape = shape
        assert len(shape) == 2, shape

        assert -len(shape) <= self.axis < len(shape)
        self.axis = self.axis % len(shape)

        self.output_shape = list(shape)
        self.output_shape[self.axis] = self.dim
        self.output_shape = tuple(self.output_shape)

        in_rows = self.input_shape[0]
        in_cols = self.input_shape[1]
        out_rows = self.output_shape[0]
        out_cols = self.output_shape[1]

        self.coeffs.shape = (in_rows, in_cols, self.dim)
        self.biases.shape = (1, out_rows, out_cols)

    def forward(self, X):
        self.X = X
        if self.axis == 0:
            return np.einsum('ixj,xjk->ikj', X, self.coeffs.f) + self.biases.f
        elif self.axis == 1:
            return np.einsum('ijx,jxk->ijk', X, self.coeffs.f) + self.biases.f

    def backward(self, dY):
        self.biases.g[:] = dY.sum(0, keepdims=True)
        if self.axis == 0:
            self.coeffs.g[:] = np.einsum('ixj,ikj->xjk', self.X, dY)
            return np.einsum('ikj,xjk->ixj', dY, self.coeffs.f)
        elif self.axis == 1:
            self.coeffs.g[:] = np.einsum('ijx,ijk->jxk', self.X, dY)
            return np.einsum('ijk,jxk->ijx', dY, self.coeffs.f)
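
# tracing the einsum shapes above for axis == 1 (my own note): a batch X of
# shape (batch, rows, in_cols) is contracted with coeffs of shape
# (rows, in_cols, dim) to give (batch, rows, dim), so each row gets its own
# Dense over the last axis; axis == 0 is the same idea applied to the first
# non-batch axis instead.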

class DenseOneLess(Dense):
    def init(self, allocator):
        super().init(allocator)
        ins, outs = self.input_shape[0], self.output_shape[0]
        assert ins == outs, (ins, outs)

    def forward(self, X):
        np.fill_diagonal(self.coeffs.f, 0)
        self.X = X
        return X.dot(self.coeffs.f) + self.biases.f

    def backward(self, dY):
        self.coeffs.g[:] = self.X.T.dot(dY)
        self.biases.g[:] = dY.sum(0, keepdims=True)
        np.fill_diagonal(self.coeffs.g, 0)
        return dY.dot(self.coeffs.f.T)

class CosineDense(Dense):
    # paper: https://arxiv.org/abs/1702.05870
    # another implementation: https://github.com/farizrahman4u/keras-contrib/pull/36
    # the paper doesn't mention bias,
    # so we treat bias as an additional weight with a constant input of 1.
    # this is correct in Dense layers, so i hope it's correct here too.

    eps = 1e-4

    def forward(self, X):
        self.X = X
        self.X_norm = np.sqrt(np.square(X).sum(-1, keepdims=True) \
                              + 1 + self.eps)
        self.W_norm = np.sqrt(np.square(self.coeffs.f).sum(0, keepdims=True) \
                              + np.square(self.biases.f) + self.eps)
        self.dot = X.dot(self.coeffs.f) + self.biases.f
        Y = self.dot / (self.X_norm * self.W_norm)
        return Y

    def backward(self, dY):
        ddot = dY / self.X_norm / self.W_norm
        dX_norm = -(dY * self.dot / self.W_norm).sum(-1, keepdims=True) / self.X_norm**2
        dW_norm = -(dY * self.dot / self.X_norm).sum( 0, keepdims=True) / self.W_norm**2

        self.coeffs.g[:] = self.X.T.dot(ddot) \
                           + dW_norm / self.W_norm * self.coeffs.f
        self.biases.g[:] = ddot.sum(0, keepdims=True) \
                           + dW_norm / self.W_norm * self.biases.f
        dX = ddot.dot(self.coeffs.f.T) + dX_norm / self.X_norm * self.X

        return dX

# Rituals {{{1

def stochastic_multiply(W, gamma=0.5, allow_negation=False):
    # paper: https://arxiv.org/abs/1606.01981

    assert W.ndim == 1, W.ndim
    assert 0 < gamma < 1, gamma
    size = len(W)
    alpha = np.max(np.abs(W))
    # NOTE: numpy gives [low, high) but the paper advocates [low, high]
    mult = np.random.uniform(gamma, 1/gamma, size=size)
    if allow_negation:
        # NOTE: i have yet to see this do anything but cause divergence.
        # i've referenced the paper several times yet still don't understand
        # what i'm doing wrong, so i'm disabling it by default in my code.
        # maybe i just need *a lot* more weights to compensate.
        prob = (W / alpha + 1) / 2
        samples = np.random.random_sample(size=size)
        mult *= np.where(samples < prob, 1, -1)
    np.multiply(W, mult, out=W)
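
# for example, with the default gamma=0.5 each weight is scaled by a factor
# drawn uniformly from [0.5, 2.0); with allow_negation, weights near +alpha
# keep their sign with probability near 1 while weights near -alpha tend to
# have their sign flipped.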

class StochMRitual(Ritual):
    # paper: https://arxiv.org/abs/1606.01981
    # this probably doesn't make sense for regression problems,
    # let alone small models, but here it is anyway!

    def __init__(self, learner=None, loss=None, mloss=None, gamma=0.5):
        super().__init__(learner, loss, mloss)
        self.gamma = _f(gamma)

    def prepare(self, model):
        self.W = np.copy(model.W)
        super().prepare(model)

    def learn(self, inputs, outputs):
        # an experiment:
        #assert self.learner.rate < 10, self.learner.rate
        #self.gamma = 1 - 1/2**(1 - np.log10(self.learner.rate))

        self.W[:] = self.model.W
        for layer in self.model.ordered_nodes:
            if isinstance(layer, Dense):
                stochastic_multiply(layer.coeffs.ravel(), gamma=self.gamma)
        residual = super().learn(inputs, outputs)
        self.model.W[:] = self.W
        return residual

    def update(self):
        super().update()
        f = 0.5
        for layer in self.model.ordered_nodes:
            if isinstance(layer, Dense):
                np.clip(layer.W, -layer.std * f, layer.std * f, out=layer.W)
                # np.clip(layer.W, -1, 1, out=layer.W)

class NoisyRitual(Ritual):
    def __init__(self, learner=None, loss=None, mloss=None,
                 input_noise=0, output_noise=0, gradient_noise=0):
        self.input_noise = _f(input_noise)
        self.output_noise = _f(output_noise)
        self.gradient_noise = _f(gradient_noise)
        super().__init__(learner, loss, mloss)

    def learn(self, inputs, outputs):
        # this is pretty crude
        if self.input_noise > 0:
            s = self.input_noise
            inputs = inputs + np.random.normal(0, s, size=inputs.shape)
        if self.output_noise > 0:
            s = self.output_noise
            outputs = outputs + np.random.normal(0, s, size=outputs.shape)
        return super().learn(inputs, outputs)

    def update(self):
        # gradient noise paper: https://arxiv.org/abs/1511.06807
        if self.gradient_noise > 0:
            size = len(self.model.dW)
            gamma = 0.55
            #s = self.gradient_noise / (1 + self.bn) ** gamma
            # experiments:
            s = self.gradient_noise * np.sqrt(self.learner.rate)
            #s = np.square(self.learner.rate)
            #s = self.learner.rate / self.en
            self.model.dW += np.random.normal(0, max(s, 1e-8), size=size)
        super().update()

# Learners {{{1

class DumbLearner(AnnealingLearner):
    # this is my own awful contraption. it's not really "SGD with restarts".

    def __init__(self, optim, epochs=100, rate=None, halve_every=10,
                 restarts=0, restart_advance=20, callback=None):
        self.restart_epochs = int(epochs)
        self.restarts = int(restarts)
        self.restart_advance = float(restart_advance)
        self.restart_callback = callback
        epochs = self.restart_epochs * (self.restarts + 1)
        super().__init__(optim, epochs, rate, halve_every)

    def rate_at(self, epoch):
        sub_epoch = epoch % self.restart_epochs
        restart = epoch // self.restart_epochs
        return super().rate_at(sub_epoch) * (self.anneal**self.restart_advance)**restart

    def next(self):
        if not super().next():
            return False
        sub_epoch = self.epoch % self.restart_epochs
        restart = self.epoch // self.restart_epochs
        if restart > 0 and sub_epoch == 0:
            if self.restart_callback is not None:
                self.restart_callback(restart)
        return True

# Components {{{1

def _mr_make_norm(norm):
    def _mr_norm(y, width, depth, block, multi, activation, style, FC, d):
        skip = y
        merger = Sum()
        skip.feed(merger)
        z_start = skip
        z_start = z_start.feed(norm())
        z_start = z_start.feed(activation())
        for _ in range(multi):
            z = z_start
            for j in range(block):
                if j > 0:
                    z = z.feed(norm())
                    z = z.feed(activation())
                z = z.feed(FC())
            z.feed(merger)
        y = merger
        return y
    return _mr_norm

def _mr_batchless(y, width, depth, block, multi, activation, style, FC, d):
    skip = y
    merger = Sum()
    skip.feed(merger)
    z_start = skip.feed(activation())
    for _ in range(multi):
        z = z_start
        for j in range(block):
            if j > 0:
                z = z.feed(activation())
            z = z.feed(FC())
        z.feed(merger)
    y = merger
    return y

def _mr_onelesssum(y, width, depth, block, multi, activation, style, FC, d):
    # this is my own awful contraption.
    is_last = d + 1 == depth
    needs_sum = not is_last or multi > 1
    skip = y
    if needs_sum:
        merger = Sum()
    if not is_last:
        skip.feed(merger)
    z_start = skip.feed(activation())
    for _ in range(multi):
        z = z_start
        for j in range(block):
            if j > 0:
                z = z.feed(activation())
            z = z.feed(FC())
        if needs_sum:
            z.feed(merger)
    if needs_sum:
        y = merger
    else:
        y = z
    return y

_mr_styles = dict(
    lnorm=_mr_make_norm(LayerNorm),
    batchless=_mr_batchless,
    onelesssum=_mr_onelesssum,
)

def multiresnet(x, width, depth, block=2, multi=1,
                activation=Relu, style='batchless',
                init=init_he_normal):
    if style == 'cossim':
        style = 'batchless'
        DenseClass = CosineDense
    else:
        DenseClass = Dense
    if style not in _mr_styles:
        raise Exception('unknown resnet style', style)

    y = x
    last_size = x.output_shape[0]

    for d in range(depth):
        size = width
        FC = lambda: DenseClass(size, init)

        if last_size != size:
            y = y.feed(FC())

        y = _mr_styles[style](y, width, depth, block, multi, activation, style, FC, d)

        last_size = size

    return y
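
# a minimal usage sketch (mirroring what run() below does; a comment only, not
# executed here):
#   x = Input(shape=(4,))
#   y = multiresnet(x, width=28, depth=2, block=3, multi=2,
#                   activation=GeluApprox, style='onelesssum')
#   y = y.feed(Dense(1, init_he_normal))
#   model = Model(x, y)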

# Toy Data {{{1

inits = dict(he_normal=init_he_normal, he_uniform=init_he_uniform,
             glorot_normal=init_glorot_normal, glorot_uniform=init_glorot_uniform,
             gaussian_unit=init_gaussian_unit)
activations = dict(sigmoid=Sigmoid, tanh=Tanh, lecun=LeCunTanh,
                   relu=Relu, elu=Elu, gelu=GeluApprox, selu=Selu,
                   softplus=Softplus)

def prettyize(data):
    if isinstance(data, np.ndarray):
        s = ', '.join(('{:8.2e}'.format(n) for n in data))
        s = '[' + s + ']'
    else:
        s = '{:8.2e}'.format(data)
    return s

def normalize_data(data, mean=None, std=None):
    # in-place
    if mean is None or std is None:
        mean = np.mean(data, axis=0)
        std = np.std(data, axis=0)
        mean_str = prettyize(mean)
        std_str = prettyize(std)
        lament('nod(...,\n {},\n {})'.format(mean_str, std_str))
        sys.exit(1)
    data -= _f(mean)
    data /= _f(std)
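
# intended workflow (as i read the branch above): call normalize_data(data)
# once without mean/std to print hard-codeable statistics and exit, then paste
# those constants into the nod(...) calls the way toy_data() does below.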

def toy_data(train_samples, valid_samples, problem=2):
    total_samples = train_samples + valid_samples

    nod = normalize_data # shorthand to keep a sane indentation

    if problem == 0:
        from ml.cie_mlp_data import inputs, outputs, valid_inputs, valid_outputs
        inputs, outputs = _f(inputs), _f(outputs)
        valid_inputs, valid_outputs = _f(valid_inputs), _f(valid_outputs)

        nod(inputs, 127.5, 73.9)
        nod(outputs, 44.8, 21.7)
        nod(valid_inputs, 127.5, 73.9)
        nod(valid_outputs, 44.8, 21.7)

    elif problem == 1:
        from sklearn.datasets import make_friedman1
        inputs, outputs = make_friedman1(total_samples)
        inputs, outputs = _f(inputs), _f(outputs)
        outputs = np.expand_dims(outputs, -1)

        nod(inputs, 0.5, 1/np.sqrt(12))
        nod(outputs, 14.4, 4.9)

    elif problem == 2:
        from sklearn.datasets import make_friedman2
        inputs, outputs = make_friedman2(total_samples)
        inputs, outputs = _f(inputs), _f(outputs)
        outputs = np.expand_dims(outputs, -1)

        nod(inputs,
            [5.00e+01, 9.45e+02, 5.01e-01, 5.98e+00],
            [2.89e+01, 4.72e+02, 2.89e-01, 2.87e+00])

        nod(outputs, [482], [380])

    elif problem == 3:
        from sklearn.datasets import make_friedman3
        inputs, outputs = make_friedman3(total_samples)
        inputs, outputs = _f(inputs), _f(outputs)
        outputs = np.expand_dims(outputs, -1)

        nod(inputs,
            [4.98e+01, 9.45e+02, 4.99e-01, 6.02e+00],
            [2.88e+01, 4.73e+02, 2.90e-01, 2.87e+00])

        nod(outputs, [1.32327931], [0.31776295])

    else:
        raise Exception("unknown toy data set", problem)

    if problem != 0:
        # split off a validation set
        indices = np.arange(inputs.shape[0])
        np.random.shuffle(indices)
        valid_inputs = inputs[indices][-valid_samples:]
        valid_outputs = outputs[indices][-valid_samples:]
        inputs = inputs[indices][:-valid_samples]
        outputs = outputs[indices][:-valid_samples]

    return (inputs, outputs), (valid_inputs, valid_outputs)

# Model Creation {{{1

def optim_from_config(config):
    if config.optim == 'adam':
        d1 = config.optim_decay1 if 'optim_decay1' in config else 9.5
        d2 = config.optim_decay2 if 'optim_decay2' in config else 999.5
        b1 = np.exp(-1/d1)
        b2 = np.exp(-1/d2)
        o = Nadam if config.nesterov else Adam
        optim = o(b1=b1, b2=b2)
    elif config.optim == 'ftml':
        d1 = config.optim_decay1 if 'optim_decay1' in config else 2
        d2 = config.optim_decay2 if 'optim_decay2' in config else 999.5
        b1 = np.exp(-1/d1)
        b2 = np.exp(-1/d2)
        optim = FTML(b1=b1, b2=b2)
    elif config.optim in ('rms', 'rmsprop'):
        d2 = config.optim_decay2 if 'optim_decay2' in config else 99.5
        mu = np.exp(-1/d2)
        optim = RMSprop(mu=mu)
    elif config.optim == 'sgd':
        d1 = config.optim_decay1 if 'optim_decay1' in config else 0
        if d1 > 0:
            b1 = np.exp(-1/d1)
            optim = Momentum(mu=b1, nesterov=config.nesterov)
        else:
            optim = Optimizer()
    else:
        raise Exception('unknown optimizer', config.optim)

    return optim
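
# note on the exp(-1/d) conversions above: the config gives each momentum decay
# as a time constant d (in epochs, per the config comments below), and
# b = exp(-1/d) turns that into a geometric decay factor; e.g. d2 = 100 gives
# b2 = exp(-0.01) ~= 0.990, and d1 = 24 gives b1 = exp(-1/24) ~= 0.959.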

def learner_from_config(config, optim, rscb):
    if config.learner == 'sgdr':
        expando = config.expando if 'expando' in config else None
        learner = SGDR(optim, epochs=config.epochs, rate=config.learn,
                       restart_decay=config.restart_decay, restarts=config.restarts,
                       callback=rscb, expando=expando)
        # final learning rate isn't of interest here; it's gonna be close to 0.
        log('total epochs', learner.epochs)
    elif config.learner == 'anneal':
        learner = AnnealingLearner(optim, epochs=config.epochs, rate=config.learn,
                                   halve_every=config.learn_halve_every)
        log("final learning rate", "{:10.8f}".format(learner.final_rate))
    elif config.learner == 'dumb':
        learner = DumbLearner(optim, epochs=config.epochs, rate=config.learn,
                              halve_every=config.learn_halve_every,
                              restarts=config.restarts,
                              restart_advance=config.learn_restart_advance,
                              callback=rscb)
        log("final learning rate", "{:10.8f}".format(learner.final_rate))
    elif config.learner == 'sgd':
        learner = Learner(optim, epochs=config.epochs, rate=config.learn)
    else:
        raise Exception('unknown learner', config.learner)

    return learner

def lookup_loss(maybe_name):
    if isinstance(maybe_name, Loss):
        return maybe_name
    elif maybe_name == 'mse':
        return Squared()
    elif maybe_name == 'mshe': # mushy
        return SquaredHalved()
    elif maybe_name == 'mae':
        return Absolute()
    elif maybe_name == 'msee':
        return SomethingElse()
    raise Exception('unknown objective', maybe_name)

def ritual_from_config(config, learner, loss, mloss):
    if config.ritual == 'default':
        ritual = Ritual(learner=learner, loss=loss, mloss=mloss)
    elif config.ritual == 'stochm':
        ritual = StochMRitual(learner=learner, loss=loss, mloss=mloss)
    elif config.ritual == 'noisy':
        ritual = NoisyRitual(learner=learner, loss=loss, mloss=mloss,
                             input_noise=1e-1, output_noise=1e-2,
                             gradient_noise=2e-7)
    else:
        raise Exception('unknown ritual', config.ritual)

    return ritual

def model_from_config(config, input_features, output_features, callbacks):
    init = inits[config.init]
    activation = activations[config.activation]

    x = Input(shape=(input_features,))
    y = x
    y = multiresnet(y,
                    config.res_width, config.res_depth,
                    config.res_block, config.res_multi,
                    activation=activation, init=init,
                    style=config.parallel_style)
    if y.output_shape[0] != output_features:
        y = y.feed(Dense(output_features, init))

    model = Model(x, y, unsafe=config.unsafe)

    if config.fn_load is not None:
        log('loading weights', config.fn_load)
        model.load_weights(config.fn_load)

    optim = optim_from_config(config)

    def rscb(restart):
        callbacks.restart()
        log("restarting", restart)
        if config.restart_optim:
            optim.reset()

    learner = learner_from_config(config, optim, rscb)

    loss = lookup_loss(config.loss)
    mloss = lookup_loss(config.mloss) if config.mloss else loss

    ritual = ritual_from_config(config, learner, loss, mloss)

    return model, learner, ritual

# main program {{{1

def run(program, args=None):
    args = args if args else []

    np.random.seed(42069)

    # Config {{{2

    from dotmap import DotMap
    config = DotMap(
        fn_load = None,
        fn_save = 'optim_nn.h5',
        log_fn = 'losses.npz',

        # multi-residual network parameters
        res_width = 28,
        res_depth = 2,
        res_block = 3, # normally 2 for plain resnet
        res_multi = 2, # normally 1 for plain resnet

        # style of resnet (order of layers, which layers, etc.)
        parallel_style = 'onelesssum',
        activation = 'gelu',

        #optim = 'ftml',
        #optim_decay1 = 2,
        #optim_decay2 = 100,
        #nesterov = False,
        optim = 'adam', # note: most features only implemented for Adam
        optim_decay1 = 24, # first momentum given in epochs (optional)
        optim_decay2 = 100, # second momentum given in epochs (optional)
        nesterov = True,
        batch_size = 64,

        # learning parameters
        learner = 'sgdr',
        learn = 0.00125,
        epochs = 24,
        learn_halve_every = 16, # only used with anneal/dumb
        restarts = 4,
        restart_decay = 0.25, # only used with SGDR
        expando = lambda i: 24 * i,

        # misc
        init = 'he_normal',
        loss = 'mse',
        mloss = 'mse',
        ritual = 'default',
        restart_optim = False, # restarts also reset internal state of optimizer
        warmup = False, # train a couple epochs on gaussian noise and reset

        # logging/output
        log10_loss = True, # personally, i'm sick of looking at linear loss values!
        #fancy_logs = True, # unimplemented (can't turn it off yet)

        problem = 2,
        compare = (
            # best results for ~10,000 parameters
            # training/validation pairs for each problem (starting from problem 0):
            (10**-3.120, 10**-2.901),
            # 1080 epochs on these...
            (10**-6.747, 10**-6.555),
            (10**-7.774, 10**-7.626),
            (10**-6.278, 10**-5.234), # overfitting? bad valid set?
        ),

        unsafe = True, # aka gotta go fast mode
    )

    for k in ['parallel_style', 'activation', 'optim', 'learner',
              'init', 'loss', 'mloss', 'ritual']:
        config[k] = config[k].lower()

    config.learn *= np.sqrt(config.batch_size)
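    # with the defaults above this scales learn = 0.00125 by sqrt(64) = 8,
    # giving an effective base learning rate of 0.01.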

    config.pprint()

    # Toy Data {{{2

    (inputs, outputs), (valid_inputs, valid_outputs) = \
        toy_data(2**14, 2**11, problem=config.problem)
    input_features = inputs.shape[-1]
    output_features = outputs.shape[-1]

    # Our Test Model

    callbacks = Dummy()

    model, learner, ritual = \
        model_from_config(config, input_features, output_features, callbacks)

    # Model Information {{{2

    for node in model.ordered_nodes:
        children = [str(n) for n in node.children]
        if children:
            sep = '->'
            print(str(node) + sep + ('\n' + str(node) + sep).join(children))
    log('parameters', model.param_count)

    # Training {{{2

    batch_losses = []
    train_losses = []
    valid_losses = []

    def measure_error():
        def print_error(name, inputs, outputs, comparison=None):
            predicted = model.forward(inputs)
            err = ritual.measure(predicted, outputs)
            if config.log10_loss:
                print(name, "{:12.6e}".format(err))
                if comparison:
                    err10 = np.log10(err)
                    cmp10 = np.log10(comparison)
                    color = '\x1B[31m' if err10 > cmp10 else '\x1B[32m'
                    log(name + " log10-loss", "{:+6.3f} {}({:+6.3f})\x1B[0m".format(err10, color, err10 - cmp10))
                else:
                    log(name + " log10-loss", "{:+6.3f}".format(np.log10(err)))
            else:
                log(name + " loss", "{:12.6e}".format(err))
                if comparison:
                    fmt = "10**({:+7.4f}) times"
                    log("improvement", fmt.format(np.log10(comparison / err)))
            return err

        train_err = print_error("train",
                                inputs, outputs,
                                config.compare[config.problem][0])
        valid_err = print_error("valid",
                                valid_inputs, valid_outputs,
                                config.compare[config.problem][1])
        train_losses.append(train_err)
        valid_losses.append(valid_err)

    callbacks.restart = measure_error

    training = config.epochs > 0 and config.restarts >= 0

    ritual.prepare(model)

    if training and config.warmup and not config.fn_load:
        log("warming", "up")

        # use plain SGD in warmup to prevent (or possibly cause?) numeric issues
        temp_optim = learner.optim
        temp_loss = ritual.loss
        learner.optim = Optimizer(alpha=0.001)
        ritual.loss = Absolute() # less likely to blow up; more general

        # NOTE: experiment: trying const batches and batch_size
        bs = 256
        target = 1 * 1024 * 1024
        # 4 being sizeof(float)
        batches = (target / 4 / np.prod(inputs.shape[1:])) // bs * bs
        ins = [int(batches)] + list( inputs.shape[1:])
        outs = [int(batches)] + list(outputs.shape[1:])
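        # worked through with the defaults (problem 2 has 4 input features):
        # batches = (1048576 / 4 / 4) // 256 * 256 = 65536 warmup samples,
        # i.e. about a megabyte of float32 inputs per pass.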

        for _ in range(4):
            ritual.train_batched(
                np.random.normal(size=ins),
                np.random.normal(size=outs),
                batch_size=bs)
        ritual.reset()

        learner.optim = temp_optim
        ritual.loss = temp_loss

    if training:
        measure_error()

    while training and learner.next():
        avg_loss, losses = ritual.train_batched(
            inputs, outputs,
            config.batch_size,
            return_losses=True)
        batch_losses += losses

        if config.log10_loss:
            fmt = "epoch {:4.0f}, rate {:10.8f}, log10-loss {:+6.3f}"
            log("info", fmt.format(learner.epoch, learner.rate, np.log10(avg_loss)),
                update=True)
        else:
            fmt = "epoch {:4.0f}, rate {:10.8f}, loss {:12.6e}"
            log("info", fmt.format(learner.epoch, learner.rate, avg_loss),
                update=True)

    measure_error()

    if training and config.fn_save is not None:
        log('saving weights', config.fn_save)
        model.save_weights(config.fn_save, overwrite=True)

    if training and config.log_fn is not None:
        log('saving losses', config.log_fn)
        np.savez_compressed(config.log_fn,
                            batch_losses=np.array(batch_losses, dtype=_f),
                            train_losses=np.array(train_losses, dtype=_f),
                            valid_losses=np.array(valid_losses, dtype=_f))

    # Evaluation {{{2
    # TODO: write this portion again

    return 0

# run main program {{{1

if __name__ == '__main__':
    sys.exit(run(sys.argv[0], sys.argv[1:]))