Connor Olding
c02fba01e2
use updated filenames. don't use emnist by default. tweak expando integer handling. add some comments.
#!/usr/bin/env python3
|
|
|
|
# external packages required for full functionality:
|
|
# numpy scipy h5py sklearn dotmap
|
|
|
|
# BIG TODO: ensure numpy isn't upcasting to float64 *anywhere*.
|
|
# this is gonna take some work.
|
|
|
|
from onn_core import *
|
|
from onn_core import _check, _f, _0, _1
|
|
|
|
import sys
|
|
|
|
def lament(*args, **kwargs):
|
|
print(*args, file=sys.stderr, **kwargs)
|
|
|
|
_log_was_update = False
|
|
def log(left, right, update=False):
|
|
s = "\x1B[1m {:>20}:\x1B[0m {}".format(left, right)
|
|
global _log_was_update
|
|
if update and _log_was_update:
|
|
lament('\x1B[F' + s)
|
|
else:
|
|
lament(s)
|
|
_log_was_update = update
|
|
|
|
class Dummy:
|
|
pass
|
|
|
|
# Initializations {{{1
|
|
|
|
def init_gaussian_unit(size, ins, outs):
|
|
s = np.sqrt(1 / ins)
|
|
return np.random.normal(0, s, size=size)
|
|
|
|
# Loss functions {{{1
|
|
|
|
class SquaredHalved(ResidualLoss):
|
|
def f(self, r):
|
|
return np.square(r) / 2
|
|
|
|
def df(self, r):
|
|
return r
|
|
|
|
class SomethingElse(ResidualLoss):
|
|
# generalizes Absolute and SquaredHalved
|
|
# plot: https://www.desmos.com/calculator/fagjg9vuz7
|
|
def __init__(self, a=4/3):
|
|
assert 1 <= a <= 2, "parameter out of range"
|
|
self.a = _f(a / 2)
|
|
self.b = _f(2 / a)
|
|
self.c = _f(2 / a - 1)
|
|
|
|
def f(self, r):
|
|
return self.a * np.abs(r)**self.b
|
|
|
|
def df(self, r):
|
|
return np.sign(r) * np.abs(r)**self.c
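
# a quick illustrative sketch (not wired into anything): SomethingElse's f/df
# reduce to Absolute at a=2 and to SquaredHalved at a=1, with a=4/3 sitting
# in between. this only exercises the f/df methods above; the forward/backward
# plumbing is assumed to come from ResidualLoss in onn_core.
def _demo_something_else():
    import numpy as np
    r = np.linspace(-2, 2, 5).astype(np.float32)
    for a in (2.0, 4/3, 1.0):
        loss = SomethingElse(a=a)
        # a=2.0: f(r) == |r|,       df(r) == sign(r)  (Absolute)
        # a=1.0: f(r) == r**2 / 2,  df(r) == r        (SquaredHalved)
        print('a={:.3f}  f={}  df={}'.format(a, loss.f(r), loss.df(r)))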
|
|
|
|
class Confidence(Loss):
|
|
# this isn't "confidence" in any meaningful way; (e.g. Bayesian)
|
|
# it's just a metric of how large the value is of the predicted class.
|
|
# when using it for loss, it acts like a crappy regularizer.
|
|
# it really just measures how much of a hot-shot the network thinks it is.
|
|
|
|
def forward(self, p, y=None):
|
|
categories = p.shape[-1]
|
|
confidence = (np.max(p, axis=-1) - 1/categories) / (1 - 1/categories)
|
|
# the exponent in softmax puts a maximum on confidence,
|
|
# but we don't compensate for that. if necessary,
|
|
# it'd be better to use an activation that doesn't have this limit.
|
|
return np.mean(confidence)
|
|
|
|
def backward(self, p, y=None):
|
|
# in order to agree with the forward pass,
|
|
# using this backwards pass as-is will minimize confidence.
|
|
categories = p.shape[-1]
|
|
detc = p / categories / (1 - 1/categories)
|
|
dmax = p == np.max(p, axis=-1, keepdims=True)
|
|
return detc * dmax
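
# a tiny illustrative check (assuming Loss subclasses here take no constructor
# arguments, like NLL below): forward reports ~0 for a uniform prediction and
# ~1 for a fully one-hot prediction.
def _demo_confidence():
    import numpy as np
    conf = Confidence()
    uniform = np.full((1, 4), 0.25, dtype=np.float32)
    onehot = np.array([[1.0, 0.0, 0.0, 0.0]], dtype=np.float32)
    print('uniform:', conf.forward(uniform))  # ~0.0
    print('one-hot:', conf.forward(onehot))   # ~1.0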
|
|
|
|
class NLL(Loss): # Negative Log Likelihood
|
|
def forward(self, p, y):
|
|
correct = p * y
|
|
return np.mean(-correct)
|
|
|
|
def backward(self, p, y):
|
|
return -y / len(p)
|
|
|
|
# Regularizers {{{1
|
|
|
|
class SaturateRelu(Regularizer):
|
|
# paper: https://arxiv.org/abs/1703.09202
|
|
# TODO: test this (and ActivityRegularizer) more thoroughly.
|
|
# i've looked at the histogram of the resulting weights.
|
|
# it seems like only the layers after this are affected
|
|
# the way they should be.
|
|
|
|
def __init__(self, lamb=0.0):
|
|
self.lamb = _f(lamb)
|
|
|
|
def forward(self, X):
|
|
return self.lamb * np.where(X >= 0, X, 0)
|
|
|
|
def backward(self, X):
|
|
return self.lamb * np.where(X >= 0, 1, 0)
|
|
|
|
# Optimizers {{{1
|
|
|
|
class FTML(Optimizer):
|
|
# paper: http://www.cse.ust.hk/~szhengac/papers/icml17.pdf
|
|
# author's implementation: https://github.com/szhengac/optim/commit/923555e
|
|
|
|
def __init__(self, alpha=0.0025, b1=0.6, b2=0.999, eps=1e-8):
|
|
self.iterations = _0
|
|
self.b1 = _f(b1) # decay term
|
|
self.b2 = _f(b2) # decay term
|
|
self.eps = _f(eps)
|
|
|
|
super().__init__(alpha)
|
|
|
|
def reset(self):
|
|
self.dt1 = None
|
|
self.dt = None
|
|
self.vt = None
|
|
self.zt = None
|
|
self.b1_t = _1
|
|
self.b2_t = _1
|
|
|
|
def compute(self, dW, W):
|
|
if self.dt1 is None: self.dt1 = np.zeros_like(dW)
|
|
if self.dt is None: self.dt = np.zeros_like(dW)
|
|
if self.vt is None: self.vt = np.zeros_like(dW)
|
|
if self.zt is None: self.zt = np.zeros_like(dW)
|
|
|
|
# NOTE: we could probably rewrite these equations to avoid this copy.
|
|
self.dt1[:] = self.dt[:]
|
|
|
|
self.b1_t *= self.b1
|
|
self.b2_t *= self.b2
|
|
|
|
# hardly an elegant solution.
|
|
alpha = max(self.alpha, self.eps)
|
|
|
|
# same as Adam's vt.
|
|
self.vt[:] = self.b2 * self.vt + (1 - self.b2) * dW * dW
|
|
|
|
# you can factor out "inner" out of Adam as well.
|
|
inner = np.sqrt(self.vt / (1 - self.b2_t)) + self.eps
|
|
self.dt[:] = (1 - self.b1_t) / alpha * inner
|
|
|
|
sigma_t = self.dt - self.b1 * self.dt1
|
|
|
|
# Adam's mt minus the sigma term.
|
|
self.zt[:] = self.b1 * self.zt + (1 - self.b1) * dW - sigma_t * W
|
|
|
|
        # subtract the weights to avoid having to override self.update.
|
|
return -self.zt / self.dt - W
|
|
|
|
# Nonparametric Layers {{{1
|
|
|
|
class AlphaDropout(Layer):
|
|
# to be used alongside Selu activations.
|
|
# paper: https://arxiv.org/abs/1706.02515
|
|
|
|
def __init__(self, dropout=0.0, alpha=1.67326324, lamb=1.05070099):
|
|
super().__init__()
|
|
self.alpha = _f(alpha)
|
|
self.lamb = _f(lamb)
|
|
self.saturated = -self.lamb * self.alpha
|
|
self.dropout = _f(dropout)
|
|
|
|
@property
|
|
def dropout(self):
|
|
return self._dropout
|
|
|
|
@dropout.setter
|
|
def dropout(self, x):
|
|
self._dropout = _f(x)
|
|
self.q = 1 - self._dropout
|
|
assert 0 <= self.q <= 1
|
|
|
|
sat = self.saturated
|
|
|
|
self.a = 1 / np.sqrt(self.q + sat * sat * self.q * self._dropout)
|
|
self.b = -self.a * (self._dropout * sat)
|
|
|
|
def forward(self, X):
|
|
self.mask = np.random.rand(*X.shape) < self.q
|
|
return self.a * np.where(self.mask, X, self.saturated) + self.b
|
|
|
|
def forward_deterministic(self, X):
|
|
return X
|
|
|
|
def backward(self, dY):
|
|
return dY * self.a * self.mask
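
# a rough numeric sketch (assuming Layer from onn_core needs no constructor
# arguments): AlphaDropout's affine correction (a, b) should keep roughly
# zero mean and unit variance on standard-normal input, which is the property
# the paper relies on for self-normalizing nets. this calls the training-mode
# forward directly.
def _demo_alpha_dropout():
    import numpy as np
    layer = AlphaDropout(dropout=0.1)
    x = np.random.randn(100000).astype(np.float32)
    y = layer.forward(x)
    print('mean ~ 0:', float(y.mean()), ' var ~ 1:', float(y.var()))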
|
|
|
|
# Activations {{{2
|
|
|
|
class Selu(Layer):
|
|
# paper: https://arxiv.org/abs/1706.02515
|
|
|
|
def __init__(self, alpha=1.67326324, lamb=1.05070099):
|
|
super().__init__()
|
|
self.alpha = _f(alpha)
|
|
self.lamb = _f(lamb)
|
|
|
|
def forward(self, X):
|
|
self.cond = X >= 0
|
|
self.neg = self.alpha * np.exp(X)
|
|
return self.lamb * np.where(self.cond, X, self.neg - self.alpha)
|
|
|
|
def backward(self, dY):
|
|
return dY * self.lamb * np.where(self.cond, 1, self.neg)
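
# a similar numeric sketch for Selu (same Layer assumption as above): for
# zero-mean unit-variance input, the output should stay near zero mean and
# unit variance, which is what the alpha/lamb constants were derived for.
def _demo_selu():
    import numpy as np
    act = Selu()
    x = np.random.randn(100000).astype(np.float32)
    y = act.forward(x)
    print('mean ~ 0:', float(y.mean()), ' var ~ 1:', float(y.var()))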
|
|
|
|
class TanhTest(Layer):
|
|
def forward(self, X):
|
|
self.sig = np.tanh(1 / 2 * X)
|
|
return 2.4004 * self.sig
|
|
|
|
def backward(self, dY):
|
|
return dY * (1 / 2 * 2.4004) * (1 - self.sig * self.sig)
|
|
|
|
# Parametric Layers {{{1
|
|
|
|
class LayerNorm(Layer):
|
|
# paper: https://arxiv.org/abs/1607.06450
|
|
# note: nonparametric when affine == False
|
|
|
|
def __init__(self, eps=1e-5, affine=True):
|
|
super().__init__()
|
|
self.eps = _f(eps)
|
|
self.affine = bool(affine)
|
|
|
|
if self.affine:
|
|
self.gamma = self._new_weights('gamma', init=init_ones)
|
|
self.beta = self._new_weights('beta', init=init_zeros)
|
|
self.serialized = {
|
|
'gamma': 'gamma',
|
|
'beta': 'beta',
|
|
}
|
|
|
|
def make_shape(self, parent):
|
|
shape = parent.output_shape
|
|
self.input_shape = shape
|
|
self.output_shape = shape
|
|
assert len(shape) == 1, shape
|
|
if self.affine:
|
|
self.gamma.shape = (shape[0],)
|
|
self.beta.shape = (shape[0],)
|
|
|
|
def forward(self, X):
|
|
self.mean = X.mean(0)
|
|
self.center = X - self.mean
|
|
self.var = self.center.var(0) + self.eps
|
|
self.std = np.sqrt(self.var)
|
|
|
|
self.Xnorm = self.center / self.std
|
|
if self.affine:
|
|
return self.gamma.f * self.Xnorm + self.beta.f
|
|
return self.Xnorm
|
|
|
|
def backward(self, dY):
|
|
length = dY.shape[0]
|
|
|
|
if self.affine:
|
|
dXnorm = dY * self.gamma.f
|
|
self.gamma.g[:] = (dY * self.Xnorm).sum(0)
|
|
self.beta.g[:] = dY.sum(0)
|
|
else:
|
|
dXnorm = dY
|
|
|
|
dstd = (dXnorm * self.center).sum(0) / -self.var
|
|
dcenter = dXnorm / self.std + dstd / self.std * self.center / length
|
|
dmean = -dcenter.sum(0)
|
|
dX = dcenter + dmean / length
|
|
|
|
return dX
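
# a standalone sanity sketch for LayerNorm with affine=False: forward should
# give roughly zero mean and unit variance along axis 0, and backward should
# agree with a finite-difference gradient. this calls forward/backward
# directly, bypassing the make_shape/weights machinery, and uses float64 so
# the finite difference is meaningful.
def _demo_layernorm():
    import numpy as np
    ln = LayerNorm(affine=False)
    X = np.random.randn(64, 8)
    Y = ln.forward(X)
    print('|mean|:', float(np.abs(Y.mean(0)).max()),
          ' |var - 1|:', float(np.abs(Y.var(0) - 1).max()))

    # finite-difference spot check of backward on a single input element.
    dY = np.random.randn(*Y.shape)
    dX = ln.backward(dY)
    h = 1e-6
    Xp = X.copy(); Xp[0, 0] += h
    Xm = X.copy(); Xm[0, 0] -= h
    num = ((ln.forward(Xp) - ln.forward(Xm)) * dY).sum() / (2 * h)
    print('analytic:', float(dX[0, 0]), ' numeric:', float(num))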
|
|
|
|
class Denses(Layer): # TODO: rename?
|
|
# acts as a separate Dense for each row or column. only for 2D arrays.
|
|
|
|
serialized = {
|
|
'W': 'coeffs',
|
|
'b': 'biases',
|
|
}
|
|
|
|
def __init__(self, dim, init=init_he_uniform, reg_w=None, reg_b=None, axis=-1):
|
|
super().__init__()
|
|
self.dim = int(dim)
|
|
self.weight_init = init
|
|
self.axis = int(axis)
|
|
self.coeffs = self._new_weights('coeffs', init=init, regularizer=reg_w)
|
|
self.biases = self._new_weights('biases', init=init_zeros, regularizer=reg_b)
|
|
|
|
def make_shape(self, parent):
|
|
shape = parent.output_shape
|
|
self.input_shape = shape
|
|
assert len(shape) == 2, shape
|
|
|
|
assert -len(shape) <= self.axis < len(shape)
|
|
self.axis = self.axis % len(shape)
|
|
|
|
self.output_shape = list(shape)
|
|
self.output_shape[self.axis] = self.dim
|
|
self.output_shape = tuple(self.output_shape)
|
|
|
|
in_rows = self.input_shape[0]
|
|
in_cols = self.input_shape[1]
|
|
out_rows = self.output_shape[0]
|
|
out_cols = self.output_shape[1]
|
|
|
|
self.coeffs.shape = (in_rows, in_cols, self.dim)
|
|
self.biases.shape = (1, out_rows, out_cols)
|
|
|
|
def forward(self, X):
|
|
self.X = X
|
|
if self.axis == 0:
|
|
return np.einsum('ixj,xjk->ikj', X, self.coeffs.f) + self.biases.f
|
|
elif self.axis == 1:
|
|
return np.einsum('ijx,jxk->ijk', X, self.coeffs.f) + self.biases.f
|
|
|
|
def backward(self, dY):
|
|
self.biases.g[:] = dY.sum(0, keepdims=True)
|
|
if self.axis == 0:
|
|
self.coeffs.g[:] = np.einsum('ixj,ikj->xjk', self.X, dY)
|
|
return np.einsum('ikj,xjk->ixj', dY, self.coeffs.f)
|
|
elif self.axis == 1:
|
|
self.coeffs.g[:] = np.einsum('ijx,ijk->jxk', self.X, dY)
|
|
return np.einsum('ijk,jxk->ijx', dY, self.coeffs.f)
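
# a plain-numpy sketch of what the axis=1 einsum above computes: each of the
# in_rows rows gets its own (in_cols -> dim) dense matrix, i.e. a batched
# matmul over the row axis. (the axis=0 case is the same idea transposed.)
def _demo_denses_einsum():
    import numpy as np
    batch, in_rows, in_cols, dim = 5, 3, 4, 6
    X = np.random.randn(batch, in_rows, in_cols)
    W = np.random.randn(in_rows, in_cols, dim)
    out = np.einsum('ijx,jxk->ijk', X, W)
    loop = np.stack([X[:, j, :].dot(W[j]) for j in range(in_rows)], axis=1)
    print('einsum matches per-row matmul:', np.allclose(out, loop))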
|
|
|
|
class DenseOneLess(Dense):
|
|
def init(self, allocator):
|
|
super().init(allocator)
|
|
ins, outs = self.input_shape[0], self.output_shape[0]
|
|
assert ins == outs, (ins, outs)
|
|
|
|
def forward(self, X):
|
|
np.fill_diagonal(self.coeffs.f, 0)
|
|
self.X = X
|
|
        return X.dot(self.coeffs.f) + self.biases.f
|
|
|
|
def backward(self, dY):
|
|
self.coeffs.g[:] = self.X.T.dot(dY)
|
|
self.biases.g[:] = dY.sum(0, keepdims=True)
|
|
np.fill_diagonal(self.coeffs.g, 0)
|
|
return dY.dot(self.coeffs.f.T)
|
|
|
|
class CosineDense(Dense):
|
|
# paper: https://arxiv.org/abs/1702.05870
|
|
# another implementation: https://github.com/farizrahman4u/keras-contrib/pull/36
|
|
# the paper doesn't mention bias,
|
|
# so we treat bias as an additional weight with a constant input of 1.
|
|
# this is correct in Dense layers, so i hope it's correct here too.
|
|
|
|
eps = 1e-4
|
|
|
|
def forward(self, X):
|
|
self.X = X
|
|
self.X_norm = np.sqrt(np.square(X).sum(-1, keepdims=True) \
|
|
+ 1 + self.eps)
|
|
self.W_norm = np.sqrt(np.square(self.coeffs.f).sum(0, keepdims=True) \
|
|
+ np.square(self.biases.f) + self.eps)
|
|
self.dot = X.dot(self.coeffs.f) + self.biases.f
|
|
Y = self.dot / (self.X_norm * self.W_norm)
|
|
return Y
|
|
|
|
def backward(self, dY):
|
|
ddot = dY / self.X_norm / self.W_norm
|
|
dX_norm = -(dY * self.dot / self.W_norm).sum(-1, keepdims=True) / self.X_norm**2
|
|
dW_norm = -(dY * self.dot / self.X_norm).sum( 0, keepdims=True) / self.W_norm**2
|
|
|
|
self.coeffs.g[:] = self.X.T.dot(ddot) \
|
|
+ dW_norm / self.W_norm * self.coeffs.f
|
|
self.biases.g[:] = ddot.sum(0, keepdims=True) \
|
|
+ dW_norm / self.W_norm * self.biases.f
|
|
dX = ddot.dot(self.coeffs.f.T) + dX_norm / self.X_norm * self.X
|
|
|
|
return dX
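
# a plain-numpy sketch of CosineDense's forward math, checking the claim in
# the comments above: treating the bias as one extra weight against a constant
# input of 1 gives exactly the same cosine-normalized output. this bypasses
# the Dense weight machinery entirely.
def _demo_cosine_dense_math():
    import numpy as np
    eps = 1e-4
    X = np.random.randn(5, 3)
    W = np.random.randn(3, 4)
    b = np.random.randn(1, 4)

    # the form used by CosineDense.forward above.
    X_norm = np.sqrt(np.square(X).sum(-1, keepdims=True) + 1 + eps)
    W_norm = np.sqrt(np.square(W).sum(0, keepdims=True) + np.square(b) + eps)
    Y = (X.dot(W) + b) / (X_norm * W_norm)

    # the same thing via explicit augmentation with a constant-1 input column.
    Xa = np.concatenate([X, np.ones((5, 1))], axis=1)
    Wa = np.concatenate([W, b], axis=0)
    Ya = Xa.dot(Wa) / (np.sqrt(np.square(Xa).sum(-1, keepdims=True) + eps)
                       * np.sqrt(np.square(Wa).sum(0, keepdims=True) + eps))
    print('augmented form matches:', np.allclose(Y, Ya))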
|
|
|
|
# Rituals {{{1
|
|
|
|
def stochastic_multiply(W, gamma=0.5, allow_negation=False):
|
|
# paper: https://arxiv.org/abs/1606.01981
|
|
|
|
assert W.ndim == 1, W.ndim
|
|
assert 0 < gamma < 1, gamma
|
|
size = len(W)
|
|
alpha = np.max(np.abs(W))
|
|
# NOTE: numpy gives [low, high) but the paper advocates [low, high]
|
|
mult = np.random.uniform(gamma, 1/gamma, size=size)
|
|
if allow_negation:
|
|
# NOTE: i have yet to see this do anything but cause divergence.
|
|
# i've referenced the paper several times yet still don't understand
|
|
# what i'm doing wrong, so i'm disabling it by default in my code.
|
|
# maybe i just need *a lot* more weights to compensate.
|
|
prob = (W / alpha + 1) / 2
|
|
samples = np.random.random_sample(size=size)
|
|
mult *= np.where(samples < prob, 1, -1)
|
|
np.multiply(W, mult, out=W)
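
# a tiny demo of stochastic_multiply: weights are scaled in place by factors
# drawn from [gamma, 1/gamma], so their magnitudes jitter but stay within a
# bounded ratio of the originals (with allow_negation left off).
def _demo_stochastic_multiply():
    import numpy as np
    W = np.random.randn(8).astype(np.float32)
    before = W.copy()
    stochastic_multiply(W, gamma=0.5)
    ratio = W / before
    print('multipliers:', ratio)
    print('min/max multiplier:', float(ratio.min()), float(ratio.max()))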
|
|
|
|
class StochMRitual(Ritual):
|
|
# paper: https://arxiv.org/abs/1606.01981
|
|
# this probably doesn't make sense for regression problems,
|
|
# let alone small models, but here it is anyway!
|
|
|
|
def __init__(self, learner=None, loss=None, mloss=None, gamma=0.5):
|
|
super().__init__(learner, loss, mloss)
|
|
self.gamma = _f(gamma)
|
|
|
|
def prepare(self, model):
|
|
self.W = np.copy(model.W)
|
|
super().prepare(model)
|
|
|
|
def learn(self, inputs, outputs):
|
|
# an experiment:
|
|
#assert self.learner.rate < 10, self.learner.rate
|
|
#self.gamma = 1 - 1/2**(1 - np.log10(self.learner.rate))
|
|
|
|
self.W[:] = self.model.W
|
|
for layer in self.model.ordered_nodes:
|
|
if isinstance(layer, Dense):
|
|
                stochastic_multiply(layer.coeffs.f.ravel(), gamma=self.gamma)
|
|
residual = super().learn(inputs, outputs)
|
|
self.model.W[:] = self.W
|
|
return residual
|
|
|
|
def update(self):
|
|
super().update()
|
|
f = 0.5
|
|
for layer in self.model.ordered_nodes:
|
|
if isinstance(layer, Dense):
|
|
np.clip(layer.W, -layer.std * f, layer.std * f, out=layer.W)
|
|
# np.clip(layer.W, -1, 1, out=layer.W)
|
|
|
|
class NoisyRitual(Ritual):
|
|
def __init__(self, learner=None, loss=None, mloss=None,
|
|
input_noise=0, output_noise=0, gradient_noise=0):
|
|
self.input_noise = _f(input_noise)
|
|
self.output_noise = _f(output_noise)
|
|
self.gradient_noise = _f(gradient_noise)
|
|
super().__init__(learner, loss, mloss)
|
|
|
|
def learn(self, inputs, outputs):
|
|
# this is pretty crude
|
|
if self.input_noise > 0:
|
|
s = self.input_noise
|
|
inputs = inputs + np.random.normal(0, s, size=inputs.shape)
|
|
if self.output_noise > 0:
|
|
s = self.output_noise
|
|
outputs = outputs + np.random.normal(0, s, size=outputs.shape)
|
|
return super().learn(inputs, outputs)
|
|
|
|
def update(self):
|
|
# gradient noise paper: https://arxiv.org/abs/1511.06807
|
|
if self.gradient_noise > 0:
|
|
size = len(self.model.dW)
|
|
gamma = 0.55
|
|
#s = self.gradient_noise / (1 + self.bn) ** gamma
|
|
# experiments:
|
|
s = self.gradient_noise * np.sqrt(self.learner.rate)
|
|
#s = np.square(self.learner.rate)
|
|
#s = self.learner.rate / self.en
|
|
self.model.dW += np.random.normal(0, max(s, 1e-8), size=size)
|
|
super().update()
|
|
|
|
# Learners {{{1
|
|
|
|
class DumbLearner(AnnealingLearner):
|
|
# this is my own awful contraption. it's not really "SGD with restarts".
|
|
def __init__(self, optim, epochs=100, rate=None, halve_every=10,
|
|
restarts=0, restart_advance=20, callback=None):
|
|
self.restart_epochs = int(epochs)
|
|
self.restarts = int(restarts)
|
|
self.restart_advance = float(restart_advance)
|
|
self.restart_callback = callback
|
|
epochs = self.restart_epochs * (self.restarts + 1)
|
|
super().__init__(optim, epochs, rate, halve_every)
|
|
|
|
def rate_at(self, epoch):
|
|
sub_epoch = epoch % self.restart_epochs
|
|
restart = epoch // self.restart_epochs
|
|
return super().rate_at(sub_epoch) * (self.anneal**self.restart_advance)**restart
|
|
|
|
def next(self):
|
|
if not super().next():
|
|
return False
|
|
sub_epoch = self.epoch % self.restart_epochs
|
|
restart = self.epoch // self.restart_epochs
|
|
if restart > 0 and sub_epoch == 0:
|
|
if self.restart_callback is not None:
|
|
self.restart_callback(restart)
|
|
return True
|
|
|
|
# Components {{{1
|
|
|
|
def _mr_make_norm(norm):
|
|
def _mr_norm(y, width, depth, block, multi, activation, style, FC, d):
|
|
skip = y
|
|
merger = Sum()
|
|
skip.feed(merger)
|
|
z_start = skip
|
|
z_start = z_start.feed(norm())
|
|
z_start = z_start.feed(activation())
|
|
for _ in range(multi):
|
|
z = z_start
|
|
for j in range(block):
|
|
if j > 0:
|
|
z = z.feed(norm())
|
|
z = z.feed(activation())
|
|
z = z.feed(FC())
|
|
z.feed(merger)
|
|
y = merger
|
|
return y
|
|
return _mr_norm
|
|
|
|
def _mr_batchless(y, width, depth, block, multi, activation, style, FC, d):
|
|
skip = y
|
|
merger = Sum()
|
|
skip.feed(merger)
|
|
z_start = skip.feed(activation())
|
|
for _ in range(multi):
|
|
z = z_start
|
|
for j in range(block):
|
|
if j > 0:
|
|
z = z.feed(activation())
|
|
z = z.feed(FC())
|
|
z.feed(merger)
|
|
y = merger
|
|
return y
|
|
|
|
def _mr_onelesssum(y, width, depth, block, multi, activation, style, FC, d):
|
|
# this is my own awful contraption.
|
|
is_last = d + 1 == depth
|
|
needs_sum = not is_last or multi > 1
|
|
skip = y
|
|
if needs_sum:
|
|
merger = Sum()
|
|
if not is_last:
|
|
skip.feed(merger)
|
|
z_start = skip.feed(activation())
|
|
for _ in range(multi):
|
|
z = z_start
|
|
for j in range(block):
|
|
if j > 0:
|
|
z = z.feed(activation())
|
|
z = z.feed(FC())
|
|
if needs_sum:
|
|
z.feed(merger)
|
|
if needs_sum:
|
|
y = merger
|
|
else:
|
|
y = z
|
|
return y
|
|
|
|
_mr_styles = dict(
|
|
lnorm=_mr_make_norm(LayerNorm),
|
|
batchless=_mr_batchless,
|
|
onelesssum=_mr_onelesssum,
|
|
)
|
|
|
|
def multiresnet(x, width, depth, block=2, multi=1,
|
|
activation=Relu, style='batchless',
|
|
init=init_he_normal):
|
|
if style == 'cossim':
|
|
style = 'batchless'
|
|
DenseClass = CosineDense
|
|
else:
|
|
DenseClass = Dense
|
|
if style not in _mr_styles:
|
|
raise Exception('unknown resnet style', style)
|
|
|
|
y = x
|
|
last_size = x.output_shape[0]
|
|
|
|
for d in range(depth):
|
|
size = width
|
|
FC = lambda: DenseClass(size, init)
|
|
|
|
if last_size != size:
|
|
y = y.feed(FC())
|
|
|
|
y = _mr_styles[style](y, width, depth, block, multi, activation, style, FC, d)
|
|
|
|
last_size = size
|
|
|
|
return y
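
# a minimal usage sketch mirroring what model_from_config does further down:
# build a small multiresnet on a 4-feature input, cap it with a Dense layer
# down to 1 output, and count parameters. Input, Dense, Relu, Model, and
# init_he_normal are all assumed to come from onn_core, as they do below.
def _demo_multiresnet():
    x = Input(shape=(4,))
    y = multiresnet(x, width=12, depth=1, block=2, multi=1,
                    activation=Relu, style='batchless')
    if y.output_shape[0] != 1:
        y = y.feed(Dense(1, init_he_normal))
    model = Model(x, y, unsafe=False)
    log('demo parameters', model.param_count)
    return model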
|
|
|
|
# Toy Data {{{1
|
|
|
|
inits = dict(he_normal=init_he_normal, he_uniform=init_he_uniform,
|
|
glorot_normal=init_glorot_normal, glorot_uniform=init_glorot_uniform,
|
|
gaussian_unit=init_gaussian_unit)
|
|
activations = dict(sigmoid=Sigmoid, tanh=Tanh, lecun=LeCunTanh,
|
|
relu=Relu, elu=Elu, gelu=GeluApprox, selu=Selu,
|
|
softplus=Softplus)
|
|
|
|
def prettyize(data):
|
|
if isinstance(data, np.ndarray):
|
|
s = ', '.join(('{:8.2e}'.format(n) for n in data))
|
|
s = '[' + s + ']'
|
|
else:
|
|
s = '{:8.2e}'.format(data)
|
|
return s
|
|
|
|
def normalize_data(data, mean=None, std=None):
|
|
# in-place
|
|
if mean is None or std is None:
|
|
mean = np.mean(data, axis=0)
|
|
std = np.std(data, axis=0)
|
|
mean_str = prettyize(mean)
|
|
std_str = prettyize(std)
|
|
lament('nod(...,\n {},\n {})'.format(mean_str, std_str))
|
|
sys.exit(1)
|
|
data -= _f(mean)
|
|
data /= _f(std)
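
# a small sketch of the explicit-constants path: with mean/std given, the
# helper just standardizes in place; with them omitted, it prints the
# constants it measured (for hard-coding into toy_data below) and exits.
def _demo_normalize_data():
    import numpy as np
    data = np.array([[1.0, 10.0], [3.0, 30.0]], dtype=np.float32)
    normalize_data(data, mean=[2.0, 20.0], std=[1.0, 10.0])
    print(data)  # [[-1. -1.] [ 1.  1.]]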
|
|
|
|
def toy_data(train_samples, valid_samples, problem=2):
|
|
total_samples = train_samples + valid_samples
|
|
|
|
nod = normalize_data # shorthand to keep a sane indentation
|
|
|
|
if problem == 0:
|
|
from ml.cie_mlp_data import inputs, outputs, valid_inputs, valid_outputs
|
|
inputs, outputs = _f(inputs), _f(outputs)
|
|
valid_inputs, valid_outputs = _f(valid_inputs), _f(valid_outputs)
|
|
|
|
nod(inputs, 127.5, 73.9)
|
|
nod(outputs, 44.8, 21.7)
|
|
nod(valid_inputs, 127.5, 73.9)
|
|
nod(valid_outputs, 44.8, 21.7)
|
|
|
|
elif problem == 1:
|
|
from sklearn.datasets import make_friedman1
|
|
inputs, outputs = make_friedman1(total_samples)
|
|
inputs, outputs = _f(inputs), _f(outputs)
|
|
outputs = np.expand_dims(outputs, -1)
|
|
|
|
nod(inputs, 0.5, 1/np.sqrt(12))
|
|
nod(outputs, 14.4, 4.9)
|
|
|
|
elif problem == 2:
|
|
from sklearn.datasets import make_friedman2
|
|
inputs, outputs = make_friedman2(total_samples)
|
|
inputs, outputs = _f(inputs), _f(outputs)
|
|
outputs = np.expand_dims(outputs, -1)
|
|
|
|
nod(inputs,
|
|
[5.00e+01, 9.45e+02, 5.01e-01, 5.98e+00],
|
|
[2.89e+01, 4.72e+02, 2.89e-01, 2.87e+00])
|
|
|
|
nod(outputs, [482], [380])
|
|
|
|
elif problem == 3:
|
|
from sklearn.datasets import make_friedman3
|
|
inputs, outputs = make_friedman3(total_samples)
|
|
inputs, outputs = _f(inputs), _f(outputs)
|
|
outputs = np.expand_dims(outputs, -1)
|
|
|
|
nod(inputs,
|
|
[4.98e+01, 9.45e+02, 4.99e-01, 6.02e+00],
|
|
[2.88e+01, 4.73e+02, 2.90e-01, 2.87e+00])
|
|
|
|
nod(outputs, [1.32327931], [0.31776295])
|
|
|
|
else:
|
|
raise Exception("unknown toy data set", problem)
|
|
|
|
if problem != 0:
|
|
# split off a validation set
|
|
indices = np.arange(inputs.shape[0])
|
|
np.random.shuffle(indices)
|
|
valid_inputs = inputs[indices][-valid_samples:]
|
|
valid_outputs = outputs[indices][-valid_samples:]
|
|
inputs = inputs[indices][:-valid_samples]
|
|
outputs = outputs[indices][:-valid_samples]
|
|
|
|
return (inputs, outputs), (valid_inputs, valid_outputs)
|
|
|
|
# Model Creation {{{1
|
|
|
|
def optim_from_config(config):
|
|
if config.optim == 'adam':
|
|
d1 = config.optim_decay1 if 'optim_decay1' in config else 9.5
|
|
d2 = config.optim_decay2 if 'optim_decay2' in config else 999.5
|
|
b1 = np.exp(-1/d1)
|
|
b2 = np.exp(-1/d2)
|
|
o = Nadam if config.nesterov else Adam
|
|
optim = o(b1=b1, b2=b2)
|
|
elif config.optim == 'ftml':
|
|
d1 = config.optim_decay1 if 'optim_decay1' in config else 2
|
|
d2 = config.optim_decay2 if 'optim_decay2' in config else 999.5
|
|
b1 = np.exp(-1/d1)
|
|
b2 = np.exp(-1/d2)
|
|
optim = FTML(b1=b1, b2=b2)
|
|
elif config.optim in ('rms', 'rmsprop'):
|
|
d2 = config.optim_decay2 if 'optim_decay2' in config else 99.5
|
|
mu = np.exp(-1/d2)
|
|
optim = RMSprop(mu=mu)
|
|
elif config.optim == 'sgd':
|
|
d1 = config.optim_decay1 if 'optim_decay1' in config else 0
|
|
if d1 > 0:
|
|
b1 = np.exp(-1/d1)
|
|
optim = Momentum(mu=b1, nesterov=config.nesterov)
|
|
else:
|
|
optim = Optimizer()
|
|
else:
|
|
raise Exception('unknown optimizer', config.optim)
|
|
|
|
return optim
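
# the optim_decay* constants above are interpreted as a time scale d and
# mapped to a beta via b = exp(-1/d): after d steps, an old gradient's
# contribution has decayed by a factor of e. a short sketch of the mapping
# (e.g. d=24 gives a beta of about 0.959, d=100 gives about 0.990):
def _demo_decay_to_beta():
    import numpy as np
    for d in (2, 24, 100, 999.5):
        b = np.exp(-1 / d)
        print('d={:7.1f}  beta={:8.6f}  beta**d={:6.4f}'.format(d, b, b**d))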
|
|
|
|
def learner_from_config(config, optim, rscb):
|
|
if config.learner == 'sgdr':
|
|
expando = config.expando if 'expando' in config else None
|
|
learner = SGDR(optim, epochs=config.epochs, rate=config.learn,
|
|
restart_decay=config.restart_decay, restarts=config.restarts,
|
|
callback=rscb, expando=expando)
|
|
# final learning rate isn't of interest here; it's gonna be close to 0.
|
|
log('total epochs', learner.epochs)
|
|
elif config.learner == 'anneal':
|
|
learner = AnnealingLearner(optim, epochs=config.epochs, rate=config.learn,
|
|
halve_every=config.learn_halve_every)
|
|
log("final learning rate", "{:10.8f}".format(learner.final_rate))
|
|
elif config.learner == 'dumb':
|
|
learner = DumbLearner(optim, epochs=config.epochs, rate=config.learn,
|
|
halve_every=config.learn_halve_every,
|
|
restarts=config.restarts,
|
|
restart_advance=config.learn_restart_advance,
|
|
callback=rscb)
|
|
log("final learning rate", "{:10.8f}".format(learner.final_rate))
|
|
elif config.learner == 'sgd':
|
|
learner = Learner(optim, epochs=config.epochs, rate=config.learn)
|
|
else:
|
|
raise Exception('unknown learner', config.learner)
|
|
|
|
return learner
|
|
|
|
def lookup_loss(maybe_name):
|
|
if isinstance(maybe_name, Loss):
|
|
return maybe_name
|
|
elif maybe_name == 'mse':
|
|
return Squared()
|
|
elif maybe_name == 'mshe': # mushy
|
|
return SquaredHalved()
|
|
elif maybe_name == 'mae':
|
|
return Absolute()
|
|
elif maybe_name == 'msee':
|
|
return SomethingElse()
|
|
raise Exception('unknown objective', maybe_name)
|
|
|
|
def ritual_from_config(config, learner, loss, mloss):
|
|
if config.ritual == 'default':
|
|
ritual = Ritual(learner=learner, loss=loss, mloss=mloss)
|
|
elif config.ritual == 'stochm':
|
|
ritual = StochMRitual(learner=learner, loss=loss, mloss=mloss)
|
|
elif config.ritual == 'noisy':
|
|
ritual = NoisyRitual(learner=learner, loss=loss, mloss=mloss,
|
|
input_noise=1e-1, output_noise=1e-2,
|
|
gradient_noise=2e-7)
|
|
else:
|
|
raise Exception('unknown ritual', config.ritual)
|
|
|
|
return ritual
|
|
|
|
def model_from_config(config, input_features, output_features, callbacks):
|
|
init = inits[config.init]
|
|
activation = activations[config.activation]
|
|
|
|
x = Input(shape=(input_features,))
|
|
y = x
|
|
y = multiresnet(y,
|
|
config.res_width, config.res_depth,
|
|
config.res_block, config.res_multi,
|
|
activation=activation, init=init,
|
|
style=config.parallel_style)
|
|
if y.output_shape[0] != output_features:
|
|
y = y.feed(Dense(output_features, init))
|
|
|
|
model = Model(x, y, unsafe=config.unsafe)
|
|
|
|
if config.fn_load is not None:
|
|
log('loading weights', config.fn_load)
|
|
model.load_weights(config.fn_load)
|
|
|
|
optim = optim_from_config(config)
|
|
|
|
def rscb(restart):
|
|
callbacks.restart()
|
|
log("restarting", restart)
|
|
if config.restart_optim:
|
|
optim.reset()
|
|
|
|
learner = learner_from_config(config, optim, rscb)
|
|
|
|
loss = lookup_loss(config.loss)
|
|
mloss = lookup_loss(config.mloss) if config.mloss else loss
|
|
|
|
ritual = ritual_from_config(config, learner, loss, mloss)
|
|
|
|
return model, learner, ritual
|
|
|
|
# main program {{{1
|
|
|
|
def run(program, args=None):
|
|
args = args if args else []
|
|
|
|
np.random.seed(42069)
|
|
|
|
# Config {{{2
|
|
|
|
from dotmap import DotMap
|
|
config = DotMap(
|
|
fn_load = None,
|
|
fn_save = 'optim_nn.h5',
|
|
log_fn = 'losses.npz',
|
|
|
|
# multi-residual network parameters
|
|
res_width = 28,
|
|
res_depth = 2,
|
|
res_block = 3, # normally 2 for plain resnet
|
|
res_multi = 2, # normally 1 for plain resnet
|
|
|
|
# style of resnet (order of layers, which layers, etc.)
|
|
parallel_style = 'onelesssum',
|
|
activation = 'gelu',
|
|
|
|
#optim = 'ftml',
|
|
#optim_decay1 = 2,
|
|
#optim_decay2 = 100,
|
|
#nesterov = False,
|
|
optim = 'adam', # note: most features only implemented for Adam
|
|
optim_decay1 = 24, # first momentum given in epochs (optional)
|
|
optim_decay2 = 100, # second momentum given in epochs (optional)
|
|
nesterov = True,
|
|
batch_size = 64,
|
|
|
|
# learning parameters
|
|
learner = 'sgdr',
|
|
learn = 0.00125,
|
|
epochs = 24,
|
|
learn_halve_every = 16, # only used with anneal/dumb
|
|
restarts = 4,
|
|
restart_decay = 0.25, # only used with SGDR
|
|
expando = lambda i: 24 * i,
|
|
|
|
# misc
|
|
init = 'he_normal',
|
|
loss = 'mse',
|
|
mloss = 'mse',
|
|
ritual = 'default',
|
|
restart_optim = False, # restarts also reset internal state of optimizer
|
|
warmup = False, # train a couple epochs on gaussian noise and reset
|
|
|
|
# logging/output
|
|
        log10_loss = True, # personally, i'm sick of looking at linear loss values!
|
|
#fancy_logs = True, # unimplemented (can't turn it off yet)
|
|
|
|
problem = 2,
|
|
compare = (
|
|
# best results for ~10,000 parameters
|
|
# training/validation pairs for each problem (starting from problem 0):
|
|
(10**-3.120, 10**-2.901),
|
|
# 1080 epochs on these...
|
|
(10**-6.747, 10**-6.555),
|
|
(10**-7.774, 10**-7.626),
|
|
(10**-6.278, 10**-5.234), # overfitting? bad valid set?
|
|
),
|
|
|
|
unsafe = True, # aka gotta go fast mode
|
|
)
|
|
|
|
for k in ['parallel_style', 'activation', 'optim', 'learner',
|
|
'init', 'loss', 'mloss', 'ritual']:
|
|
config[k] = config[k].lower()
|
|
|
|
config.learn *= np.sqrt(config.batch_size)
|
|
|
|
config.pprint()
|
|
|
|
# Toy Data {{{2
|
|
|
|
(inputs, outputs), (valid_inputs, valid_outputs) = \
|
|
toy_data(2**14, 2**11, problem=config.problem)
|
|
input_features = inputs.shape[-1]
|
|
output_features = outputs.shape[-1]
|
|
|
|
# Our Test Model
|
|
|
|
callbacks = Dummy()
|
|
|
|
model, learner, ritual = \
|
|
model_from_config(config, input_features, output_features, callbacks)
|
|
|
|
# Model Information {{{2
|
|
|
|
for node in model.ordered_nodes:
|
|
children = [str(n) for n in node.children]
|
|
if children:
|
|
sep = '->'
|
|
print(str(node) + sep + ('\n' + str(node) + sep).join(children))
|
|
log('parameters', model.param_count)
|
|
|
|
# Training {{{2
|
|
|
|
batch_losses = []
|
|
train_losses = []
|
|
valid_losses = []
|
|
|
|
def measure_error():
|
|
def print_error(name, inputs, outputs, comparison=None):
|
|
predicted = model.forward(inputs)
|
|
err = ritual.measure(predicted, outputs)
|
|
if config.log10_loss:
|
|
print(name, "{:12.6e}".format(err))
|
|
if comparison:
|
|
err10 = np.log10(err)
|
|
cmp10 = np.log10(comparison)
|
|
color = '\x1B[31m' if err10 > cmp10 else '\x1B[32m'
|
|
log(name + " log10-loss", "{:+6.3f} {}({:+6.3f})\x1B[0m".format(err10, color, err10 - cmp10))
|
|
else:
|
|
log(name + " log10-loss", "{:+6.3f}".format(err, np.log10(err)))
|
|
else:
|
|
log(name + " loss", "{:12.6e}".format(err))
|
|
if comparison:
|
|
fmt = "10**({:+7.4f}) times"
|
|
log("improvement", fmt.format(np.log10(comparison / err)))
|
|
return err
|
|
|
|
train_err = print_error("train",
|
|
inputs, outputs,
|
|
config.compare[config.problem][0])
|
|
valid_err = print_error("valid",
|
|
valid_inputs, valid_outputs,
|
|
config.compare[config.problem][1])
|
|
train_losses.append(train_err)
|
|
valid_losses.append(valid_err)
|
|
|
|
callbacks.restart = measure_error
|
|
|
|
training = config.epochs > 0 and config.restarts >= 0
|
|
|
|
ritual.prepare(model)
|
|
|
|
if training and config.warmup and not config.fn_load:
|
|
log("warming", "up")
|
|
|
|
# use plain SGD in warmup to prevent (or possibly cause?) numeric issues
|
|
temp_optim = learner.optim
|
|
temp_loss = ritual.loss
|
|
learner.optim = Optimizer(alpha=0.001)
|
|
ritual.loss = Absolute() # less likely to blow up; more general
|
|
|
|
# NOTE: experiment: trying const batches and batch_size
|
|
bs = 256
|
|
target = 1 * 1024 * 1024
|
|
        # 4 being sizeof(float32); "batches" is really a sample count, rounded to a multiple of bs.
|
|
batches = (target / 4 / np.prod(inputs.shape[1:])) // bs * bs
|
|
ins = [int(batches)] + list( inputs.shape[1:])
|
|
outs = [int(batches)] + list(outputs.shape[1:])
|
|
|
|
for _ in range(4):
|
|
ritual.train_batched(
|
|
np.random.normal(size=ins),
|
|
np.random.normal(size=outs),
|
|
batch_size=bs)
|
|
ritual.reset()
|
|
|
|
learner.optim = temp_optim
|
|
ritual.loss = temp_loss
|
|
|
|
if training:
|
|
measure_error()
|
|
|
|
while training and learner.next():
|
|
avg_loss, losses = ritual.train_batched(
|
|
inputs, outputs,
|
|
config.batch_size,
|
|
return_losses=True)
|
|
batch_losses += losses
|
|
|
|
if config.log10_loss:
|
|
fmt = "epoch {:4.0f}, rate {:10.8f}, log10-loss {:+6.3f}"
|
|
log("info", fmt.format(learner.epoch, learner.rate, np.log10(avg_loss)),
|
|
update=True)
|
|
else:
|
|
fmt = "epoch {:4.0f}, rate {:10.8f}, loss {:12.6e}"
|
|
log("info", fmt.format(learner.epoch, learner.rate, avg_loss),
|
|
update=True)
|
|
|
|
measure_error()
|
|
|
|
if training and config.fn_save is not None:
|
|
log('saving weights', config.fn_save)
|
|
model.save_weights(config.fn_save, overwrite=True)
|
|
|
|
if training and config.log_fn is not None:
|
|
log('saving losses', config.log_fn)
|
|
np.savez_compressed(config.log_fn,
|
|
batch_losses=np.array(batch_losses, dtype=_f),
|
|
train_losses=np.array(train_losses, dtype=_f),
|
|
valid_losses=np.array(valid_losses, dtype=_f))
|
|
|
|
# Evaluation {{{2
|
|
# TODO: write this portion again
|
|
|
|
return 0
|
|
|
|
# run main program {{{1
|
|
|
|
if __name__ == '__main__':
|
|
sys.exit(run(sys.argv[0], sys.argv[1:]))
|