# optim/optim_nn_core.py

import numpy as np

_f = np.float32

# just for speed, not strictly essential:
from scipy.special import expit as sigmoid

# used for numbering layers like Keras:
from collections import defaultdict
_layer_counters = defaultdict(lambda: 0)

# warning printer used by load_weights/save_weights.
# (plain print is a stand-in so this file runs standalone.)
lament = print

def _check(a):
    assert isinstance(a, np.ndarray) or type(a) == _f, type(a)
    assert a.dtype == _f, a.dtype
    return a

_0 = _f(0)
_1 = _f(1)
_2 = _f(2)
_inv2 = _f(1/2)
_sqrt2 = _f(np.sqrt(2))
_invsqrt2 = _f(1/np.sqrt(2))
_pi = _f(np.pi)

class LayerIncompatibility(Exception):
    pass

# Initializations {{{1
# note: these are currently only implemented for 2D shapes.

def init_he_normal(size, ins, outs):
    s = np.sqrt(2 / ins)
    return np.random.normal(0, s, size=size)

def init_he_uniform(size, ins, outs):
    s = np.sqrt(6 / ins)
    return np.random.uniform(-s, s, size=size)

def init_glorot_normal(size, ins, outs):
    s = np.sqrt(2 / (ins + outs))
    return np.random.normal(0, s, size=size)

def init_glorot_uniform(size, ins, outs):
    s = np.sqrt(6 / (ins + outs))
    return np.random.uniform(-s, s, size=size)

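# illustrative sketch (not part of the original library): the initializers take the
# total element count plus fan-in/fan-out; he init targets a variance of 2/fan_in,
# glorot targets 2/(fan_in + fan_out). _demo_init_variance is a demo-only helper.
def _demo_init_variance():
    w = init_he_normal(100000, ins=50, outs=20)
    return np.var(w), 2 / 50  # empirical variance should be close to 2/fan_in
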
# Loss functions {{{1

class Loss:
    pass

class CategoricalCrossentropy(Loss):
    # lifted from theano

    def __init__(self, eps=1e-6):
        self.eps = _f(eps)

    def forward(self, p, y):
        # TODO: assert dimensionality and p > 0 (if not self.unsafe?)
        p = np.clip(p, self.eps, 1 - self.eps)
        f = np.sum(-y * np.log(p) - (1 - y) * np.log(1 - p), axis=-1)
        return np.mean(f)

    def backward(self, p, y):
        p = np.clip(p, self.eps, 1 - self.eps)
        df = (p - y) / (p * (1 - p))
        return df / len(y)

class Accuracy(Loss):
    # returns the fraction of categories correctly predicted.
    # utilizes argmax(), so it cannot be used for gradient descent.
    # use CategoricalCrossentropy for that instead.

    def forward(self, p, y):
        correct = np.argmax(p, axis=-1) == np.argmax(y, axis=-1)
        return np.mean(correct)

    def backward(self, p, y):
        raise NotImplementedError("cannot take the gradient of Accuracy")

class ResidualLoss(Loss):
    def forward(self, p, y):
        return np.mean(self.f(p - y))

    def backward(self, p, y):
        ret = self.df(p - y) / len(y)
        return ret

class Squared(ResidualLoss):
    def f(self, r):
        return np.square(r)

    def df(self, r):
        return 2 * r

class Absolute(ResidualLoss):
    def f(self, r):
        return np.abs(r)

    def df(self, r):
        return np.sign(r)

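# illustrative sketch (not part of the original library): a quick finite-difference
# check that CategoricalCrossentropy.backward really is the gradient of .forward().
# names prefixed with _demo_ are additions for demonstration only.
def _demo_crossentropy_gradcheck(eps=_f(1e-3)):
    np.random.seed(42)
    loss = CategoricalCrossentropy()
    p = np.random.uniform(0.1, 0.9, size=(4, 3)).astype(_f)
    y = np.eye(3, dtype=_f)[np.random.randint(0, 3, size=4)]
    analytic = loss.backward(p, y)
    numeric = np.zeros_like(p)
    for idx in np.ndindex(*p.shape):
        pp = p.copy(); pp[idx] += eps
        pm = p.copy(); pm[idx] -= eps
        numeric[idx] = (loss.forward(pp, y) - loss.forward(pm, y)) / (2 * eps)
    return np.max(np.abs(analytic - numeric))  # should be small (~1e-3 in float32)
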
# Optimizers {{{1

class Optimizer:
    def __init__(self, alpha=0.1):
        self.alpha = _f(alpha) # learning rate
        self.reset()

    def reset(self):
        pass

    def compute(self, dW, W):
        return -self.alpha * dW

    def update(self, dW, W):
        W += self.compute(dW, W)

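# illustrative sketch (not part of the original library): how the Optimizer interface
# is driven — compute() proposes a delta from the gradient, update() applies it to
# the flat weight vector in place. plain SGD on a toy quadratic objective:
def _demo_optimizer_step():
    W = np.array([3.0, -2.0], dtype=_f)
    opt = Optimizer(alpha=0.1)
    for _ in range(100):
        dW = 2 * W          # gradient of sum(W**2)
        opt.update(dW, W)   # W += -alpha * dW, in place
    return W                # should be close to [0, 0]
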
# the following optimizers are blatantly lifted from tiny-dnn:
# https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/optimizers/optimizer.h

class Momentum(Optimizer):
    def __init__(self, alpha=0.01, mu=0.9, nesterov=False):
        self.mu = _f(mu) # momentum
        self.nesterov = bool(nesterov)

        super().__init__(alpha)

    def reset(self):
        self.Vprev = None

    def compute(self, dW, W):
        if self.Vprev is None:
            self.Vprev = np.copy(dW)

        V = self.mu * self.Vprev - self.alpha * dW
        self.Vprev[:] = V
        if self.nesterov:
            return self.mu * V - self.alpha * dW
        return V

class RMSprop(Optimizer):
    # RMSprop generalizes* Adagrad, etc.

    # TODO: verify this is correct:
    # * RMSprop == Adagrad when
    #   RMSprop.mu == 1

    def __init__(self, alpha=0.0001, mu=0.99, eps=1e-8):
        self.mu = _f(mu) # decay term
        self.eps = _f(eps)

        # one might consider the following equation when specifying mu:
        # mu = e**(-1/t)
        # default: t = -1/ln(0.99) = ~99.5
        # therefore the default of mu=0.99 means
        # an input decays to 1/e its original amplitude over 99.5 epochs.
        # (this is from DSP, so how relevant it is in SGD is debatable)

        super().__init__(alpha)

    def reset(self):
        self.g = None

    def compute(self, dW, W):
        if self.g is None:
            self.g = np.zeros_like(dW)

        # basically apply a first-order low-pass filter to delta squared
        self.g[:] = self.mu * self.g + (1 - self.mu) * dW * dW
        # equivalent (though numerically different?):
        #self.g += (dW * dW - self.g) * (1 - self.mu)

        # finally sqrt it to complete the running root-mean-square approximation
        return -self.alpha * dW / np.sqrt(self.g + self.eps)

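# illustrative sketch (not part of the original library): picking RMSprop's mu from
# a desired decay time t (in updates), per the mu = e**(-1/t) note above.
def _demo_rmsprop_mu_for_time_constant(t=99.5):
    mu = np.exp(-1 / t)       # e.g. t=99.5 gives mu ~= 0.99
    t_back = -1 / np.log(mu)  # and inverting recovers t
    return mu, t_back
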
class Adam(Optimizer):
    # paper: https://arxiv.org/abs/1412.6980
    # Adam generalizes* RMSprop, and
    # adds a decay term to the regular (non-squared) delta, and
    # does some decay-gain voodoo. (i guess it's compensating
    # for the filtered deltas starting from zero)
    # * Adam == RMSprop when
    #   Adam.b1 == 0
    #   Adam.b2 == RMSprop.mu

    def __init__(self, alpha=0.002, b1=0.9, b2=0.999, eps=1e-8):
        self.b1 = _f(b1) # decay term
        self.b2 = _f(b2) # decay term
        self.b1_t_default = _f(b1) # decay term power t
        self.b2_t_default = _f(b2) # decay term power t
        self.eps = _f(eps)

        super().__init__(alpha)

    def reset(self):
        self.mt = None
        self.vt = None
        self.b1_t = self.b1_t_default
        self.b2_t = self.b2_t_default

    def compute(self, dW, W):
        if self.mt is None:
            self.mt = np.zeros_like(dW)
        if self.vt is None:
            self.vt = np.zeros_like(dW)

        # decay gain
        self.b1_t *= self.b1
        self.b2_t *= self.b2

        # filter
        self.mt[:] = self.b1 * self.mt + (1 - self.b1) * dW
        self.vt[:] = self.b2 * self.vt + (1 - self.b2) * dW * dW

        return -self.alpha * (self.mt / (1 - self.b1_t)) \
               / np.sqrt((self.vt / (1 - self.b2_t)) + self.eps)

class Nadam(Optimizer):
    # paper: https://arxiv.org/abs/1412.6980
    # paper: http://cs229.stanford.edu/proj2015/054_report.pdf
    # TODO: double-check this implementation. also read the damn paper.
    # lifted from https://github.com/fchollet/keras/blob/5d38b04/keras/optimizers.py#L530
    # lifted from https://github.com/jpilaul/IFT6266_project/blob/master/Models/Algo_Momentum.py

    def __init__(self, alpha=0.002, b1=0.9, b2=0.999, eps=1e-8):
        self.b1 = _f(b1) # decay term
        self.b2 = _f(b2) # decay term
        self.eps = _f(eps)

        super().__init__(alpha)

    def reset(self):
        self.mt = None
        self.vt = None
        self.t = 0
        self.sched = 1

    def compute(self, dW, W):
        self.t += 1

        if self.mt is None:
            self.mt = np.zeros_like(dW)
        if self.vt is None:
            self.vt = np.zeros_like(dW)

        ut0 = self.b1 * (1 - 0.5 * 0.96**(self.t + 0))
        ut1 = self.b1 * (1 - 0.5 * 0.96**(self.t + 1))

        sched0 = self.sched * ut0
        sched1 = self.sched * ut0 * ut1
        self.sched = sched0

        gp = dW / (1 - sched0)

        self.mt[:] = self.b1 * self.mt + (1 - self.b1) * dW
        self.vt[:] = self.b2 * self.vt + (1 - self.b2) * np.square(dW)

        mtp = self.mt / (1 - sched1)
        vtp = self.vt / (1 - self.b2**self.t)

        mt_bar = (1 - ut0) * gp + ut1 * mtp

        return -self.alpha * mt_bar / (np.sqrt(vtp) + self.eps)

# Abstract Layers {{{1

class Layer:
    def __init__(self):
        self.parents = []
        self.children = []
        self.input_shape = None
        self.output_shape = None
        kind = self.__class__.__name__
        global _layer_counters
        _layer_counters[kind] += 1
        self.name = "{}_{}".format(kind, _layer_counters[kind])
        self.size = None # total weight count (if any)
        self.unsafe = False # disables assertions for better performance

    def __str__(self):
        return self.name

    # methods we might want to override:

    def forward(self, X):
        raise NotImplementedError("unimplemented", self)

    def backward(self, dY):
        raise NotImplementedError("unimplemented", self)

    def do_feed(self, child):
        self.children.append(child)

    def be_fed(self, parent):
        self.parents.append(parent)

    def make_shape(self, shape):
        if not self.unsafe:
            assert shape is not None
        if self.output_shape is None:
            self.output_shape = shape
        return shape

    # TODO: better names for these (still)

    def _propogate(self, edges):
        if not self.unsafe:
            assert len(edges) == 1, self
        return self.forward(edges[0])

    def _backpropogate(self, edges):
        if len(edges) == 1:
            return self.backward(edges[0])
        return sum((self.backward(dY) for dY in edges))

    # general utility methods:

    def is_compatible(self, parent):
        if self.input_shape is None:
            # inherit shape from output
            shape = self.make_shape(parent.output_shape)
            if shape is None:
                return False
            self.input_shape = shape
        return np.all(self.input_shape == parent.output_shape)

    def feed(self, child):
        if not child.is_compatible(self):
            fmt = "{} is incompatible with {}: shape mismatch: {} vs. {}"
            raise LayerIncompatibility(fmt.format(self, child, self.output_shape, child.input_shape))
        self.do_feed(child)
        child.be_fed(self)
        return child

    def validate_input(self, X):
        assert X.shape[1:] == self.input_shape, (str(self), X.shape[1:], self.input_shape)

    def validate_output(self, Y):
        assert Y.shape[1:] == self.output_shape, (str(self), Y.shape[1:], self.output_shape)

    def init(self, W, dW):
        assert W.ndim == 1 and W.shape[0] == self.size, W.shape
        assert dW.ndim == 1 and dW.shape[0] == self.size, dW.shape
        self.W = W
        self.dW = dW

    def propagate(self, values):
        if not self.unsafe:
            assert self.parents, self
        edges = []
        for parent in self.parents:
            # TODO: skip over irrelevant nodes (if any)
            X = values[parent]
            if not self.unsafe:
                self.validate_input(X)
            edges.append(X)
        Y = self._propogate(edges)
        if not self.unsafe:
            self.validate_output(Y)
        return Y

    def backpropagate(self, values):
        if not self.unsafe:
            assert self.children, self
        edges = []
        for child in self.children:
            # TODO: skip over irrelevant nodes (if any)
            dY = values[child]
            if not self.unsafe:
                self.validate_output(dY)
            edges.append(dY)
        dX = self._backpropogate(edges)
        if not self.unsafe:
            self.validate_input(dX)
        return dX

# Nonparametric Layers {{{1

class Input(Layer):
    def __init__(self, shape):
        assert shape is not None
        super().__init__()
        self.shape = tuple(shape)
        self.input_shape = self.shape
        self.output_shape = self.shape

    def forward(self, X):
        return X

    def backward(self, dY):
        #self.dY = dY
        return np.zeros_like(dY)

class Reshape(Layer):
    def __init__(self, new_shape):
        super().__init__()
        self.shape = tuple(new_shape)
        self.output_shape = self.shape

    def forward(self, X):
        self.batch_size = X.shape[0]
        return X.reshape(self.batch_size, *self.output_shape)

    def backward(self, dY):
        assert dY.shape[0] == self.batch_size
        return dY.reshape(self.batch_size, *self.input_shape)

class Flatten(Layer):
    def make_shape(self, shape):
        super().make_shape(shape)
        self.output_shape = (np.prod(shape),)
        return shape

    def forward(self, X):
        self.batch_size = X.shape[0]
        return X.reshape(self.batch_size, *self.output_shape)

    def backward(self, dY):
        assert dY.shape[0] == self.batch_size
        return dY.reshape(self.batch_size, *self.input_shape)

class Affine(Layer):
    def __init__(self, a=1, b=0):
        super().__init__()
        self.a = _f(a)
        self.b = _f(b)

    def forward(self, X):
        return self.a * X + self.b

    def backward(self, dY):
        return dY * self.a

class Sum(Layer):
    def _propogate(self, edges):
        return np.sum(edges, axis=0)

    def _backpropogate(self, edges):
        #assert len(edges) == 1, "unimplemented"
        return edges[0] # TODO: does this always work?

class Sigmoid(Layer): # aka Logistic
    def forward(self, X):
        self.sig = sigmoid(X)
        return self.sig

    def backward(self, dY):
        return dY * self.sig * (1 - self.sig)

class Tanh(Layer):
    def forward(self, X):
        self.sig = np.tanh(X)
        return self.sig

    def backward(self, dY):
        return dY * (1 - self.sig * self.sig)

class Relu(Layer):
    def forward(self, X):
        self.cond = X >= 0
        return np.where(self.cond, X, 0)

    def backward(self, dY):
        return np.where(self.cond, dY, 0)

class Elu(Layer):
    # paper: https://arxiv.org/abs/1511.07289

    def __init__(self, alpha=1):
        super().__init__()
        self.alpha = _f(alpha)

    def forward(self, X):
        self.cond = X >= 0
        # negative branch: alpha * (exp(x) - 1)
        self.neg = self.alpha * (np.exp(X) - 1)
        return np.where(self.cond, X, self.neg)

    def backward(self, dY):
        return dY * np.where(self.cond, 1, self.neg + self.alpha)

class GeluApprox(Layer):
    # paper: https://arxiv.org/abs/1606.08415
    # plot: https://www.desmos.com/calculator/ydzgtccsld

    def forward(self, X):
        self.a = 1.704 * X
        self.sig = sigmoid(self.a)
        return X * self.sig

    def backward(self, dY):
        return dY * self.sig * (1 + self.a * (1 - self.sig))

class Softmax(Layer):
    # lifted from theano

    def __init__(self, axis=-1):
        super().__init__()
        self.axis = int(axis)

    def forward(self, X):
        # subtract the max for numerical stability, then normalize.
        alpha = np.max(X, axis=self.axis, keepdims=True)
        num = np.exp(X - alpha)
        den = np.sum(num, axis=self.axis, keepdims=True)
        self.sm = num / den
        return self.sm

    def backward(self, dY):
        dYsm = dY * self.sm
        dX = dYsm - np.sum(dYsm, axis=self.axis, keepdims=True) * self.sm
        return dX

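# illustrative sketch (not part of the original library): Softmax.backward applies
# the softmax Jacobian-vector product, dX_i = sm_i * (dY_i - sum_j dY_j * sm_j);
# this checks it against the explicit Jacobian J_ij = sm_i * (delta_ij - sm_j)
# for a single row. _demo_softmax_jacobian_check is a demo-only helper.
def _demo_softmax_jacobian_check():
    np.random.seed(0)
    layer = Softmax()
    x = np.random.randn(1, 5).astype(_f)
    dy = np.random.randn(1, 5).astype(_f)
    sm = layer.forward(x)[0]
    dx_layer = layer.backward(dy)[0]
    J = np.diag(sm) - np.outer(sm, sm)  # explicit Jacobian of softmax
    dx_explicit = J @ dy[0]
    return np.max(np.abs(dx_layer - dx_explicit))  # should be ~1e-7 in float32
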
# Parametric Layers {{{1

class Dense(Layer):
    serialized = {
        'W': 'coeffs',
        'b': 'biases',
    }

    def __init__(self, dim, init=init_he_uniform):
        super().__init__()
        self.dim = int(dim)
        self.output_shape = (dim,)
        self.weight_init = init
        self.size = None

    def make_shape(self, shape):
        super().make_shape(shape)
        if len(shape) != 1:
            return False
        self.nW = self.dim * shape[0]
        self.nb = self.dim
        self.size = self.nW + self.nb
        return shape

    def init(self, W, dW):
        super().init(W, dW)
        ins, outs = self.input_shape[0], self.output_shape[0]
        self.coeffs = self.W[:self.nW].reshape(ins, outs)
        self.biases = self.W[self.nW:].reshape(1, outs)
        self.dcoeffs = self.dW[:self.nW].reshape(ins, outs)
        self.dbiases = self.dW[self.nW:].reshape(1, outs)
        self.coeffs.flat = self.weight_init(self.nW, ins, outs)
        self.biases.flat = 0
        self.std = np.std(self.W)

    def forward(self, X):
        self.X = X
        return X.dot(self.coeffs) + self.biases

    def backward(self, dY):
        self.dcoeffs[:] = self.X.T.dot(dY)
        self.dbiases[:] = dY.sum(0, keepdims=True)
        return dY.dot(self.coeffs.T)

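# illustrative sketch (not part of the original library): Dense.init carves its
# coefficient and bias matrices out of the shared flat parameter vector as views,
# so an optimizer can update every layer through one array. the manual input_shape
# poking here is for the demo only; normally feed()/Model handle it.
def _demo_dense_views():
    d = Dense(3)
    d.input_shape = (2,)   # normally set via is_compatible()/feed()
    d.make_shape((2,))
    W = np.zeros(d.size, dtype=_f)
    dW = np.zeros(d.size, dtype=_f)
    d.init(W, dW)
    d.coeffs[0, 0] = _f(42)
    return W[0]            # 42: coeffs/biases are views into the flat vector
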
# Models {{{1

class Model:
    def __init__(self, x, y, unsafe=False):
        assert isinstance(x, Layer), x
        assert isinstance(y, Layer), y
        self.x = x
        self.y = y
        self.ordered_nodes = self.traverse([], self.y)
        self.make_weights()
        for node in self.ordered_nodes:
            node.unsafe = unsafe

    def make_weights(self):
        self.param_count = 0
        for node in self.ordered_nodes:
            if node.size is not None:
                self.param_count += node.size
        self.W = np.zeros(self.param_count, dtype=_f)
        self.dW = np.zeros(self.param_count, dtype=_f)

        offset = 0
        for node in self.ordered_nodes:
            if node.size is not None:
                end = offset + node.size
                node.init(self.W[offset:end], self.dW[offset:end])
                offset += node.size

    def traverse(self, nodes, node):
        if node == self.x:
            return [node]
        for parent in node.parents:
            if parent not in nodes:
                new_nodes = self.traverse(nodes, parent)
                for new_node in new_nodes:
                    if new_node not in nodes:
                        nodes.append(new_node)
        if nodes:
            nodes.append(node)
        return nodes

    def forward(self, X):
        values = dict()
        input_node = self.ordered_nodes[0]
        output_node = self.ordered_nodes[-1]
        values[input_node] = input_node._propogate(np.expand_dims(X, 0))
        for node in self.ordered_nodes[1:]:
            values[node] = node.propagate(values)
        return values[output_node]

    def backward(self, error):
        values = dict()
        output_node = self.ordered_nodes[-1]
        values[output_node] = output_node._backpropogate(np.expand_dims(error, 0))
        for node in reversed(self.ordered_nodes[:-1]):
            values[node] = node.backpropagate(values)
        return self.dW

    def load_weights(self, fn):
        # seemingly compatible with keras' Dense layers.
        import h5py
        open(fn) # just ensure the file exists (python's error is better)
        f = h5py.File(fn, 'r')

        weights = {}
        def visitor(name, obj):
            if isinstance(obj, h5py.Dataset):
                weights[name.split('/')[-1]] = np.array(obj[:], dtype=_f)
        f.visititems(visitor)
        f.close()

        used = {}
        for k in weights.keys():
            used[k] = False
        nodes = [node for node in self.ordered_nodes if node.size is not None]
        for node in nodes:
            full_name = str(node).lower()
            for s_name, o_name in node.serialized.items():
                key = full_name + '_' + s_name
                data = weights[key]
                target = getattr(node, o_name)
                target[:] = data
                used[key] = True
        for k, v in used.items():
            if not v:
                lament("WARNING: unused weight", k)

    def save_weights(self, fn, overwrite=False):
        import h5py
        f = h5py.File(fn, 'w')

        counts = defaultdict(lambda: 0)
        nodes = [node for node in self.ordered_nodes if node.size is not None]
        for node in nodes:
            full_name = str(node).lower()
            grp = f.create_group(full_name)
            for s_name, o_name in node.serialized.items():
                key = full_name + '_' + s_name
                target = getattr(node, o_name)
                data = grp.create_dataset(key, target.shape, dtype=_f)
                data[:] = target
                counts[key] += 1
                if counts[key] > 1:
                    lament("WARNING: rewrote weight", key)
        f.close()

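# illustrative sketch (not part of the original library): wiring a tiny MLP by
# feeding layers into one another, then running a forward pass on a fake batch.
def _demo_tiny_model():
    np.random.seed(1)
    x = Input((4,))
    h = x.feed(Dense(8))
    h = h.feed(Relu())
    y = h.feed(Dense(2))
    model = Model(x, y)
    batch = np.random.randn(5, 4).astype(_f)
    out = model.forward(batch)
    return out.shape  # (5, 2)
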
# Rituals {{{1

class Ritual: # i'm just making up names at this point
    def __init__(self, learner=None, loss=None, mloss=None):
        self.learner = learner if learner is not None else Learner(Optimizer())
        self.loss = loss if loss is not None else Squared()
        self.mloss = mloss if mloss is not None else self.loss

    def reset(self):
        self.learner.reset(optim=True)
        self.en = 0
        self.bn = 0

    def measure(self, p, y):
        return self.mloss.forward(p, y)

    def derive(self, p, y):
        return self.loss.backward(p, y)

    def learn(self, inputs, outputs):
        predicted = self.model.forward(inputs)
        self.model.backward(self.derive(predicted, outputs))
        return predicted

    def update(self):
        self.learner.optim.update(self.model.dW, self.model.W)

    def prepare(self, model):
        self.en = 0
        self.bn = 0
        self.model = model

    def train_batched(self, inputs, outputs, batch_size,
                      return_losses=False, test_only=False):
        assert isinstance(return_losses, bool) or return_losses == 'both'

        if not test_only:
            self.en += 1

        cumsum_loss, cumsum_mloss = _0, _0
        batch_count = inputs.shape[0] // batch_size
        losses, mlosses = [], []

        assert inputs.shape[0] % batch_size == 0, \
               "inputs is not evenly divisible by batch_size" # TODO: lift this restriction

        for b in range(batch_count):
            if not test_only:
                self.bn += 1

            bi = b * batch_size
            batch_inputs  = inputs[ bi:bi+batch_size]
            batch_outputs = outputs[bi:bi+batch_size]

            if not test_only and self.learner.per_batch:
                self.learner.batch(b / batch_count)

            predicted = self.learn(batch_inputs, batch_outputs)
            if not test_only:
                self.update()

            if return_losses == 'both':
                batch_loss = self.loss.forward(predicted, batch_outputs)
                if np.isnan(batch_loss):
                    raise Exception("nan")
                losses.append(batch_loss)
                cumsum_loss += batch_loss

            batch_mloss = self.measure(predicted, batch_outputs)
            if np.isnan(batch_mloss):
                raise Exception("nan")
            if return_losses:
                mlosses.append(batch_mloss)
            cumsum_mloss += batch_mloss

        avg_mloss = cumsum_mloss / _f(batch_count)
        if return_losses == 'both':
            avg_loss = cumsum_loss / _f(batch_count)
            return avg_loss, avg_mloss, losses, mlosses
        elif return_losses:
            return avg_mloss, mlosses
        return avg_mloss

    def test_batched(self, *args, **kwargs):
        return self.train_batched(*args, test_only=True, **kwargs)

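# illustrative sketch (not part of the original library): how Model, Ritual, and
# Learner are intended to fit together for a full training run. the data here is
# random, so the loss value itself is meaningless; the shape of the loop is the point.
def _demo_training_loop():
    np.random.seed(2)
    x = Input((4,))
    y = x.feed(Dense(8)).feed(Relu()).feed(Dense(2))
    model = Model(x, y)

    inputs  = np.random.randn(32, 4).astype(_f)
    outputs = np.random.randn(32, 2).astype(_f)

    learner = Learner(Adam(alpha=0.01), epochs=10)
    ritual = Ritual(learner=learner, loss=Squared())
    ritual.prepare(model)
    while learner.next():
        mloss = ritual.train_batched(inputs, outputs, batch_size=8)
    return mloss
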
# Learners {{{1

class Learner:
    per_batch = False

    def __init__(self, optim, epochs=100, rate=None):
        assert isinstance(optim, Optimizer)
        self.optim = optim
        self.start_rate = optim.alpha if rate is None else _f(rate)
        self.epochs = int(epochs)
        self.reset()

    def reset(self, optim=False):
        self.started = False
        self.epoch = 0
        if optim:
            self.optim.reset()

    @property
    def epoch(self):
        return self._epoch

    @epoch.setter
    def epoch(self, new_epoch):
        self._epoch = int(new_epoch)
        self.rate = self.rate_at(self._epoch)

    @property
    def rate(self):
        return self.optim.alpha

    @rate.setter
    def rate(self, new_rate):
        self.optim.alpha = new_rate

    def rate_at(self, epoch):
        return self.start_rate

    def next(self):
        # prepares the next epoch. returns whether or not to continue training.
        if self.epoch + 1 >= self.epochs:
            return False
        if self.started:
            self.epoch += 1
        else:
            self.started = True
            self.epoch = self.epoch # poke property setter just in case
        return True

    def batch(self, progress): # TODO: rename
        # interpolates rates between epochs.
        # unlike epochs, we do not store batch number as a state.
        # i.e. calling next() will not respect progress.
        assert 0 <= progress <= 1
        self.rate = self.rate_at(self._epoch + progress)

    @property
    def final_rate(self):
        return self.rate_at(self.epochs - 1)

class AnnealingLearner(Learner):
    def __init__(self, optim, epochs=100, rate=None, halve_every=10):
        self.halve_every = _f(halve_every)
        self.anneal = _f(0.5**(1/self.halve_every))
        super().__init__(optim, epochs, rate)

    def rate_at(self, epoch):
        return self.start_rate * self.anneal**epoch

def cosmod(x):
    # plot: https://www.desmos.com/calculator/hlgqmyswy2
    return (_1 + np.cos((x % _1) * _pi)) * _inv2

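# illustrative sketch (not part of the original library): cosmod() ramps smoothly
# from 1 down to 0 over each unit interval (the modulo makes it wrap back to 1 at
# exactly 1.0), which SGDR below rescales per restart.
def _demo_cosmod_endpoints():
    return cosmod(_f(0.0)), cosmod(_f(0.5)), cosmod(_f(0.999))  # ~1.0, 0.5, ~0.0
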
class SGDR(Learner):
    # Stochastic Gradient Descent with Restarts
    # paper: https://arxiv.org/abs/1608.03983
    # NOTE: this is missing a couple features.

    per_batch = True

    def __init__(self, optim, epochs=100, rate=None,
                 restarts=0, restart_decay=0.5, callback=None,
                 expando=None):
        self.restart_epochs = int(epochs)
        self.decay = _f(restart_decay)
        self.restarts = int(restarts)
        self.restart_callback = callback
        # TODO: rename expando to something not insane
        self.expando = expando if expando is not None else lambda i: i

        self.splits = []
        epochs = 0
        for i in range(0, self.restarts + 1):
            split = epochs + self.restart_epochs + int(self.expando(i))
            self.splits.append(split)
            epochs = split
        super().__init__(optim, epochs, rate)

    def split_num(self, epoch):
        starts = [0] + self.splits # hack
        for i in range(0, len(self.splits)):
            if epoch < self.splits[i]:
                sub_epoch = epoch - starts[i]
                next_restart = self.splits[i] - starts[i]
                return i, sub_epoch, next_restart
        raise Exception('this should never happen.')

    def rate_at(self, epoch):
        restart, sub_epoch, next_restart = self.split_num(epoch)
        x = _f(sub_epoch) / _f(next_restart)
        return self.start_rate * self.decay**_f(restart) * cosmod(x)

    def next(self):
        if not super().next():
            return False
        restart, sub_epoch, next_restart = self.split_num(self.epoch)
        if restart > 0 and sub_epoch == 0:
            if self.restart_callback is not None:
                self.restart_callback(restart)
        return True

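# illustrative sketch (not part of the original library): with restarts, SGDR's
# cycle length grows by expando(i) each cycle and the peak rate decays by
# restart_decay. e.g. a nominal 10 epochs with 2 restarts and expando=lambda i: 5*i
# gives cycle lengths 10, 15, 20 (splits at 10, 25, 45) and peak rates of
# 1.0, 0.5, 0.25 times the starting rate.
def _demo_sgdr_schedule():
    sgdr = SGDR(Adam(), epochs=10, restarts=2, restart_decay=0.5,
                expando=lambda i: 5 * i)
    return sgdr.splits, [float(sgdr.rate_at(e)) for e in (0, 10, 25)]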