Connor Olding
2cf38d4ece
okay, this is a disaster, but i think i've got it under control now. the way batch-based learners now work is: the epoch we're working towards is the truncated part of the epoch variable, and how far we are into the epoch is the fractional part. epoch starts at 1, so subtract 1 when doing periodic operations.
1268 lines
38 KiB
Python
import sys
import types


def lament(*args, **kwargs):
    print(*args, file=sys.stderr, **kwargs)


def lower_priority():
    """Set the priority of the process to below-normal."""
    # via https://stackoverflow.com/a/1023269
    if sys.platform == 'win32':
        try:
            import win32api, win32process, win32con
            pid = win32api.GetCurrentProcessId()
            handle = win32api.OpenProcess(win32con.PROCESS_ALL_ACCESS, True, pid)
            win32process.SetPriorityClass(handle, win32process.BELOW_NORMAL_PRIORITY_CLASS)
        except ImportError:
            lament("you do not have pywin32 installed.")
            lament("the process priority could not be lowered.")
            lament("consider: python -m pip install pypiwin32")
            lament("consider: conda install pywin32")
    else:
        import os
        os.nice(1)


import numpy as np
_f = np.float32

# just for speed, not strictly essential:
from scipy.special import expit as sigmoid

# used for numbering layers like Keras, and keeping initialization consistent:
from collections import defaultdict, OrderedDict
_layer_counters = defaultdict(lambda: 0)

def _check(a):
    assert isinstance(a, np.ndarray) or type(a) == _f, type(a)
    assert a.dtype == _f, a.dtype
    return a

_0 = _f(0)
_1 = _f(1)
_2 = _f(2)
_inv2 = _f(1/2)
_sqrt2 = _f(np.sqrt(2))
_invsqrt2 = _f(1/np.sqrt(2))
_pi = _f(np.pi)

class LayerIncompatibility(Exception):
    pass

# Node Traversal {{{1

class DummyNode:
    name = "Dummy"

    def __init__(self, children=None, parents=None):
        self.children = children if children is not None else []
        self.parents = parents if parents is not None else []

def traverse(node_in, node_out, nodes=None, dummy_mode=False):
    # i have no idea if this is any algorithm in particular.
    nodes = nodes if nodes is not None else []

    seen_up = {}
    q = [node_out]
    while len(q) > 0:
        node = q.pop(0)
        seen_up[node] = True
        for parent in node.parents:
            q.append(parent)

    if dummy_mode:
        seen_up[node_in] = True

    nodes = []
    q = [node_in]
    while len(q) > 0:
        node = q.pop(0)
        if node not in seen_up:
            # not an ancestor of node_out, so it doesn't belong in the result.
            continue
        parents_added = (parent in nodes for parent in node.parents)
        if node not in nodes and all(parents_added):
            nodes.append(node)
        for child in node.children:
            q.append(child)

    if dummy_mode:
        nodes.remove(node_in)

    return nodes

def traverse_all(nodes_in, nodes_out, nodes=None):
    all_in = DummyNode(children=nodes_in)
    all_out = DummyNode(parents=nodes_out)
    return traverse(all_in, all_out, nodes, dummy_mode=True)

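# editor's note: a minimal sketch (not part of the original module) showing how
# traverse_all() orders a small diamond-shaped graph of DummyNodes; the node
# names here are made up purely for illustration.
def _example_traverse():
    a, b, c, d = (DummyNode() for _ in range(4))
    for parent, child in ((a, b), (a, c), (b, d), (c, d)):
        parent.children.append(child)
        child.parents.append(parent)
    order = traverse_all([a], [d])
    # parents always precede their children in the result, e.g. [a, b, c, d].
    assert order.index(a) < order.index(b) < order.index(d)
    return order
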
# Initializations {{{1

# note: these are currently only implemented for 2D shapes.

def init_zeros(size, ins=None, outs=None):
    return np.zeros(size)

def init_ones(size, ins=None, outs=None):
    return np.ones(size)

def init_he_normal(size, ins, outs):
    s = np.sqrt(2 / ins)
    return np.random.normal(0, s, size=size)

def init_he_uniform(size, ins, outs):
    s = np.sqrt(6 / ins)
    return np.random.uniform(-s, s, size=size)

def init_glorot_normal(size, ins, outs):
    s = np.sqrt(2 / (ins + outs))
    return np.random.normal(0, s, size=size)

def init_glorot_uniform(size, ins, outs):
    s = np.sqrt(6 / (ins + outs))
    return np.random.uniform(-s, s, size=size)

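# editor's note: an illustrative check (not part of the original module).
# He initialization targets roughly Var(W) = 2/ins, so the empirical variance
# of a reasonably large sample should land near that value.
def _example_init_variance(ins=256, outs=128):
    w = init_he_normal((ins, outs), ins, outs)
    target = 2.0 / ins
    assert abs(np.var(w) - target) / target < 0.1  # loose tolerance
    return np.var(w), target
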
# Weight container {{{1

class Weights:
    # we may or may not contain weights -- or any information, for that matter.

    def __init__(self, **kwargs):
        self.f = None  # forward weights
        self.g = None  # backward weights (gradients)
        self.shape = None
        self.init = None
        self.allocator = None
        self.regularizer = None

        self.configure(**kwargs)

    def configure(self, **kwargs):
        for k, v in kwargs.items():
            getattr(self, k)  # ensures the key already exists
            setattr(self, k, v)

    @property
    def size(self):
        assert self.shape is not None
        return np.prod(self.shape)

    def allocate(self, *args, **kwargs):
        self.configure(**kwargs)

        # intentionally not using isinstance
        assert type(self.shape) == tuple, self.shape

        f, g = self.allocator(self.size)
        assert len(f) == self.size, "{} != {}".format(f.shape, self.size)
        assert len(g) == self.size, "{} != {}".format(g.shape, self.size)
        f[:] = self.init(self.size, *args)
        g[:] = self.init(self.size, *args)
        self.f = f.reshape(self.shape)
        self.g = g.reshape(self.shape)

    def forward(self):
        if self.regularizer is None:
            return 0.0
        return self.regularizer.forward(self.f)

    def backward(self):
        if self.regularizer is None:
            return 0.0
        return self.regularizer.backward(self.f)

    def update(self):
        if self.regularizer is None:
            return
        self.g += self.regularizer.backward(self.f)

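# editor's note: a small sketch (not part of the original module) of how a
# Weights object is allocated; Model.make_weights() does the same thing with
# views into a single flat parameter array.
def _example_weights():
    w = Weights(shape=(3, 4), init=init_glorot_uniform)
    backing = (np.zeros(w.size, dtype=_f), np.zeros(w.size, dtype=_f))
    w.allocate(3, 4, allocator=lambda size: backing)  # ins=3, outs=4 go to init
    assert w.f.shape == (3, 4) and w.g.shape == (3, 4)
    return w
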
# Loss functions {{{1

class Loss:
    pass

class CategoricalCrossentropy(Loss):
    # lifted from theano

    def __init__(self, eps=1e-6):
        self.eps = _f(eps)

    def forward(self, p, y):
        p = np.clip(p, self.eps, 1 - self.eps)
        f = np.sum(-y * np.log(p) - (1 - y) * np.log(1 - p), axis=-1)
        return np.mean(f)

    def backward(self, p, y):
        p = np.clip(p, self.eps, 1 - self.eps)
        df = (p - y) / (p * (1 - p))
        return df / len(y)

class Accuracy(Loss):
    # returns percentage of categories correctly predicted.
    # utilizes argmax(), so it cannot be used for gradient descent.
    # use CategoricalCrossentropy for that instead.

    def forward(self, p, y):
        correct = np.argmax(p, axis=-1) == np.argmax(y, axis=-1)
        return np.mean(correct)

    def backward(self, p, y):
        raise NotImplementedError("cannot take the gradient of Accuracy")

class ResidualLoss(Loss):
    def forward(self, p, y):
        return np.mean(self.f(p - y))

    def backward(self, p, y):
        ret = self.df(p - y) / len(y)
        return ret

class Squared(ResidualLoss):
    def f(self, r):
        return np.square(r)

    def df(self, r):
        return 2 * r

class Absolute(ResidualLoss):
    def f(self, r):
        return np.abs(r)

    def df(self, r):
        return np.sign(r)

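# editor's note: an illustrative finite-difference check (not part of the
# original module) that CategoricalCrossentropy.backward() matches the
# numerical gradient of forward(); tolerances are loose because of float32.
def _example_loss_gradcheck(eps=1e-3):
    rng = np.random.RandomState(0)
    p = np.clip(rng.rand(5, 3), 0.2, 0.8).astype(_f)
    y = np.eye(3, dtype=_f)[rng.randint(3, size=5)]
    loss = CategoricalCrossentropy()
    analytic = loss.backward(p, y)
    numeric = np.zeros_like(p)
    for i in np.ndindex(*p.shape):
        a, b = p.copy(), p.copy()
        a[i] += eps
        b[i] -= eps
        numeric[i] = (loss.forward(a, y) - loss.forward(b, y)) / (2 * eps)
    assert np.allclose(analytic, numeric, atol=1e-2)
    return analytic, numeric
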
# Regularizers {{{1

class Regularizer:
    pass

class L1L2(Regularizer):
    def __init__(self, l1=0.0, l2=0.0):
        self.l1 = _f(l1)
        self.l2 = _f(l2)

    def forward(self, X):
        f = _0
        if self.l1:
            f += np.sum(self.l1 * np.abs(X))
        if self.l2:
            f += np.sum(self.l2 * np.square(X))
        return f

    def backward(self, X):
        df = np.zeros_like(X)
        if self.l1:
            df += self.l1 * np.sign(X)
        if self.l2:
            df += self.l2 * 2 * X
        return df

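# editor's note: a tiny sketch (not part of the original module). backward()
# is the elementwise gradient of forward(); layers pick this up through
# Weights.update(), e.g. Dense(100, reg_w=L1L2(l2=1e-4)).
def _example_l1l2():
    reg = L1L2(l1=1e-3, l2=1e-2)
    X = np.array([-1.0, 0.5, 2.0], dtype=_f)
    eps = _f(1e-3)
    # nudging every element at once, so the numeric value approximates the
    # sum of the per-element gradients.
    numeric = (reg.forward(X + eps) - reg.forward(X - eps)) / (2 * eps)
    assert np.isclose(numeric, np.sum(reg.backward(X)), atol=1e-3)
    return reg.backward(X)
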
# Optimizers {{{1

class Optimizer:
    def __init__(self, lr=0.1):
        self.lr = _f(lr)  # learning rate
        self.reset()

    def reset(self):
        pass

    def compute(self, dW, W):
        return -self.lr * dW

    def update(self, dW, W):
        W += self.compute(dW, W)

# the following optimizers are blatantly lifted from tiny-dnn:
# https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/optimizers/optimizer.h

class Momentum(Optimizer):
    def __init__(self, lr=0.01, mu=0.9, nesterov=False):
        self.mu = _f(mu)  # momentum
        self.nesterov = bool(nesterov)

        super().__init__(lr)

    def reset(self):
        self.Vprev = None

    def compute(self, dW, W):
        if self.Vprev is None:
            self.Vprev = np.copy(dW)

        V = self.mu * self.Vprev - self.lr * dW
        self.Vprev[:] = V
        if self.nesterov:
            return self.mu * V - self.lr * dW

        return V

class RMSprop(Optimizer):
    # RMSprop generalizes* Adagrad, etc.

    # TODO: verify this is correct:
    # * RMSprop == Adagrad when
    #   RMSprop.mu == 1

    def __init__(self, lr=1e-4, mu=0.99, eps=1e-8):
        self.mu = _f(mu)  # decay term
        self.eps = _f(eps)

        # one might consider the following equation when specifying mu:
        # mu = e**(-1/t)
        # default: t = -1/ln(0.99) = ~99.5
        # therefore the default of mu=0.99 means
        # an input decays to 1/e its original amplitude over 99.5 epochs.
        # (this is from DSP, so how relevant it is in SGD is debatable)

        super().__init__(lr)

    def reset(self):
        self.g = None

    def compute(self, dW, W):
        if self.g is None:
            self.g = np.zeros_like(dW)

        # basically apply a first-order low-pass filter to delta squared
        self.g += (1 - self.mu) * (np.square(dW) - self.g)

        # finally sqrt it to complete the running root-mean-square approximation
        return -self.lr * dW / (np.sqrt(self.g) + self.eps)

class RMSpropCentered(Optimizer):
    # referenced TensorFlow/PyTorch.
    # paper: https://arxiv.org/pdf/1308.0850v5.pdf

    def __init__(self, lr=1e-4, aleph=0.95, momentum=0.9, eps=1e-8):
        self.aleph = _f(aleph)
        self.momentum = _f(momentum)
        self.eps = _f(eps)

        super().__init__(lr)

    def reset(self):
        self.g = None
        self.mt = None
        self.vt = None
        self.delta = None

    def compute(self, dW, W):
        if self.g is None:
            self.g = np.zeros_like(dW)
        if self.mt is None:
            self.mt = np.zeros_like(dW)
        if self.vt is None:
            self.vt = np.zeros_like(dW)
        if self.delta is None:
            self.delta = np.zeros_like(dW)

        self.mt += (1 - self.aleph) * (dW - self.mt)
        self.vt += (1 - self.aleph) * (np.square(dW) - self.vt)

        # PyTorch has the epsilon outside of the sqrt,
        # TensorFlow and the paper have it within.
        # in onn, we generally do it outside, as this seems to work better.
        temp = dW / (np.sqrt(self.vt - np.square(self.mt)) + self.eps)

        # TensorFlow does it this way.
        self.delta[:] = self.momentum * self.delta + self.lr * temp
        return -self.delta
        # PyTorch does it this way.
        #self.delta[:] = self.momentum * self.delta + temp
        #return -self.lr * self.delta
        # they are equivalent only when LR is constant, which it might not be.

class Adam(Optimizer):
    # paper: https://arxiv.org/abs/1412.6980
    # Adam generalizes* RMSprop, and
    # adds a decay term to the regular (non-squared) delta, and performs
    # debiasing to compensate for the filtered deltas starting from zero.

    # * Adam == RMSprop when
    #   Adam.b1 == 0
    #   Adam.b2 == RMSprop.mu

    def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8):
        self.b1 = _f(b1)  # decay term
        self.b2 = _f(b2)  # decay term
        self.b1_t_default = _f(b1)  # decay term power t
        self.b2_t_default = _f(b2)  # decay term power t
        self.eps = _f(eps)

        super().__init__(lr)

    def reset(self):
        self.mt = None
        self.vt = None
        self.b1_t = self.b1_t_default
        self.b2_t = self.b2_t_default

    def compute(self, dW, W):
        if self.mt is None:
            self.mt = np.zeros_like(dW)
        if self.vt is None:
            self.vt = np.zeros_like(dW)

        # decay gain
        self.b1_t *= self.b1
        self.b2_t *= self.b2

        # filter
        self.mt += (1 - self.b1) * (dW - self.mt)
        self.vt += (1 - self.b2) * (np.square(dW) - self.vt)

        return -self.lr * (self.mt / (1 - self.b1_t)) \
            / (np.sqrt(self.vt / (1 - self.b2_t)) + self.eps)

class Nadam(Optimizer):
    # paper: https://arxiv.org/abs/1412.6980
    # paper: http://cs229.stanford.edu/proj2015/054_report.pdf
    # TODO: double-check this implementation. also read the damn paper.
    # lifted from https://github.com/fchollet/keras/blob/5d38b04/keras/optimizers.py#L530
    # lifted from https://github.com/jpilaul/IFT6266_project/blob/master/Models/Algo_Momentum.py

    def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8):
        self.b1 = _f(b1)  # decay term
        self.b2 = _f(b2)  # decay term
        self.eps = _f(eps)

        super().__init__(lr)

    def reset(self):
        self.mt = None
        self.vt = None
        self.t = 0
        self.sched = 1

    def compute(self, dW, W):
        self.t += 1

        if self.mt is None:
            self.mt = np.zeros_like(dW)
        if self.vt is None:
            self.vt = np.zeros_like(dW)

        ut0 = self.b1 * (1 - 0.5 * 0.96**(self.t + 0))
        ut1 = self.b1 * (1 - 0.5 * 0.96**(self.t + 1))

        sched0 = self.sched * ut0
        sched1 = self.sched * ut0 * ut1
        self.sched = sched0

        gp = dW / (1 - sched0)

        self.mt += (1 - self.b1) * (dW - self.mt)
        self.vt += (1 - self.b2) * (np.square(dW) - self.vt)

        mtp = self.mt / (1 - sched1)
        vtp = self.vt / (1 - self.b2**self.t)

        mt_bar = (1 - ut0) * gp + ut1 * mtp

        return -self.lr * mt_bar / (np.sqrt(vtp) + self.eps)

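# editor's note: a toy sketch (not part of the original module) of the
# Optimizer API on f(w) = 0.5*||w||^2, whose gradient is simply w.
# Optimizer.update(dW, W) adds compute(dW, W) to W in place.
def _example_optimizer(steps=500):
    w = np.ones(4, dtype=_f)
    opt = Adam(lr=0.01)
    for _ in range(steps):
        opt.update(np.copy(w), w)  # dW == w for this objective
    assert np.max(np.abs(w)) < 0.1  # should have walked down to near zero
    return w
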
# Abstract Layers {{{1

class Layer:
    def __init__(self):
        self.parents = []
        self.children = []
        self.weights = OrderedDict()
        self.loss = None  # for activity regularizers
        self.input_shape = None
        self.output_shape = None
        kind = self.__class__.__name__
        global _layer_counters
        _layer_counters[kind] += 1
        self.name = "{}_{}".format(kind, _layer_counters[kind])
        self.unsafe = False  # disables assertions for better performance

    def __str__(self):
        return self.name

    # methods we might want to override:

    def forward(self, X):
        raise NotImplementedError("unimplemented", self)

    def forward_deterministic(self, X):
        return self.forward(X)

    def backward(self, dY):
        raise NotImplementedError("unimplemented", self)

    def make_shape(self, parent):
        if self.input_shape is None:
            self.input_shape = parent.output_shape
        if self.output_shape is None:
            self.output_shape = self.input_shape

    def do_feed(self, child):
        self.children.append(child)

    def be_fed(self, parent):
        self.parents.append(parent)

    # TODO: better names for these (still)
    def _propagate(self, edges, deterministic):
        if not self.unsafe:
            assert len(edges) == 1, self
        if deterministic:
            return self.forward_deterministic(edges[0])
        else:
            return self.forward(edges[0])

    def _backpropagate(self, edges):
        if len(edges) == 1:
            return self.backward(edges[0])
        return sum((self.backward(dY) for dY in edges))

    # general utility methods:

    def is_compatible(self, parent):
        return np.all(self.input_shape == parent.output_shape)

    def feed(self, child):
        assert self.output_shape is not None, self
        child.make_shape(self)
        if not child.is_compatible(self):
            fmt = "{} is incompatible with {}: shape mismatch: {} vs. {}"
            raise LayerIncompatibility(fmt.format(self, child, self.output_shape, child.input_shape))
        self.do_feed(child)
        child.be_fed(self)
        return child

    def validate_input(self, X):
        assert X.shape[1:] == self.input_shape, (str(self), X.shape[1:], self.input_shape)

    def validate_output(self, Y):
        assert Y.shape[1:] == self.output_shape, (str(self), Y.shape[1:], self.output_shape)

    def _new_weights(self, name, **kwargs):
        w = Weights(**kwargs)
        assert name not in self.weights, name
        self.weights[name] = w
        return w

    @property
    def size(self):
        return sum((w.size for w in self.weights.values()))

    def init(self, allocator):
        ins, outs = self.input_shape[0], self.output_shape[0]
        for k, w in self.weights.items():
            w.allocate(ins, outs, allocator=allocator)

    def propagate(self, values, deterministic):
        if not self.unsafe:
            assert self.parents, self
        edges = []
        for parent in self.parents:
            # TODO: skip over irrelevant nodes (if any)
            X = values[parent]
            if not self.unsafe:
                self.validate_input(X)
            edges.append(X)
        Y = self._propagate(edges, deterministic)
        if not self.unsafe:
            self.validate_output(Y)
        return Y

    def backpropagate(self, values):
        if not self.unsafe:
            assert self.children, self
        edges = []
        for child in self.children:
            # TODO: skip over irrelevant nodes (if any)
            dY = values[child]
            if not self.unsafe:
                self.validate_output(dY)
            edges.append(dY)
        dX = self._backpropagate(edges)
        if not self.unsafe:
            self.validate_input(dX)
        return dX

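# editor's note: a sketch (not part of the original module) of the minimal
# Layer contract: forward() caches whatever backward() will need, and
# backward() returns the gradient with respect to the layer's input.
class _ExampleSquare(Layer):
    # elementwise f(x) = x^2, purely for illustration.
    def forward(self, X):
        self.X = X
        return np.square(X)

    def backward(self, dY):
        return dY * 2 * self.X
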
# Nonparametric Layers {{{1

class Input(Layer):
    def __init__(self, shape):
        assert shape is not None
        super().__init__()
        self.shape = tuple(shape)
        self.input_shape = self.shape
        self.output_shape = self.shape

    def forward(self, X):
        return X

    def backward(self, dY):
        #self.dY = dY
        return np.zeros_like(dY)

class Reshape(Layer):
    def __init__(self, new_shape):
        super().__init__()
        self.shape = tuple(new_shape)
        self.output_shape = self.shape

    def forward(self, X):
        self.batch_size = X.shape[0]
        return X.reshape(self.batch_size, *self.output_shape)

    def backward(self, dY):
        assert dY.shape[0] == self.batch_size
        return dY.reshape(self.batch_size, *self.input_shape)

class Flatten(Layer):
    def make_shape(self, parent):
        shape = parent.output_shape
        self.input_shape = shape
        self.output_shape = (np.prod(shape),)

    def forward(self, X):
        self.batch_size = X.shape[0]
        return X.reshape(self.batch_size, *self.output_shape)

    def backward(self, dY):
        assert dY.shape[0] == self.batch_size
        return dY.reshape(self.batch_size, *self.input_shape)

class ConstAffine(Layer):
    def __init__(self, a=1, b=0):
        super().__init__()
        self.a = _f(a)
        self.b = _f(b)

    def forward(self, X):
        return self.a * X + self.b

    def backward(self, dY):
        return dY * self.a

class Sum(Layer):
    def _propagate(self, edges, deterministic):
        return np.sum(edges, axis=0)

    def _backpropagate(self, edges):
        #assert len(edges) == 1, "unimplemented"
        return edges[0]  # TODO: does this always work?

class ActivityRegularizer(Layer):
    def __init__(self, reg):
        super().__init__()
        assert isinstance(reg, Regularizer), reg
        self.reg = reg

    def forward(self, X):
        self.X = X
        self.loss = np.sum(self.reg.forward(X))
        return X

    def backward(self, dY):
        return dY + self.reg.backward(self.X)

class Dropout(Layer):
    def __init__(self, dropout=0.0):
        super().__init__()
        self.p = _f(1 - dropout)
        assert 0 <= self.p <= 1

    def forward(self, X):
        self.mask = (np.random.rand(*X.shape) < self.p) / self.p
        return X * self.mask

    def forward_deterministic(self, X):
        #self.mask = _1
        return X

    def backward(self, dY):
        return dY * self.mask

# Activation Layers {{{2

class Linear(Layer):
    def forward(self, X):
        return X

    def backward(self, dY):
        return dY

class Sigmoid(Layer):  # aka Logistic, Expit (inverse of Logit)
    def forward(self, X):
        self.sig = sigmoid(X)
        return self.sig

    def backward(self, dY):
        return dY * self.sig * (1 - self.sig)

class Softplus(Layer):
    # integral of Sigmoid.

    def forward(self, X):
        self.X = X
        return np.log(1 + np.exp(X))

    def backward(self, dY):
        return dY * sigmoid(self.X)

class Tanh(Layer):
    def forward(self, X):
        self.sig = np.tanh(X)
        return self.sig

    def backward(self, dY):
        return dY * (1 - self.sig * self.sig)

class LeCunTanh(Layer):
    # paper: http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf
    # paper: http://yann.lecun.com/exdb/publis/pdf/lecun-89.pdf
    # scaled such that f([-1, 1]) = [-1, 1].
    # helps preserve an input variance of 1.
    # second derivative peaks around an input of ±1.

    def forward(self, X):
        self.sig = np.tanh(2 / 3 * X)
        return 1.7159 * self.sig

    def backward(self, dY):
        return dY * (2 / 3 * 1.7159) * (1 - self.sig * self.sig)

class Relu(Layer):
    def forward(self, X):
        self.cond = X >= 0
        return np.where(self.cond, X, 0)

    def backward(self, dY):
        return np.where(self.cond, dY, 0)

class Elu(Layer):
    # paper: https://arxiv.org/abs/1511.07289

    def __init__(self, alpha=1):
        super().__init__()
        self.alpha = _f(alpha)

    def forward(self, X):
        self.cond = X >= 0
        self.neg = self.alpha * (np.exp(X) - 1)
        return np.where(self.cond, X, self.neg)

    def backward(self, dY):
        return dY * np.where(self.cond, 1, self.neg + self.alpha)

class GeluApprox(Layer):
    # paper: https://arxiv.org/abs/1606.08415
    # plot: https://www.desmos.com/calculator/ydzgtccsld

    def forward(self, X):
        self.a = 1.704 * X
        self.sig = sigmoid(self.a)
        return X * self.sig

    def backward(self, dY):
        return dY * self.sig * (1 + self.a * (1 - self.sig))

class Softmax(Layer):
    def __init__(self, axis=-1):
        super().__init__()
        self.axis = int(axis)

    def forward(self, X):
        alpha = np.max(X, axis=self.axis, keepdims=True)
        num = np.exp(X - alpha)
        den = np.sum(num, axis=self.axis, keepdims=True)
        self.sm = num / den
        return self.sm

    def backward(self, dY):
        return (dY - np.sum(dY * self.sm, axis=self.axis, keepdims=True)) * self.sm

class LogSoftmax(Softmax):
    def __init__(self, axis=-1, eps=1e-6):
        super().__init__(axis)
        self.eps = _f(eps)

    def forward(self, X):
        return np.log(super().forward(X) + self.eps)

    def backward(self, dY):
        return dY - np.sum(dY, axis=self.axis, keepdims=True) * self.sm

class Cos(Layer):
    # performs well on MNIST for some strange reason.

    def forward(self, X):
        self.X = X
        return np.cos(X)

    def backward(self, dY):
        return dY * -np.sin(self.X)

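# editor's note: an illustrative sketch (not part of the original module):
# any smooth activation above can be checked against a centered finite
# difference, since backward(dY) is just dY times the elementwise derivative.
def _example_activation_gradcheck(layer=None, eps=1e-3):
    layer = layer if layer is not None else GeluApprox()
    X = np.linspace(-2, 2, 9).astype(_f).reshape(1, -1)
    layer.forward(X)
    analytic = layer.backward(np.ones_like(X))
    numeric = (layer.forward(X + eps) - layer.forward(X - eps)) / (2 * eps)
    assert np.allclose(analytic, numeric, atol=1e-2)
    return analytic, numeric
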
# Parametric Layers {{{1

class Dense(Layer):
    serialized = {
        'W': 'coeffs',
        'b': 'biases',
    }

    def __init__(self, dim, init=init_he_uniform, reg_w=None, reg_b=None):
        super().__init__()
        self.dim = int(dim)
        self.output_shape = (dim,)
        self.coeffs = self._new_weights('coeffs', init=init, regularizer=reg_w)
        self.biases = self._new_weights('biases', init=init_zeros, regularizer=reg_b)

    def make_shape(self, parent):
        shape = parent.output_shape
        self.input_shape = shape
        assert len(shape) == 1, shape
        self.coeffs.shape = (shape[0], self.dim)
        self.biases.shape = (1, self.dim)

    def forward(self, X):
        self.X = X
        return X.dot(self.coeffs.f) + self.biases.f

    def backward(self, dY):
        self.coeffs.g[:] = self.X.T.dot(dY)
        self.biases.g[:] = dY.sum(0, keepdims=True)
        return dY.dot(self.coeffs.f.T)

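# editor's note: a sketch (not part of the original module) of using Dense
# outside of a Model: feed() propagates shapes, and init() fills the weights
# through a caller-supplied allocator (here just fresh float32 buffers).
def _example_dense():
    inp, dense = Input((3,)), Dense(2)
    inp.feed(dense)
    dense.init(lambda size: (np.zeros(size, dtype=_f), np.zeros(size, dtype=_f)))
    X = np.random.randn(4, 3).astype(_f)
    Y = dense.forward(X)
    dX = dense.backward(np.ones_like(Y))
    assert Y.shape == (4, 2) and dX.shape == X.shape
    return Y, dX
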
# Models {{{1

class Model:
    def __init__(self, nodes_in, nodes_out, unsafe=False):
        nodes_in = [nodes_in] if isinstance(nodes_in, Layer) else nodes_in
        nodes_out = [nodes_out] if isinstance(nodes_out, Layer) else nodes_out
        assert type(nodes_in) == list, type(nodes_in)
        assert type(nodes_out) == list, type(nodes_out)
        self.nodes_in = nodes_in
        self.nodes_out = nodes_out
        self.nodes = traverse_all(self.nodes_in, self.nodes_out)
        self.make_weights()
        for node in self.nodes:
            node.unsafe = unsafe

    @property
    def ordered_nodes(self):
        # deprecated? we don't guarantee an order like we did before.
        return self.nodes

    def make_weights(self):
        self.param_count = sum((node.size for node in self.nodes))
        self.W = np.zeros(self.param_count, dtype=_f)
        self.dW = np.zeros(self.param_count, dtype=_f)

        offset = 0
        for node in self.nodes:
            if node.size > 0:
                inner_offset = 0

                def allocate(size):
                    nonlocal inner_offset
                    o = offset + inner_offset
                    ret = self.W[o:o+size], self.dW[o:o+size]
                    inner_offset += size
                    assert len(ret[0]) == len(ret[1])
                    assert size == len(ret[0]), (size, len(ret[0]))
                    return ret

                node.init(allocate)
                assert inner_offset <= node.size, "Layer {} allocated more weights than it said it would".format(node)
                # i don't care if "less" is grammatically incorrect.
                # you're mom is grammatically incorrect.
                assert inner_offset >= node.size, "Layer {} allocated less weights than it said it would".format(node)
                offset += node.size

    def forward(self, X, deterministic=False):
        values = dict()
        input_node = self.nodes[0]
        output_node = self.nodes[-1]
        values[input_node] = input_node._propagate(np.expand_dims(X, 0), deterministic)
        for node in self.nodes[1:]:
            values[node] = node.propagate(values, deterministic)
        return values[output_node]

    def backward(self, error):
        values = dict()
        output_node = self.nodes[-1]
        values[output_node] = output_node._backpropagate(np.expand_dims(error, 0))
        for node in reversed(self.nodes[:-1]):
            values[node] = node.backpropagate(values)
        return self.dW

    def regulate_forward(self):
        loss = _0
        for node in self.nodes:
            if node.loss is not None:
                loss += node.loss
            for k, w in node.weights.items():
                loss += w.forward()
        return loss

    def regulate(self):
        for node in self.nodes:
            for k, w in node.weights.items():
                w.update()

    def load_weights(self, fn):
        # seemingly compatible with keras' Dense layers.
        import h5py
        open(fn).close()  # just ensure the file exists (python's error is better)
        f = h5py.File(fn, 'r')
        weights = {}
        def visitor(name, obj):
            if isinstance(obj, h5py.Dataset):
                weights[name.split('/')[-1]] = np.array(obj[:], dtype=_f)
        f.visititems(visitor)
        f.close()

        used = {}
        for k in weights.keys():
            used[k] = False

        nodes = [node for node in self.nodes if node.size > 0]
        for node in nodes:
            full_name = str(node).lower()
            for s_name, o_name in node.serialized.items():
                key = full_name + '_' + s_name
                data = weights[key]
                target = getattr(node, o_name)
                target.f[:] = data
                used[key] = True

        for k, v in used.items():
            if not v:
                lament("WARNING: unused weight", k)

    def save_weights(self, fn, overwrite=False):
        import h5py
        f = h5py.File(fn, 'w')

        counts = defaultdict(lambda: 0)

        nodes = [node for node in self.nodes if node.size > 0]
        for node in nodes:
            full_name = str(node).lower()
            grp = f.create_group(full_name)
            for s_name, o_name in node.serialized.items():
                key = full_name + '_' + s_name
                target = getattr(node, o_name)
                data = grp.create_dataset(key, target.shape, dtype=_f)
                data[:] = target.f
                counts[key] += 1
                if counts[key] > 1:
                    lament("WARNING: rewrote weight", key)

        f.close()

    def print_graph(self, file=sys.stdout):
        print('digraph G {', file=file)
        for node in self.nodes:
            children = [str(n) for n in node.children]
            if children:
                sep = '->'
                print('\t' + str(node) + sep + (';\n\t' + str(node) + sep).join(children) + ';', file=file)
        print('}', file=file)

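# editor's note: a sketch (not part of the original module) of assembling a
# Model from layers; feed() returns the child, so a chain can be built
# left to right before wrapping the endpoints in a Model.
def _example_model(features=8, hidden=16, classes=4):
    x = Input((features,))
    y = x.feed(Dense(hidden))
    y = y.feed(GeluApprox())
    y = y.feed(Dense(classes))
    y = y.feed(Softmax())
    model = Model(x, y)
    p = model.forward(np.random.randn(5, features).astype(_f), deterministic=True)
    assert p.shape == (5, classes) and np.allclose(np.sum(p, axis=-1), 1, atol=1e-5)
    return model
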
# Rituals {{{1

class Ritual:  # i'm just making up names at this point.
    def __init__(self, learner=None, loss=None, mloss=None):
        self.learner = learner if learner is not None else Learner(Optimizer())
        self.loss = loss if loss is not None else Squared()
        self.mloss = mloss if mloss is not None else self.loss
        self.model = None

    def reset(self):
        self.learner.reset(optim=True)
        self.en = 0
        self.bn = 0

    def learn(self, inputs, outputs):
        predicted = self.model.forward(inputs)
        self.model.backward(self.loss.backward(predicted, outputs))
        self.model.regulate()
        return predicted

    def update(self):
        self.learner.optim.update(self.model.dW, self.model.W)

    def prepare(self, model):
        self.en = 0
        self.bn = 0
        self.model = model

    def _train_batch(self, batch_inputs, batch_outputs, b, batch_count,
                     test_only=False, loss_logging=False, mloss_logging=True):
        if not test_only and self.learner.per_batch:
            self.learner.batch(b / batch_count)

        if test_only:
            predicted = self.model.forward(batch_inputs, deterministic=True)
        else:
            predicted = self.learn(batch_inputs, batch_outputs)
            self.model.regulate_forward()
            self.update()

        if loss_logging:
            batch_loss = self.loss.forward(predicted, batch_outputs)
            if np.isnan(batch_loss):
                raise Exception("nan")
            self.losses.append(batch_loss)
            self.cumsum_loss += batch_loss

        if mloss_logging:
            # NOTE: this can use the non-deterministic predictions. fixme?
            batch_mloss = self.mloss.forward(predicted, batch_outputs)
            if np.isnan(batch_mloss):
                raise Exception("nan")
            self.mlosses.append(batch_mloss)
            self.cumsum_mloss += batch_mloss

    def train_batched(self, inputs_or_generator, outputs_or_batch_count,
                      batch_size=None,
                      return_losses=False, test_only=False, shuffle=True):
        assert isinstance(return_losses, bool) or return_losses == 'both'

        gen = isinstance(inputs_or_generator, types.GeneratorType)
        if gen:
            generator = inputs_or_generator
            batch_count = outputs_or_batch_count
            assert isinstance(batch_count, int), type(batch_count)
        else:
            inputs = inputs_or_generator
            outputs = outputs_or_batch_count

        if not test_only:
            self.en += 1

        if shuffle:
            if gen:
                raise Exception("shuffling is incompatible with using a generator.")
            indices = np.arange(inputs.shape[0])
            np.random.shuffle(indices)
            inputs = inputs[indices]
            outputs = outputs[indices]

        self.cumsum_loss, self.cumsum_mloss = _0, _0
        self.losses, self.mlosses = [], []

        if not gen:
            batch_count = inputs.shape[0] // batch_size
            # TODO: lift this restriction
            assert inputs.shape[0] % batch_size == 0, \
                "inputs is not evenly divisible by batch_size"

        prev_batch_size = None
        for b in range(batch_count):
            if not test_only:
                self.bn += 1

            if gen:
                # TODO: pass self as an argument to the generator.
                # ...is there a pythonic way of doing that?
                # as it turns out, there is! (untested code)
                # in the generator: model = yield
                #                   yield dostuff(model)
                # in here: generator.send()
                #          stuff = next(generator)
                batch_inputs, batch_outputs = next(generator)
                batch_size = batch_inputs.shape[0]
                # TODO: lift this restriction
                assert batch_size == prev_batch_size or prev_batch_size is None, \
                    "non-constant batch size (got {}, expected {})".format(batch_size, prev_batch_size)
            else:
                bi = b * batch_size
                batch_inputs = inputs[bi:bi+batch_size]
                batch_outputs = outputs[bi:bi+batch_size]

            self._train_batch(batch_inputs, batch_outputs, b, batch_count,
                              test_only, return_losses == 'both', return_losses)

            prev_batch_size = batch_size

        avg_mloss = self.cumsum_mloss / _f(batch_count)
        if return_losses == 'both':
            avg_loss = self.cumsum_loss / _f(batch_count)
            return avg_loss, avg_mloss, self.losses, self.mlosses
        elif return_losses:
            return avg_mloss, self.mlosses
        return avg_mloss

    def test_batched(self, inputs, outputs, *args, **kwargs):
        return self.train_batched(inputs, outputs, *args,
                                  test_only=True, **kwargs)

    def train_batched_gen(self, generator, batch_count, *args, **kwargs):
        return self.train_batched(generator, batch_count, *args,
                                  shuffle=False, **kwargs)

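# editor's note: a sketch (not part of the original module) of a full training
# loop: Ritual pairs a Learner (schedule + optimizer) with a loss, and
# train_batched() runs one epoch over shuffled data.
def _example_training(epochs=10):
    x = Input((2,))
    y = x.feed(Dense(16))
    y = y.feed(Tanh())
    y = y.feed(Dense(1))
    model = Model(x, y)

    learner = Learner(Adam(lr=0.01), epochs=epochs)
    ritual = Ritual(learner=learner, loss=Squared())
    ritual.prepare(model)

    data = np.random.randn(64, 2).astype(_f)
    targets = np.sum(data, axis=-1, keepdims=True).astype(_f)

    # next() starts the epoch counter at 1 and returns False once
    # learner.epochs has been exceeded.
    while learner.next():
        mloss, _ = ritual.train_batched(data, targets, batch_size=16,
                                        return_losses=True)
    return mloss
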
# Learners {{{1

class Learner:
    per_batch = False

    def __init__(self, optim, epochs=100, rate=None):
        assert isinstance(optim, Optimizer)
        self.optim = optim
        self.start_rate = rate  # None is okay; it'll use optim.lr instead.
        self.epochs = int(epochs)
        self.reset()

    def reset(self, optim=False):
        self.started = False
        self.epoch = 0
        if optim:
            self.optim.reset()

    @property
    def epoch(self):
        return self._epoch

    @epoch.setter
    def epoch(self, new_epoch):
        self._epoch = int(new_epoch)
        if 0 <= self.epoch <= self.epochs:
            self.rate = self.rate_at(self._epoch)

    @property
    def rate(self):
        return self.optim.lr

    @rate.setter
    def rate(self, new_rate):
        self.optim.lr = new_rate

    def rate_at(self, epoch):
        if self.start_rate is None:
            return self.optim.lr
        return self.start_rate

    def next(self):
        # prepares the next epoch. returns whether or not to continue training.
        if not self.started:
            self.started = True
        self.epoch += 1
        if self.epoch > self.epochs:
            return False
        return True

    def batch(self, progress):  # TODO: rename
        # interpolates rates between epochs.
        # unlike epochs, we do not store batch number as a state.
        # i.e. calling next() will not respect progress.
        assert 0 <= progress <= 1
        self.rate = self.rate_at(self._epoch + progress)

    @property
    def final_rate(self):
        return self.rate_at(self.epochs - 1)

class AnnealingLearner(Learner):
    def __init__(self, optim, epochs=100, rate=None, halve_every=10):
        self.halve_every = _f(halve_every)
        self.anneal = _f(0.5**(1/self.halve_every))
        super().__init__(optim, epochs, rate)

    def rate_at(self, epoch):
        return self.start_rate * self.anneal**epoch

def cosmod(x):
    # plot: https://www.desmos.com/calculator/hlgqmyswy2
    return (_1 + np.cos((x % _1) * _pi)) * _inv2

class SGDR(Learner):
    # Stochastic Gradient Descent with Restarts
    # paper: https://arxiv.org/abs/1608.03983
    # NOTE: this is missing a couple of the proposed features.

    per_batch = True

    def __init__(self, optim, epochs=100, rate=None,
                 restarts=0, restart_decay=0.5, callback=None,
                 expando=0):
        self.restart_epochs = int(epochs)
        self.decay = _f(restart_decay)
        self.restarts = int(restarts)
        self.restart_callback = callback

        # TODO: rename expando to something not insane
        self.expando = expando if expando is not None else lambda i: i
        if type(self.expando) == int:
            inc = self.expando
            self.expando = lambda i: i * inc

        self.splits = []
        epochs = 0
        for i in range(0, self.restarts + 1):
            split = epochs + self.restart_epochs + int(self.expando(i))
            self.splits.append(split)
            epochs = split
        super().__init__(optim, epochs, rate)

    def split_num(self, epoch):
        previous = [0] + self.splits
        for i, split in enumerate(self.splits):
            if epoch - 1 < split:
                sub_epoch = epoch - previous[i]
                next_restart = split - previous[i]
                return i, sub_epoch, next_restart
        raise Exception('this should never happen.')

    def rate_at(self, epoch):
        restart, sub_epoch, next_restart = self.split_num(max(1, epoch))
        x = _f(sub_epoch - 1) / _f(next_restart)
        return self.start_rate * self.decay**_f(restart) * cosmod(x)

    def next(self):
        if not super().next():
            return False
        restart, sub_epoch, next_restart = self.split_num(self.epoch)
        if restart > 0 and sub_epoch == 1:
            if self.restart_callback is not None:
                self.restart_callback(restart)
        return True

class TriangularCLR(Learner):
    # note: i haven't actually read (nor seen) the paper(s) on CLR,
    # but this case (triangular) should be pretty difficult to get wrong.

    per_batch = True

    def __init__(self, optim, epochs=400, upper_rate=None, lower_rate=0,
                 frequency=100, callback=None):
        # NOTE: start_rate is treated as upper_rate
        self.frequency = int(frequency)
        assert self.frequency > 0
        self.callback = callback
        self.lower_rate = _f(lower_rate)
        super().__init__(optim, epochs, upper_rate)

    def _t(self, epoch):
        # NOTE: this could probably be simplified
        offset = self.frequency / 2
        return np.abs(((epoch - 1 + offset) % self.frequency) - offset) / offset

    def rate_at(self, epoch):
        # NOTE: start_rate is treated as upper_rate
        return self._t(epoch) * (self.start_rate - self.lower_rate) + self.lower_rate

    def next(self):
        if not super().next():
            return False
        e = self.epoch - 1
        if e > 0 and e % self.frequency == 0:
            if self.callback is not None:
                self.callback(self.epoch // self.frequency)
        return True

class SineCLR(TriangularCLR):
    def _t(self, epoch):
        return np.sin(_pi * _inv2 * super()._t(epoch))

class WaveCLR(TriangularCLR):
    def _t(self, epoch):
        return _inv2 * (_1 - np.cos(_pi * super()._t(epoch)))

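# editor's note: a sketch (not part of the original module) of how the
# per-batch learners evaluate their schedule: the integer part of the value
# passed to rate_at() is the current epoch (starting at 1) and the fractional
# part is the progress through that epoch, which is what Learner.batch() adds.
def _example_schedule():
    learner = SGDR(Optimizer(lr=1.0), epochs=4, rate=1.0, restarts=1)
    rates = []
    while learner.next():
        for b in range(5):  # pretend there are 5 batches per epoch
            learner.batch(b / 5)
            rates.append(learner.rate)
    # the rate decays from 1.0 toward 0 within each restart period.
    assert np.isclose(rates[0], 1.0) and rates[-1] < 0.2
    return rates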