.
This commit is contained in:
parent
745e012a7a
commit
77ba4fa11b
838
optim_nn.py
838
optim_nn.py
|
@ -3,48 +3,15 @@
|
||||||
# external packages required for full functionality:
|
# external packages required for full functionality:
|
||||||
# numpy scipy h5py sklearn dotmap
|
# numpy scipy h5py sklearn dotmap
|
||||||
|
|
||||||
import numpy as np
|
from optim_nn_core import *
|
||||||
# ugly shorthand:
|
|
||||||
nf = np.float32
|
|
||||||
nfa = lambda x: np.array(x, dtype=nf)
|
|
||||||
ni = np.int
|
|
||||||
nia = lambda x: np.array(x, dtype=ni)
|
|
||||||
|
|
||||||
# just for speed, not strictly essential:
|
import sys
|
||||||
from scipy.special import expit as sigmoid
|
lament = lambda *args, **kwargs: print(*args, file=sys.stderr, **kwargs)
|
||||||
|
|
||||||
# used for numbering layers like Keras:
|
def log(left, right):
|
||||||
from collections import defaultdict
|
lament("{:>20}: {}".format(left, right))
|
||||||
_layer_counters = defaultdict(lambda: 0)
|
|
||||||
|
|
||||||
# Initializations
|
# Loss functions {{{1
|
||||||
|
|
||||||
# note: these are currently only implemented for 2D shapes.
|
|
||||||
|
|
||||||
def init_he_normal(size, ins, outs):
|
|
||||||
s = np.sqrt(2 / ins)
|
|
||||||
return np.random.normal(0, s, size=size)
|
|
||||||
|
|
||||||
def init_he_uniform(size, ins, outs):
|
|
||||||
s = np.sqrt(6 / ins)
|
|
||||||
return np.random.uniform(-s, s, size=size)
|
|
||||||
|
|
||||||
# Loss functions
|
|
||||||
|
|
||||||
class Loss:
|
|
||||||
def mean(self, r):
|
|
||||||
return np.average(self.f(r))
|
|
||||||
|
|
||||||
def dmean(self, r):
|
|
||||||
d = self.df(r)
|
|
||||||
return d / len(d)
|
|
||||||
|
|
||||||
class Squared(Loss):
|
|
||||||
def f(self, r):
|
|
||||||
return np.square(r)
|
|
||||||
|
|
||||||
def df(self, r):
|
|
||||||
return 2 * r
|
|
||||||
|
|
||||||
class SquaredHalved(Loss):
|
class SquaredHalved(Loss):
|
||||||
def f(self, r):
|
def f(self, r):
|
||||||
|
@ -53,13 +20,6 @@ class SquaredHalved(Loss):
|
||||||
def df(self, r):
|
def df(self, r):
|
||||||
return r
|
return r
|
||||||
|
|
||||||
class Absolute(Loss):
|
|
||||||
def f(self, r):
|
|
||||||
return np.abs(r)
|
|
||||||
|
|
||||||
def df(self, r):
|
|
||||||
return np.sign(r)
|
|
||||||
|
|
||||||
class SomethingElse(Loss):
|
class SomethingElse(Loss):
|
||||||
# generalizes Absolute and SquaredHalved (|dx| = 1)
|
# generalizes Absolute and SquaredHalved (|dx| = 1)
|
||||||
# plot: https://www.desmos.com/calculator/fagjg9vuz7
|
# plot: https://www.desmos.com/calculator/fagjg9vuz7
|
||||||
|
@ -75,347 +35,7 @@ class SomethingElse(Loss):
|
||||||
def df(self, r):
|
def df(self, r):
|
||||||
return np.sign(r) * np.abs(r)**self.c
|
return np.sign(r) * np.abs(r)**self.c
|
||||||
|
|
||||||
# Optimizers
|
# Parametric Layers {{{1
|
||||||
|
|
||||||
class Optimizer:
|
|
||||||
def __init__(self, alpha=0.1):
|
|
||||||
self.alpha = nf(alpha)
|
|
||||||
self.reset()
|
|
||||||
|
|
||||||
def reset(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
def compute(self, dW, W):
|
|
||||||
return -self.alpha * dW
|
|
||||||
|
|
||||||
def update(self, dW, W):
|
|
||||||
W += self.compute(dW, W)
|
|
||||||
|
|
||||||
# the following optimizers are blatantly lifted from tiny-dnn:
|
|
||||||
# https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/optimizers/optimizer.h
|
|
||||||
|
|
||||||
class Momentum(Optimizer):
|
|
||||||
def __init__(self, alpha=0.01, lamb=0, mu=0.9, nesterov=False):
|
|
||||||
self.alpha = np.asfarray(alpha) # learning rate
|
|
||||||
self.lamb = np.asfarray(lamb) # weight decay
|
|
||||||
self.mu = np.asfarray(mu) # momentum
|
|
||||||
self.nesterov = bool(nesterov)
|
|
||||||
|
|
||||||
self.reset()
|
|
||||||
|
|
||||||
def reset(self):
|
|
||||||
self.dWprev = None
|
|
||||||
|
|
||||||
def compute(self, dW, W):
|
|
||||||
if self.dWprev is None:
|
|
||||||
#self.dWprev = np.zeros_like(dW)
|
|
||||||
self.dWprev = np.copy(dW)
|
|
||||||
|
|
||||||
V = self.mu * self.dWprev - self.alpha * (dW + W * self.lamb)
|
|
||||||
self.dWprev[:] = V
|
|
||||||
if self.nesterov: # TODO: is this correct? looks weird
|
|
||||||
return self.mu * V - self.alpha * (dW + W * self.lamb)
|
|
||||||
else:
|
|
||||||
return V
|
|
||||||
|
|
||||||
class Adam(Optimizer):
|
|
||||||
def __init__(self, alpha=0.001, b1=0.9, b2=0.999, b1_t=0.9, b2_t=0.999, eps=1e-8):
|
|
||||||
self.alpha = nf(alpha) # learning rate
|
|
||||||
self.b1 = nf(b1) # decay term
|
|
||||||
self.b2 = nf(b2) # decay term
|
|
||||||
self.b1_t_default = nf(b1_t) # decay term power t
|
|
||||||
self.b2_t_default = nf(b2_t) # decay term power t
|
|
||||||
self.eps = nf(eps)
|
|
||||||
|
|
||||||
self.reset()
|
|
||||||
|
|
||||||
def reset(self):
|
|
||||||
self.mt = None
|
|
||||||
self.vt = None
|
|
||||||
self.b1_t = self.b1_t_default
|
|
||||||
self.b2_t = self.b2_t_default
|
|
||||||
|
|
||||||
def compute(self, dW, W):
|
|
||||||
if self.mt is None:
|
|
||||||
self.mt = np.zeros_like(W)
|
|
||||||
if self.vt is None:
|
|
||||||
self.vt = np.zeros_like(W)
|
|
||||||
|
|
||||||
# decay
|
|
||||||
self.b1_t *= self.b1
|
|
||||||
self.b2_t *= self.b2
|
|
||||||
|
|
||||||
self.mt[:] = self.b1 * self.mt + (1 - self.b1) * dW
|
|
||||||
self.vt[:] = self.b2 * self.vt + (1 - self.b2) * dW * dW
|
|
||||||
|
|
||||||
return -self.alpha * (self.mt / (1 - self.b1_t)) \
|
|
||||||
/ np.sqrt((self.vt / (1 - self.b2_t)) + self.eps)
|
|
||||||
|
|
||||||
# Abstract Layers
|
|
||||||
|
|
||||||
class Layer:
|
|
||||||
def __init__(self):
|
|
||||||
self.parents = []
|
|
||||||
self.children = []
|
|
||||||
self.input_shape = None
|
|
||||||
self.output_shape = None
|
|
||||||
kind = self.__class__.__name__
|
|
||||||
global _layer_counters
|
|
||||||
_layer_counters[kind] += 1
|
|
||||||
self.name = "{}_{}".format(kind, _layer_counters[kind])
|
|
||||||
self.size = None # total weight count (if any)
|
|
||||||
self.unsafe = False # disables assertions for better performance
|
|
||||||
|
|
||||||
def __str__(self):
|
|
||||||
return self.name
|
|
||||||
|
|
||||||
# methods we might want to override:
|
|
||||||
|
|
||||||
def F(self, X):
|
|
||||||
raise NotImplementedError("unimplemented", self)
|
|
||||||
|
|
||||||
def dF(self, dY):
|
|
||||||
raise NotImplementedError("unimplemented", self)
|
|
||||||
|
|
||||||
def do_feed(self, child):
|
|
||||||
self.children.append(child)
|
|
||||||
|
|
||||||
def be_fed(self, parent):
|
|
||||||
self.parents.append(parent)
|
|
||||||
|
|
||||||
def make_shape(self, shape):
|
|
||||||
if not self.unsafe:
|
|
||||||
assert shape is not None
|
|
||||||
if self.output_shape is None:
|
|
||||||
self.output_shape = shape
|
|
||||||
return shape
|
|
||||||
|
|
||||||
# TODO: rename this multi and B crap to something actually relevant.
|
|
||||||
|
|
||||||
def multi(self, B):
|
|
||||||
if not self.unsafe:
|
|
||||||
assert len(B) == 1, self
|
|
||||||
return self.F(B[0])
|
|
||||||
|
|
||||||
def dmulti(self, dB):
|
|
||||||
if len(dB) == 1:
|
|
||||||
return self.dF(dB[0])
|
|
||||||
else:
|
|
||||||
dX = None
|
|
||||||
for dY in dB:
|
|
||||||
if dX is None:
|
|
||||||
dX = self.dF(dY)
|
|
||||||
else:
|
|
||||||
dX += self.dF(dY)
|
|
||||||
return dX
|
|
||||||
|
|
||||||
# general utility methods:
|
|
||||||
|
|
||||||
def compatible(self, parent):
|
|
||||||
if self.input_shape is None:
|
|
||||||
# inherit shape from output
|
|
||||||
shape = self.make_shape(parent.output_shape)
|
|
||||||
if shape is None:
|
|
||||||
return False
|
|
||||||
self.input_shape = shape
|
|
||||||
if np.all(self.input_shape == parent.output_shape):
|
|
||||||
return True
|
|
||||||
else:
|
|
||||||
return False
|
|
||||||
|
|
||||||
def feed(self, child):
|
|
||||||
if not child.compatible(self):
|
|
||||||
fmt = "{} is incompatible with {}: shape mismatch: {} vs. {}"
|
|
||||||
raise Exception(fmt.format(self, child, self.output_shape, child.input_shape))
|
|
||||||
self.do_feed(child)
|
|
||||||
child.be_fed(self)
|
|
||||||
return child
|
|
||||||
|
|
||||||
def validate_input(self, X):
|
|
||||||
assert X.shape[1:] == self.input_shape, (str(self), X.shape[1:], self.input_shape)
|
|
||||||
|
|
||||||
def validate_output(self, Y):
|
|
||||||
assert Y.shape[1:] == self.output_shape, (str(self), Y.shape[1:], self.output_shape)
|
|
||||||
|
|
||||||
def forward(self, lut):
|
|
||||||
if not self.unsafe:
|
|
||||||
assert len(self.parents) > 0, self
|
|
||||||
B = []
|
|
||||||
for parent in self.parents:
|
|
||||||
# TODO: skip over irrelevant nodes (if any)
|
|
||||||
X = lut[parent]
|
|
||||||
if not self.unsafe:
|
|
||||||
self.validate_input(X)
|
|
||||||
B.append(X)
|
|
||||||
Y = self.multi(B)
|
|
||||||
if not self.unsafe:
|
|
||||||
self.validate_output(Y)
|
|
||||||
return Y
|
|
||||||
|
|
||||||
def backward(self, lut):
|
|
||||||
if not self.unsafe:
|
|
||||||
assert len(self.children) > 0, self
|
|
||||||
dB = []
|
|
||||||
for child in self.children:
|
|
||||||
# TODO: skip over irrelevant nodes (if any)
|
|
||||||
dY = lut[child]
|
|
||||||
if not self.unsafe:
|
|
||||||
self.validate_output(dY)
|
|
||||||
dB.append(dY)
|
|
||||||
dX = self.dmulti(dB)
|
|
||||||
if not self.unsafe:
|
|
||||||
self.validate_input(dX)
|
|
||||||
return dX
|
|
||||||
|
|
||||||
# Final Layers
|
|
||||||
|
|
||||||
class Sum(Layer):
|
|
||||||
def multi(self, B):
|
|
||||||
return np.sum(B, axis=0)
|
|
||||||
|
|
||||||
def dmulti(self, dB):
|
|
||||||
#assert len(dB) == 1, "unimplemented"
|
|
||||||
return dB[0] # TODO: does this always work?
|
|
||||||
|
|
||||||
class Input(Layer):
|
|
||||||
def __init__(self, shape):
|
|
||||||
assert shape is not None
|
|
||||||
super().__init__()
|
|
||||||
self.shape = tuple(shape)
|
|
||||||
self.input_shape = self.shape
|
|
||||||
self.output_shape = self.shape
|
|
||||||
|
|
||||||
def F(self, X):
|
|
||||||
return X
|
|
||||||
|
|
||||||
def dF(self, dY):
|
|
||||||
#self.dY = dY
|
|
||||||
return np.zeros_like(dY)
|
|
||||||
|
|
||||||
class Affine(Layer):
|
|
||||||
def __init__(self, a=1, b=0):
|
|
||||||
super().__init__()
|
|
||||||
self.a = nf(a)
|
|
||||||
self.b = nf(b)
|
|
||||||
|
|
||||||
def F(self, X):
|
|
||||||
return self.a * X + self.b
|
|
||||||
|
|
||||||
def dF(self, dY):
|
|
||||||
return dY * self.a
|
|
||||||
|
|
||||||
class Sigmoid(Layer): # aka Logistic
|
|
||||||
def F(self, X):
|
|
||||||
self.sig = sigmoid(X)
|
|
||||||
return X * self.sig
|
|
||||||
|
|
||||||
def dF(self, dY):
|
|
||||||
return dY * self.sig * (1 - self.sig)
|
|
||||||
|
|
||||||
class Tanh(Layer):
|
|
||||||
def F(self, X):
|
|
||||||
self.sig = np.tanh(X)
|
|
||||||
return X * self.sig
|
|
||||||
|
|
||||||
def dF(self, dY):
|
|
||||||
return dY * (1 - self.sig * self.sig)
|
|
||||||
|
|
||||||
class Relu(Layer):
|
|
||||||
def F(self, X):
|
|
||||||
self.cond = X >= 0
|
|
||||||
return np.where(self.cond, X, 0)
|
|
||||||
|
|
||||||
def dF(self, dY):
|
|
||||||
return np.where(self.cond, dY, 0)
|
|
||||||
|
|
||||||
class Elu(Layer):
|
|
||||||
# paper: https://arxiv.org/abs/1511.07289
|
|
||||||
|
|
||||||
def __init__(self, alpha=1):
|
|
||||||
super().__init__()
|
|
||||||
self.alpha = nf(alpha)
|
|
||||||
|
|
||||||
def F(self, X):
|
|
||||||
self.cond = X >= 0
|
|
||||||
self.neg = np.exp(X) - 1
|
|
||||||
return np.where(self.cond, X, self.neg)
|
|
||||||
|
|
||||||
def dF(self, dY):
|
|
||||||
return dY * np.where(self.cond, 1, self.neg + 1)
|
|
||||||
|
|
||||||
class GeluApprox(Layer):
|
|
||||||
# paper: https://arxiv.org/abs/1606.08415
|
|
||||||
# plot: https://www.desmos.com/calculator/ydzgtccsld
|
|
||||||
|
|
||||||
def F(self, X):
|
|
||||||
self.a = 1.704 * X
|
|
||||||
self.sig = sigmoid(self.a)
|
|
||||||
return X * self.sig
|
|
||||||
|
|
||||||
def dF(self, dY):
|
|
||||||
return dY * self.sig * (1 + self.a * (1 - self.sig))
|
|
||||||
|
|
||||||
class Dense(Layer):
|
|
||||||
def __init__(self, dim, init=init_he_uniform):
|
|
||||||
super().__init__()
|
|
||||||
self.dim = ni(dim)
|
|
||||||
self.output_shape = (dim,)
|
|
||||||
self.weight_init = init
|
|
||||||
self.size = None
|
|
||||||
|
|
||||||
def make_shape(self, shape):
|
|
||||||
super().make_shape(shape)
|
|
||||||
if len(shape) != 1:
|
|
||||||
return False
|
|
||||||
self.nW = self.dim * shape[0]
|
|
||||||
self.nb = self.dim
|
|
||||||
self.size = self.nW + self.nb
|
|
||||||
return shape
|
|
||||||
|
|
||||||
def init(self, W, dW):
|
|
||||||
ins, outs = self.input_shape[0], self.output_shape[0]
|
|
||||||
|
|
||||||
self.W = W
|
|
||||||
self.dW = dW
|
|
||||||
self.coeffs = self.W[:self.nW].reshape(ins, outs)
|
|
||||||
self.biases = self.W[self.nW:].reshape(1, outs)
|
|
||||||
self.dcoeffs = self.dW[:self.nW].reshape(ins, outs)
|
|
||||||
self.dbiases = self.dW[self.nW:].reshape(1, outs)
|
|
||||||
|
|
||||||
self.coeffs.flat = self.weight_init(self.nW, ins, outs)
|
|
||||||
self.biases.flat = 0
|
|
||||||
|
|
||||||
self.std = np.std(self.W)
|
|
||||||
|
|
||||||
def F(self, X):
|
|
||||||
self.X = X
|
|
||||||
Y = X.dot(self.coeffs) + self.biases
|
|
||||||
return Y
|
|
||||||
|
|
||||||
def dF(self, dY):
|
|
||||||
dX = dY.dot(self.coeffs.T)
|
|
||||||
self.dcoeffs[:] = self.X.T.dot(dY)
|
|
||||||
self.dbiases[:] = dY.sum(0, keepdims=True)
|
|
||||||
return dX
|
|
||||||
|
|
||||||
class DenseOneLess(Dense):
|
|
||||||
def init(self, W, dW):
|
|
||||||
super().init(W, dW)
|
|
||||||
ins, outs = self.input_shape[0], self.output_shape[0]
|
|
||||||
assert ins == outs, (ins, outs)
|
|
||||||
|
|
||||||
def F(self, X):
|
|
||||||
np.fill_diagonal(self.coeffs, 0)
|
|
||||||
self.X = X
|
|
||||||
Y = X.dot(self.coeffs) + self.biases
|
|
||||||
return Y
|
|
||||||
|
|
||||||
def dF(self, dY):
|
|
||||||
dX = dY.dot(self.coeffs.T)
|
|
||||||
self.dcoeffs[:] = self.X.T.dot(dY)
|
|
||||||
self.dbiases[:] = dY.sum(0, keepdims=True)
|
|
||||||
np.fill_diagonal(self.dcoeffs, 0)
|
|
||||||
return dX
|
|
||||||
|
|
||||||
class LayerNorm(Layer):
|
class LayerNorm(Layer):
|
||||||
# paper: https://arxiv.org/abs/1607.06450
|
# paper: https://arxiv.org/abs/1607.06450
|
||||||
|
@ -446,161 +66,7 @@ class LayerNorm(Layer):
|
||||||
|
|
||||||
return dX
|
return dX
|
||||||
|
|
||||||
# Model
|
# Rituals {{{1
|
||||||
|
|
||||||
class Model:
|
|
||||||
def __init__(self, x, y, unsafe=False):
|
|
||||||
assert isinstance(x, Layer), x
|
|
||||||
assert isinstance(y, Layer), y
|
|
||||||
self.x = x
|
|
||||||
self.y = y
|
|
||||||
self.ordered_nodes = self.traverse([], self.y)
|
|
||||||
self.make_weights()
|
|
||||||
for node in self.ordered_nodes:
|
|
||||||
node.unsafe = unsafe
|
|
||||||
|
|
||||||
def make_weights(self):
|
|
||||||
self.param_count = 0
|
|
||||||
for node in self.ordered_nodes:
|
|
||||||
if node.size is not None:
|
|
||||||
self.param_count += node.size
|
|
||||||
self.W = np.zeros(self.param_count, dtype=nf)
|
|
||||||
self.dW = np.zeros(self.param_count, dtype=nf)
|
|
||||||
|
|
||||||
offset = 0
|
|
||||||
for node in self.ordered_nodes:
|
|
||||||
if node.size is not None:
|
|
||||||
end = offset + node.size
|
|
||||||
node.init(self.W[offset:end], self.dW[offset:end])
|
|
||||||
offset += node.size
|
|
||||||
|
|
||||||
def traverse(self, nodes, node):
|
|
||||||
if node == self.x:
|
|
||||||
return [node]
|
|
||||||
for parent in node.parents:
|
|
||||||
if parent not in nodes:
|
|
||||||
new_nodes = self.traverse(nodes, parent)
|
|
||||||
for new_node in new_nodes:
|
|
||||||
if new_node not in nodes:
|
|
||||||
nodes.append(new_node)
|
|
||||||
if nodes:
|
|
||||||
nodes.append(node)
|
|
||||||
return nodes
|
|
||||||
|
|
||||||
def forward(self, X):
|
|
||||||
lut = dict()
|
|
||||||
input_node = self.ordered_nodes[0]
|
|
||||||
output_node = self.ordered_nodes[-1]
|
|
||||||
lut[input_node] = input_node.multi(np.expand_dims(X, 0))
|
|
||||||
for node in self.ordered_nodes[1:]:
|
|
||||||
lut[node] = node.forward(lut)
|
|
||||||
return lut[output_node]
|
|
||||||
|
|
||||||
def backward(self, error):
|
|
||||||
lut = dict()
|
|
||||||
input_node = self.ordered_nodes[0]
|
|
||||||
output_node = self.ordered_nodes[-1]
|
|
||||||
lut[output_node] = output_node.dmulti(np.expand_dims(error, 0))
|
|
||||||
for node in reversed(self.ordered_nodes[:-1]):
|
|
||||||
lut[node] = node.backward(lut)
|
|
||||||
#return lut[input_node] # meaningless value
|
|
||||||
return self.dW
|
|
||||||
|
|
||||||
def load_weights(self, fn):
|
|
||||||
# seemingly compatible with keras' Dense layers.
|
|
||||||
# ignores any non-Dense layer types.
|
|
||||||
# TODO: assert file actually exists
|
|
||||||
import h5py
|
|
||||||
f = h5py.File(fn)
|
|
||||||
weights = {}
|
|
||||||
def visitor(name, obj):
|
|
||||||
if isinstance(obj, h5py.Dataset):
|
|
||||||
weights[name.split('/')[-1]] = nfa(obj[:])
|
|
||||||
f.visititems(visitor)
|
|
||||||
f.close()
|
|
||||||
|
|
||||||
denses = [node for node in self.ordered_nodes if isinstance(node, Dense)]
|
|
||||||
for i in range(len(denses)):
|
|
||||||
a, b = i, i + 1
|
|
||||||
b_name = "dense_{}".format(b)
|
|
||||||
# TODO: write a Dense method instead of assigning directly
|
|
||||||
denses[a].coeffs[:] = weights[b_name+'_W']
|
|
||||||
denses[a].biases[:] = np.expand_dims(weights[b_name+'_b'], 0)
|
|
||||||
|
|
||||||
def save_weights(self, fn, overwrite=False):
|
|
||||||
import h5py
|
|
||||||
f = h5py.File(fn, 'w')
|
|
||||||
|
|
||||||
denses = [node for node in self.ordered_nodes if isinstance(node, Dense)]
|
|
||||||
for i in range(len(denses)):
|
|
||||||
a, b = i, i + 1
|
|
||||||
b_name = "dense_{}".format(b)
|
|
||||||
# TODO: write a Dense method instead of assigning directly
|
|
||||||
grp = f.create_group(b_name)
|
|
||||||
data = grp.create_dataset(b_name+'_W', denses[a].coeffs.shape, dtype=nf)
|
|
||||||
data[:] = denses[a].coeffs
|
|
||||||
data = grp.create_dataset(b_name+'_b', denses[a].biases.shape, dtype=nf)
|
|
||||||
data[:] = denses[a].biases
|
|
||||||
|
|
||||||
f.close()
|
|
||||||
|
|
||||||
class Ritual: # i'm just making up names at this point
|
|
||||||
def __init__(self, learner=None, loss=None, mloss=None):
|
|
||||||
self.learner = learner if learner is not None else Learner(Optimizer())
|
|
||||||
self.loss = loss if loss is not None else Squared()
|
|
||||||
self.mloss = mloss if mloss is not None else loss
|
|
||||||
|
|
||||||
def reset(self):
|
|
||||||
self.learner.reset(optim=True)
|
|
||||||
|
|
||||||
def measure(self, residual):
|
|
||||||
return self.mloss.mean(residual)
|
|
||||||
|
|
||||||
def derive(self, residual):
|
|
||||||
return self.loss.dmean(residual)
|
|
||||||
|
|
||||||
def learn(self, inputs, outputs):
|
|
||||||
predicted = self.model.forward(inputs)
|
|
||||||
residual = predicted - outputs
|
|
||||||
self.model.backward(self.derive(residual))
|
|
||||||
return residual
|
|
||||||
|
|
||||||
def update(self):
|
|
||||||
self.learner.optim.update(self.model.dW, self.model.W)
|
|
||||||
|
|
||||||
def prepare(self, model):
|
|
||||||
self.en = 0
|
|
||||||
self.bn = 0
|
|
||||||
self.model = model
|
|
||||||
|
|
||||||
def train_batched(self, inputs, outputs, batch_size, return_losses=False):
|
|
||||||
self.en += 1
|
|
||||||
cumsum_loss = 0
|
|
||||||
batch_count = inputs.shape[0] // batch_size
|
|
||||||
losses = []
|
|
||||||
for b in range(batch_count):
|
|
||||||
self.bn += 1
|
|
||||||
bi = b * batch_size
|
|
||||||
batch_inputs = inputs[ bi:bi+batch_size]
|
|
||||||
batch_outputs = outputs[bi:bi+batch_size]
|
|
||||||
|
|
||||||
if self.learner.per_batch:
|
|
||||||
self.learner.batch(b / batch_count)
|
|
||||||
|
|
||||||
residual = self.learn(batch_inputs, batch_outputs)
|
|
||||||
self.update()
|
|
||||||
|
|
||||||
batch_loss = self.measure(residual)
|
|
||||||
if np.isnan(batch_loss):
|
|
||||||
raise Exception("nan")
|
|
||||||
cumsum_loss += batch_loss
|
|
||||||
if return_losses:
|
|
||||||
losses.append(batch_loss)
|
|
||||||
avg_loss = cumsum_loss / batch_count
|
|
||||||
if return_losses:
|
|
||||||
return avg_loss, losses
|
|
||||||
else:
|
|
||||||
return avg_loss
|
|
||||||
|
|
||||||
def stochastic_multiply(W, gamma=0.5, allow_negation=True):
|
def stochastic_multiply(W, gamma=0.5, allow_negation=True):
|
||||||
# paper: https://arxiv.org/abs/1606.01981
|
# paper: https://arxiv.org/abs/1606.01981
|
||||||
|
@ -681,72 +147,7 @@ class NoisyRitual(Ritual):
|
||||||
self.model.dW += np.random.normal(0, s, size=size)
|
self.model.dW += np.random.normal(0, s, size=size)
|
||||||
super().update()
|
super().update()
|
||||||
|
|
||||||
class Learner:
|
# Learners {{{1
|
||||||
per_batch = False
|
|
||||||
|
|
||||||
def __init__(self, optim, epochs=100, rate=None):
|
|
||||||
assert isinstance(optim, Optimizer)
|
|
||||||
self.optim = optim
|
|
||||||
self.start_rate = optim.alpha if rate is None else float(rate)
|
|
||||||
self.epochs = int(epochs)
|
|
||||||
self.reset()
|
|
||||||
|
|
||||||
def reset(self, optim=False):
|
|
||||||
self.started = False
|
|
||||||
self.epoch = 0
|
|
||||||
if optim:
|
|
||||||
self.optim.reset()
|
|
||||||
|
|
||||||
@property
|
|
||||||
def epoch(self):
|
|
||||||
return self._epoch
|
|
||||||
|
|
||||||
@epoch.setter
|
|
||||||
def epoch(self, new_epoch):
|
|
||||||
self._epoch = int(new_epoch)
|
|
||||||
self.rate = self.rate_at(self._epoch)
|
|
||||||
|
|
||||||
@property
|
|
||||||
def rate(self):
|
|
||||||
return self.optim.alpha
|
|
||||||
|
|
||||||
@rate.setter
|
|
||||||
def rate(self, new_rate):
|
|
||||||
self.optim.alpha = new_rate
|
|
||||||
|
|
||||||
def rate_at(self, epoch):
|
|
||||||
return self.start_rate
|
|
||||||
|
|
||||||
def next(self):
|
|
||||||
# prepares the next epoch. returns whether or not to continue training.
|
|
||||||
if self.epoch + 1 >= self.epochs:
|
|
||||||
return False
|
|
||||||
if self.started:
|
|
||||||
self.epoch += 1
|
|
||||||
else:
|
|
||||||
self.started = True
|
|
||||||
self.epoch = self.epoch # poke property setter just in case
|
|
||||||
return True
|
|
||||||
|
|
||||||
def batch(self, progress): # TODO: rename
|
|
||||||
# interpolates rates between epochs.
|
|
||||||
# unlike epochs, we do not store batch number as a state.
|
|
||||||
# i.e. calling next() will not respect progress.
|
|
||||||
assert 0 <= progress <= 1
|
|
||||||
self.rate = self.rate_at(self._epoch + progress)
|
|
||||||
|
|
||||||
@property
|
|
||||||
def final_rate(self):
|
|
||||||
return self.rate_at(self.epochs - 1)
|
|
||||||
|
|
||||||
class AnnealingLearner(Learner):
|
|
||||||
def __init__(self, optim, epochs=100, rate=None, halve_every=10):
|
|
||||||
self.halve_every = float(halve_every)
|
|
||||||
self.anneal = 0.5**(1/self.halve_every)
|
|
||||||
super().__init__(optim, epochs, rate)
|
|
||||||
|
|
||||||
def rate_at(self, epoch):
|
|
||||||
return self.start_rate * self.anneal**epoch
|
|
||||||
|
|
||||||
class DumbLearner(AnnealingLearner):
|
class DumbLearner(AnnealingLearner):
|
||||||
# this is my own awful contraption. it's not really "SGD with restarts".
|
# this is my own awful contraption. it's not really "SGD with restarts".
|
||||||
|
@ -773,58 +174,6 @@ class DumbLearner(AnnealingLearner):
|
||||||
self.restart_callback(restart)
|
self.restart_callback(restart)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def cosmod(x):
|
|
||||||
# plot: https://www.desmos.com/calculator/hlgqmyswy2
|
|
||||||
return (1 + np.cos((x % 1) * np.pi)) / 2
|
|
||||||
|
|
||||||
class SGDR(Learner):
|
|
||||||
# Stochastic Gradient Descent with Restarts
|
|
||||||
# paper: https://arxiv.org/abs/1608.03983
|
|
||||||
# NOTE: this is missing a couple features.
|
|
||||||
|
|
||||||
per_batch = True
|
|
||||||
|
|
||||||
def __init__(self, optim, epochs=100, rate=None,
|
|
||||||
restarts=0, restart_decay=0.5, callback=None,
|
|
||||||
expando=None):
|
|
||||||
self.restart_epochs = int(epochs)
|
|
||||||
self.decay = float(restart_decay)
|
|
||||||
self.restarts = int(restarts)
|
|
||||||
self.restart_callback = callback
|
|
||||||
# TODO: rename expando to something not insane
|
|
||||||
self.expando = expando if expando is not None else lambda i: 1
|
|
||||||
|
|
||||||
self.splits = []
|
|
||||||
epochs = 0
|
|
||||||
for i in range(0, self.restarts + 1):
|
|
||||||
split = epochs + int(self.restart_epochs * self.expando(i))
|
|
||||||
self.splits.append(split)
|
|
||||||
epochs = split
|
|
||||||
super().__init__(optim, epochs, rate)
|
|
||||||
|
|
||||||
def split_num(self, epoch):
|
|
||||||
shit = [0] + self.splits # hack
|
|
||||||
for i in range(0, len(self.splits)):
|
|
||||||
if epoch < self.splits[i]:
|
|
||||||
sub_epoch = epoch - shit[i]
|
|
||||||
next_restart = self.splits[i] - shit[i]
|
|
||||||
return i, sub_epoch, next_restart
|
|
||||||
raise Exception('this should never happen.')
|
|
||||||
|
|
||||||
def rate_at(self, epoch):
|
|
||||||
restart, sub_epoch, next_restart = self.split_num(epoch)
|
|
||||||
x = sub_epoch / next_restart
|
|
||||||
return self.start_rate * self.decay**restart * cosmod(x)
|
|
||||||
|
|
||||||
def next(self):
|
|
||||||
if not super().next():
|
|
||||||
return False
|
|
||||||
restart, sub_epoch, next_restart = self.split_num(self.epoch)
|
|
||||||
if restart > 0 and sub_epoch == 0:
|
|
||||||
if self.restart_callback is not None:
|
|
||||||
self.restart_callback(restart)
|
|
||||||
return True
|
|
||||||
|
|
||||||
def multiresnet(x, width, depth, block=2, multi=1,
|
def multiresnet(x, width, depth, block=2, multi=1,
|
||||||
activation=Relu, style='batchless',
|
activation=Relu, style='batchless',
|
||||||
init=init_he_normal):
|
init=init_he_normal):
|
||||||
|
@ -882,6 +231,8 @@ def multiresnet(x, width, depth, block=2, multi=1,
|
||||||
|
|
||||||
return y
|
return y
|
||||||
|
|
||||||
|
# etc. {{{1
|
||||||
|
|
||||||
inits = dict(he_normal=init_he_normal, he_uniform=init_he_uniform)
|
inits = dict(he_normal=init_he_normal, he_uniform=init_he_uniform)
|
||||||
activations = dict(sigmoid=Sigmoid, tanh=Tanh, relu=Relu, elu=Elu, gelu=GeluApprox)
|
activations = dict(sigmoid=Sigmoid, tanh=Tanh, relu=Relu, elu=Elu, gelu=GeluApprox)
|
||||||
|
|
||||||
|
@ -953,74 +304,7 @@ def toy_data(train_samples, valid_samples, problem=2):
|
||||||
|
|
||||||
return (inputs, outputs), (valid_inputs, valid_outputs)
|
return (inputs, outputs), (valid_inputs, valid_outputs)
|
||||||
|
|
||||||
def run(program, args=[]):
|
def model_from_config(config, input_features, output_features):
|
||||||
import sys
|
|
||||||
lament = lambda *args, **kwargs: print(*args, file=sys.stderr, **kwargs)
|
|
||||||
def log(left, right):
|
|
||||||
lament("{:>20}: {}".format(left, right))
|
|
||||||
|
|
||||||
# Config
|
|
||||||
|
|
||||||
from dotmap import DotMap
|
|
||||||
config = DotMap(
|
|
||||||
fn_load = None,
|
|
||||||
fn_save = 'optim_nn.h5',
|
|
||||||
log_fn = 'losses.npz',
|
|
||||||
|
|
||||||
# multi-residual network parameters
|
|
||||||
res_width = 28,
|
|
||||||
res_depth = 2,
|
|
||||||
res_block = 3, # normally 2 for plain resnet
|
|
||||||
res_multi = 2, # normally 1 for plain resnet
|
|
||||||
|
|
||||||
# style of resnet (order of layers, which layers, etc.)
|
|
||||||
parallel_style = 'onelesssum',
|
|
||||||
activation = 'gelu',
|
|
||||||
|
|
||||||
optim = 'adam',
|
|
||||||
nesterov = False, # only used with SGD or Adam
|
|
||||||
momentum = 0.50, # only used with SGD
|
|
||||||
batch_size = 64,
|
|
||||||
|
|
||||||
# learning parameters
|
|
||||||
learner = 'sgdr',
|
|
||||||
learn = 1e-2,
|
|
||||||
learn_halve_every = 16, # unused with SGDR
|
|
||||||
learn_restart_advance = 16, # unused with SGDR
|
|
||||||
epochs = 24,
|
|
||||||
restarts = 2,
|
|
||||||
restart_decay = 0.25, # only used with SGDR
|
|
||||||
expando = lambda i: i + 1,
|
|
||||||
|
|
||||||
# misc
|
|
||||||
init = 'he_normal',
|
|
||||||
loss = 'mse',
|
|
||||||
mloss = 'mse',
|
|
||||||
ritual = 'default',
|
|
||||||
restart_optim = False, # restarts also reset internal state of optimizer
|
|
||||||
|
|
||||||
problem = 3,
|
|
||||||
# best results for ~10,000 parameters
|
|
||||||
# (keep these paired; update both at the same time!)
|
|
||||||
train_compare = 1.854613e-05,
|
|
||||||
valid_compare = 1.623881e-05,
|
|
||||||
|
|
||||||
unsafe = True, # aka gotta go fast mode
|
|
||||||
)
|
|
||||||
|
|
||||||
for k in ['parallel_style', 'activation', 'optim', 'learner', 'init', 'loss', 'mloss', 'ritual']:
|
|
||||||
config[k] = config[k].lower()
|
|
||||||
|
|
||||||
config.pprint()
|
|
||||||
|
|
||||||
# toy data
|
|
||||||
# (our model is probably complete overkill for this, so TODO: better data)
|
|
||||||
|
|
||||||
(inputs, outputs), (valid_inputs, valid_outputs) = \
|
|
||||||
toy_data(2**14, 2**11, problem=config.problem)
|
|
||||||
input_features = inputs.shape[-1]
|
|
||||||
output_features = outputs.shape[-1]
|
|
||||||
|
|
||||||
# Our Test Model
|
# Our Test Model
|
||||||
|
|
||||||
init = inits[config.init]
|
init = inits[config.init]
|
||||||
|
@ -1038,17 +322,6 @@ def run(program, args=[]):
|
||||||
|
|
||||||
model = Model(x, y, unsafe=config.unsafe)
|
model = Model(x, y, unsafe=config.unsafe)
|
||||||
|
|
||||||
if 0:
|
|
||||||
node_names = ' '.join([str(node) for node in model.ordered_nodes])
|
|
||||||
log('{} nodes'.format(len(model.ordered_nodes)), node_names)
|
|
||||||
else:
|
|
||||||
for node in model.ordered_nodes:
|
|
||||||
children = [str(n) for n in node.children]
|
|
||||||
if len(children) > 0:
|
|
||||||
sep = '->'
|
|
||||||
print(str(node)+sep+('\n'+str(node)+sep).join(children))
|
|
||||||
log('parameters', model.param_count)
|
|
||||||
|
|
||||||
#
|
#
|
||||||
|
|
||||||
training = config.epochs > 0 and config.restarts >= 0
|
training = config.epochs > 0 and config.restarts >= 0
|
||||||
|
@ -1129,6 +402,92 @@ def run(program, args=[]):
|
||||||
else:
|
else:
|
||||||
raise Exception('unknown ritual', config.ritual)
|
raise Exception('unknown ritual', config.ritual)
|
||||||
|
|
||||||
|
#
|
||||||
|
|
||||||
|
return model, learner, ritual, (loss, mloss)
|
||||||
|
|
||||||
|
# main {{{1
|
||||||
|
|
||||||
|
def run(program, args=[]):
|
||||||
|
|
||||||
|
# Config
|
||||||
|
|
||||||
|
from dotmap import DotMap
|
||||||
|
config = DotMap(
|
||||||
|
fn_load = None,
|
||||||
|
fn_save = 'optim_nn.h5',
|
||||||
|
log_fn = 'losses.npz',
|
||||||
|
|
||||||
|
# multi-residual network parameters
|
||||||
|
res_width = 28,
|
||||||
|
res_depth = 2,
|
||||||
|
res_block = 3, # normally 2 for plain resnet
|
||||||
|
res_multi = 2, # normally 1 for plain resnet
|
||||||
|
|
||||||
|
# style of resnet (order of layers, which layers, etc.)
|
||||||
|
parallel_style = 'onelesssum',
|
||||||
|
activation = 'gelu',
|
||||||
|
|
||||||
|
optim = 'adam',
|
||||||
|
nesterov = False, # only used with SGD or Adam
|
||||||
|
momentum = 0.50, # only used with SGD
|
||||||
|
batch_size = 64,
|
||||||
|
|
||||||
|
# learning parameters
|
||||||
|
learner = 'sgdr',
|
||||||
|
learn = 1e-2,
|
||||||
|
learn_halve_every = 16, # unused with SGDR
|
||||||
|
learn_restart_advance = 16, # unused with SGDR
|
||||||
|
epochs = 24,
|
||||||
|
restarts = 2,
|
||||||
|
restart_decay = 0.25, # only used with SGDR
|
||||||
|
expando = lambda i: i + 1,
|
||||||
|
|
||||||
|
# misc
|
||||||
|
init = 'he_normal',
|
||||||
|
loss = 'mse',
|
||||||
|
mloss = 'mse',
|
||||||
|
ritual = 'default',
|
||||||
|
restart_optim = False, # restarts also reset internal state of optimizer
|
||||||
|
|
||||||
|
problem = 3,
|
||||||
|
# best results for ~10,000 parameters
|
||||||
|
# (keep these paired; update both at the same time!)
|
||||||
|
train_compare = 1.854613e-05,
|
||||||
|
valid_compare = 1.623881e-05,
|
||||||
|
|
||||||
|
unsafe = True, # aka gotta go fast mode
|
||||||
|
)
|
||||||
|
|
||||||
|
for k in ['parallel_style', 'activation', 'optim', 'learner', 'init', 'loss', 'mloss', 'ritual']:
|
||||||
|
config[k] = config[k].lower()
|
||||||
|
|
||||||
|
config.pprint()
|
||||||
|
|
||||||
|
# toy data
|
||||||
|
# (our model is probably complete overkill for this, so TODO: better data)
|
||||||
|
|
||||||
|
(inputs, outputs), (valid_inputs, valid_outputs) = \
|
||||||
|
toy_data(2**14, 2**11, problem=config.problem)
|
||||||
|
input_features = inputs.shape[-1]
|
||||||
|
output_features = outputs.shape[-1]
|
||||||
|
|
||||||
|
model, learner, ritual, (loss, mloss) = \
|
||||||
|
model_from_config(config, input_features, output_features)
|
||||||
|
|
||||||
|
# Model Information
|
||||||
|
|
||||||
|
if 0:
|
||||||
|
node_names = ' '.join([str(node) for node in model.ordered_nodes])
|
||||||
|
log('{} nodes'.format(len(model.ordered_nodes)), node_names)
|
||||||
|
else:
|
||||||
|
for node in model.ordered_nodes:
|
||||||
|
children = [str(n) for n in node.children]
|
||||||
|
if len(children) > 0:
|
||||||
|
sep = '->'
|
||||||
|
print(str(node)+sep+('\n'+str(node)+sep).join(children))
|
||||||
|
log('parameters', model.param_count)
|
||||||
|
|
||||||
# Training
|
# Training
|
||||||
|
|
||||||
batch_losses = []
|
batch_losses = []
|
||||||
|
@ -1188,6 +547,7 @@ def run(program, args=[]):
|
||||||
# TODO: write this portion again
|
# TODO: write this portion again
|
||||||
|
|
||||||
if config.log_fn is not None:
|
if config.log_fn is not None:
|
||||||
|
log('saving losses', config.log_fn)
|
||||||
np.savez_compressed(config.log_fn,
|
np.savez_compressed(config.log_fn,
|
||||||
batch_losses=nfa(batch_losses),
|
batch_losses=nfa(batch_losses),
|
||||||
train_losses=nfa(train_losses),
|
train_losses=nfa(train_losses),
|
||||||
|
|
672
optim_nn_core.py
Normal file
672
optim_nn_core.py
Normal file
|
@ -0,0 +1,672 @@
|
||||||
|
import numpy as np
|
||||||
|
# ugly shorthand:
|
||||||
|
nf = np.float32
|
||||||
|
nfa = lambda x: np.array(x, dtype=nf)
|
||||||
|
ni = np.int
|
||||||
|
nia = lambda x: np.array(x, dtype=ni)
|
||||||
|
|
||||||
|
# just for speed, not strictly essential:
|
||||||
|
from scipy.special import expit as sigmoid
|
||||||
|
|
||||||
|
# used for numbering layers like Keras:
|
||||||
|
from collections import defaultdict
|
||||||
|
_layer_counters = defaultdict(lambda: 0)
|
||||||
|
|
||||||
|
# Initializations {{{1
|
||||||
|
|
||||||
|
# note: these are currently only implemented for 2D shapes.
|
||||||
|
|
||||||
|
def init_he_normal(size, ins, outs):
|
||||||
|
s = np.sqrt(2 / ins)
|
||||||
|
return np.random.normal(0, s, size=size)
|
||||||
|
|
||||||
|
def init_he_uniform(size, ins, outs):
|
||||||
|
s = np.sqrt(6 / ins)
|
||||||
|
return np.random.uniform(-s, s, size=size)
|
||||||
|
|
||||||
|
# Loss functions {{{1
|
||||||
|
|
||||||
|
class Loss:
|
||||||
|
def mean(self, r):
|
||||||
|
return np.average(self.f(r))
|
||||||
|
|
||||||
|
def dmean(self, r):
|
||||||
|
d = self.df(r)
|
||||||
|
return d / len(d)
|
||||||
|
|
||||||
|
class Squared(Loss):
|
||||||
|
def f(self, r):
|
||||||
|
return np.square(r)
|
||||||
|
|
||||||
|
def df(self, r):
|
||||||
|
return 2 * r
|
||||||
|
|
||||||
|
class Absolute(Loss):
|
||||||
|
def f(self, r):
|
||||||
|
return np.abs(r)
|
||||||
|
|
||||||
|
def df(self, r):
|
||||||
|
return np.sign(r)
|
||||||
|
|
||||||
|
# Optimizers {{{1
|
||||||
|
|
||||||
|
class Optimizer:
|
||||||
|
def __init__(self, alpha=0.1):
|
||||||
|
self.alpha = nf(alpha)
|
||||||
|
self.reset()
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def compute(self, dW, W):
|
||||||
|
return -self.alpha * dW
|
||||||
|
|
||||||
|
def update(self, dW, W):
|
||||||
|
W += self.compute(dW, W)
|
||||||
|
|
||||||
|
# the following optimizers are blatantly lifted from tiny-dnn:
|
||||||
|
# https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/optimizers/optimizer.h
|
||||||
|
|
||||||
|
class Momentum(Optimizer):
|
||||||
|
def __init__(self, alpha=0.01, lamb=0, mu=0.9, nesterov=False):
|
||||||
|
self.alpha = np.asfarray(alpha) # learning rate
|
||||||
|
self.lamb = np.asfarray(lamb) # weight decay
|
||||||
|
self.mu = np.asfarray(mu) # momentum
|
||||||
|
self.nesterov = bool(nesterov)
|
||||||
|
|
||||||
|
self.reset()
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
self.dWprev = None
|
||||||
|
|
||||||
|
def compute(self, dW, W):
|
||||||
|
if self.dWprev is None:
|
||||||
|
#self.dWprev = np.zeros_like(dW)
|
||||||
|
self.dWprev = np.copy(dW)
|
||||||
|
|
||||||
|
V = self.mu * self.dWprev - self.alpha * (dW + W * self.lamb)
|
||||||
|
self.dWprev[:] = V
|
||||||
|
if self.nesterov: # TODO: is this correct? looks weird
|
||||||
|
return self.mu * V - self.alpha * (dW + W * self.lamb)
|
||||||
|
else:
|
||||||
|
return V
|
||||||
|
|
||||||
|
class Adam(Optimizer):
|
||||||
|
def __init__(self, alpha=0.001, b1=0.9, b2=0.999, b1_t=0.9, b2_t=0.999, eps=1e-8):
|
||||||
|
self.alpha = nf(alpha) # learning rate
|
||||||
|
self.b1 = nf(b1) # decay term
|
||||||
|
self.b2 = nf(b2) # decay term
|
||||||
|
self.b1_t_default = nf(b1_t) # decay term power t
|
||||||
|
self.b2_t_default = nf(b2_t) # decay term power t
|
||||||
|
self.eps = nf(eps)
|
||||||
|
|
||||||
|
self.reset()
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
self.mt = None
|
||||||
|
self.vt = None
|
||||||
|
self.b1_t = self.b1_t_default
|
||||||
|
self.b2_t = self.b2_t_default
|
||||||
|
|
||||||
|
def compute(self, dW, W):
|
||||||
|
if self.mt is None:
|
||||||
|
self.mt = np.zeros_like(W)
|
||||||
|
if self.vt is None:
|
||||||
|
self.vt = np.zeros_like(W)
|
||||||
|
|
||||||
|
# decay
|
||||||
|
self.b1_t *= self.b1
|
||||||
|
self.b2_t *= self.b2
|
||||||
|
|
||||||
|
self.mt[:] = self.b1 * self.mt + (1 - self.b1) * dW
|
||||||
|
self.vt[:] = self.b2 * self.vt + (1 - self.b2) * dW * dW
|
||||||
|
|
||||||
|
return -self.alpha * (self.mt / (1 - self.b1_t)) \
|
||||||
|
/ np.sqrt((self.vt / (1 - self.b2_t)) + self.eps)
|
||||||
|
|
||||||
|
# Abstract Layers {{{1
|
||||||
|
|
||||||
|
class Layer:
|
||||||
|
def __init__(self):
|
||||||
|
self.parents = []
|
||||||
|
self.children = []
|
||||||
|
self.input_shape = None
|
||||||
|
self.output_shape = None
|
||||||
|
kind = self.__class__.__name__
|
||||||
|
global _layer_counters
|
||||||
|
_layer_counters[kind] += 1
|
||||||
|
self.name = "{}_{}".format(kind, _layer_counters[kind])
|
||||||
|
self.size = None # total weight count (if any)
|
||||||
|
self.unsafe = False # disables assertions for better performance
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return self.name
|
||||||
|
|
||||||
|
# methods we might want to override:
|
||||||
|
|
||||||
|
def F(self, X):
|
||||||
|
raise NotImplementedError("unimplemented", self)
|
||||||
|
|
||||||
|
def dF(self, dY):
|
||||||
|
raise NotImplementedError("unimplemented", self)
|
||||||
|
|
||||||
|
def do_feed(self, child):
|
||||||
|
self.children.append(child)
|
||||||
|
|
||||||
|
def be_fed(self, parent):
|
||||||
|
self.parents.append(parent)
|
||||||
|
|
||||||
|
def make_shape(self, shape):
|
||||||
|
if not self.unsafe:
|
||||||
|
assert shape is not None
|
||||||
|
if self.output_shape is None:
|
||||||
|
self.output_shape = shape
|
||||||
|
return shape
|
||||||
|
|
||||||
|
# TODO: rename this multi and B crap to something actually relevant.
|
||||||
|
|
||||||
|
def multi(self, B):
|
||||||
|
if not self.unsafe:
|
||||||
|
assert len(B) == 1, self
|
||||||
|
return self.F(B[0])
|
||||||
|
|
||||||
|
def dmulti(self, dB):
|
||||||
|
if len(dB) == 1:
|
||||||
|
return self.dF(dB[0])
|
||||||
|
else:
|
||||||
|
dX = None
|
||||||
|
for dY in dB:
|
||||||
|
if dX is None:
|
||||||
|
dX = self.dF(dY)
|
||||||
|
else:
|
||||||
|
dX += self.dF(dY)
|
||||||
|
return dX
|
||||||
|
|
||||||
|
# general utility methods:
|
||||||
|
|
||||||
|
def compatible(self, parent):
|
||||||
|
if self.input_shape is None:
|
||||||
|
# inherit shape from output
|
||||||
|
shape = self.make_shape(parent.output_shape)
|
||||||
|
if shape is None:
|
||||||
|
return False
|
||||||
|
self.input_shape = shape
|
||||||
|
if np.all(self.input_shape == parent.output_shape):
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
return False
|
||||||
|
|
||||||
|
def feed(self, child):
|
||||||
|
if not child.compatible(self):
|
||||||
|
fmt = "{} is incompatible with {}: shape mismatch: {} vs. {}"
|
||||||
|
raise Exception(fmt.format(self, child, self.output_shape, child.input_shape))
|
||||||
|
self.do_feed(child)
|
||||||
|
child.be_fed(self)
|
||||||
|
return child
|
||||||
|
|
||||||
|
def validate_input(self, X):
|
||||||
|
assert X.shape[1:] == self.input_shape, (str(self), X.shape[1:], self.input_shape)
|
||||||
|
|
||||||
|
def validate_output(self, Y):
|
||||||
|
assert Y.shape[1:] == self.output_shape, (str(self), Y.shape[1:], self.output_shape)
|
||||||
|
|
||||||
|
def forward(self, lut):
|
||||||
|
if not self.unsafe:
|
||||||
|
assert len(self.parents) > 0, self
|
||||||
|
B = []
|
||||||
|
for parent in self.parents:
|
||||||
|
# TODO: skip over irrelevant nodes (if any)
|
||||||
|
X = lut[parent]
|
||||||
|
if not self.unsafe:
|
||||||
|
self.validate_input(X)
|
||||||
|
B.append(X)
|
||||||
|
Y = self.multi(B)
|
||||||
|
if not self.unsafe:
|
||||||
|
self.validate_output(Y)
|
||||||
|
return Y
|
||||||
|
|
||||||
|
def backward(self, lut):
|
||||||
|
if not self.unsafe:
|
||||||
|
assert len(self.children) > 0, self
|
||||||
|
dB = []
|
||||||
|
for child in self.children:
|
||||||
|
# TODO: skip over irrelevant nodes (if any)
|
||||||
|
dY = lut[child]
|
||||||
|
if not self.unsafe:
|
||||||
|
self.validate_output(dY)
|
||||||
|
dB.append(dY)
|
||||||
|
dX = self.dmulti(dB)
|
||||||
|
if not self.unsafe:
|
||||||
|
self.validate_input(dX)
|
||||||
|
return dX
|
||||||
|
|
||||||
|
# Nonparametric Layers {{{1
|
||||||
|
|
||||||
|
class Sum(Layer):
|
||||||
|
def multi(self, B):
|
||||||
|
return np.sum(B, axis=0)
|
||||||
|
|
||||||
|
def dmulti(self, dB):
|
||||||
|
#assert len(dB) == 1, "unimplemented"
|
||||||
|
return dB[0] # TODO: does this always work?
|
||||||
|
|
||||||
|
class Input(Layer):
|
||||||
|
def __init__(self, shape):
|
||||||
|
assert shape is not None
|
||||||
|
super().__init__()
|
||||||
|
self.shape = tuple(shape)
|
||||||
|
self.input_shape = self.shape
|
||||||
|
self.output_shape = self.shape
|
||||||
|
|
||||||
|
def F(self, X):
|
||||||
|
return X
|
||||||
|
|
||||||
|
def dF(self, dY):
|
||||||
|
#self.dY = dY
|
||||||
|
return np.zeros_like(dY)
|
||||||
|
|
||||||
|
class Affine(Layer):
|
||||||
|
def __init__(self, a=1, b=0):
|
||||||
|
super().__init__()
|
||||||
|
self.a = nf(a)
|
||||||
|
self.b = nf(b)
|
||||||
|
|
||||||
|
def F(self, X):
|
||||||
|
return self.a * X + self.b
|
||||||
|
|
||||||
|
def dF(self, dY):
|
||||||
|
return dY * self.a
|
||||||
|
|
||||||
|
class Sigmoid(Layer): # aka Logistic
|
||||||
|
def F(self, X):
|
||||||
|
self.sig = sigmoid(X)
|
||||||
|
return X * self.sig
|
||||||
|
|
||||||
|
def dF(self, dY):
|
||||||
|
return dY * self.sig * (1 - self.sig)
|
||||||
|
|
||||||
|
class Tanh(Layer):
|
||||||
|
def F(self, X):
|
||||||
|
self.sig = np.tanh(X)
|
||||||
|
return X * self.sig
|
||||||
|
|
||||||
|
def dF(self, dY):
|
||||||
|
return dY * (1 - self.sig * self.sig)
|
||||||
|
|
||||||
|
class Relu(Layer):
|
||||||
|
def F(self, X):
|
||||||
|
self.cond = X >= 0
|
||||||
|
return np.where(self.cond, X, 0)
|
||||||
|
|
||||||
|
def dF(self, dY):
|
||||||
|
return np.where(self.cond, dY, 0)
|
||||||
|
|
||||||
|
class Elu(Layer):
|
||||||
|
# paper: https://arxiv.org/abs/1511.07289
|
||||||
|
|
||||||
|
def __init__(self, alpha=1):
|
||||||
|
super().__init__()
|
||||||
|
self.alpha = nf(alpha)
|
||||||
|
|
||||||
|
def F(self, X):
|
||||||
|
self.cond = X >= 0
|
||||||
|
self.neg = np.exp(X) - 1
|
||||||
|
return np.where(self.cond, X, self.neg)
|
||||||
|
|
||||||
|
def dF(self, dY):
|
||||||
|
return dY * np.where(self.cond, 1, self.neg + 1)
|
||||||
|
|
||||||
|
class GeluApprox(Layer):
|
||||||
|
# paper: https://arxiv.org/abs/1606.08415
|
||||||
|
# plot: https://www.desmos.com/calculator/ydzgtccsld
|
||||||
|
|
||||||
|
def F(self, X):
|
||||||
|
self.a = 1.704 * X
|
||||||
|
self.sig = sigmoid(self.a)
|
||||||
|
return X * self.sig
|
||||||
|
|
||||||
|
def dF(self, dY):
|
||||||
|
return dY * self.sig * (1 + self.a * (1 - self.sig))
|
||||||
|
|
||||||
|
# Parametric Layers {{{1
|
||||||
|
|
||||||
|
class Dense(Layer):
|
||||||
|
def __init__(self, dim, init=init_he_uniform):
|
||||||
|
super().__init__()
|
||||||
|
self.dim = ni(dim)
|
||||||
|
self.output_shape = (dim,)
|
||||||
|
self.weight_init = init
|
||||||
|
self.size = None
|
||||||
|
|
||||||
|
def make_shape(self, shape):
|
||||||
|
super().make_shape(shape)
|
||||||
|
if len(shape) != 1:
|
||||||
|
return False
|
||||||
|
self.nW = self.dim * shape[0]
|
||||||
|
self.nb = self.dim
|
||||||
|
self.size = self.nW + self.nb
|
||||||
|
return shape
|
||||||
|
|
||||||
|
def init(self, W, dW):
|
||||||
|
ins, outs = self.input_shape[0], self.output_shape[0]
|
||||||
|
|
||||||
|
self.W = W
|
||||||
|
self.dW = dW
|
||||||
|
self.coeffs = self.W[:self.nW].reshape(ins, outs)
|
||||||
|
self.biases = self.W[self.nW:].reshape(1, outs)
|
||||||
|
self.dcoeffs = self.dW[:self.nW].reshape(ins, outs)
|
||||||
|
self.dbiases = self.dW[self.nW:].reshape(1, outs)
|
||||||
|
|
||||||
|
self.coeffs.flat = self.weight_init(self.nW, ins, outs)
|
||||||
|
self.biases.flat = 0
|
||||||
|
|
||||||
|
self.std = np.std(self.W)
|
||||||
|
|
||||||
|
def F(self, X):
|
||||||
|
self.X = X
|
||||||
|
Y = X.dot(self.coeffs) + self.biases
|
||||||
|
return Y
|
||||||
|
|
||||||
|
def dF(self, dY):
|
||||||
|
dX = dY.dot(self.coeffs.T)
|
||||||
|
self.dcoeffs[:] = self.X.T.dot(dY)
|
||||||
|
self.dbiases[:] = dY.sum(0, keepdims=True)
|
||||||
|
return dX
|
||||||
|
|
||||||
|
class DenseOneLess(Dense):
|
||||||
|
def init(self, W, dW):
|
||||||
|
super().init(W, dW)
|
||||||
|
ins, outs = self.input_shape[0], self.output_shape[0]
|
||||||
|
assert ins == outs, (ins, outs)
|
||||||
|
|
||||||
|
def F(self, X):
|
||||||
|
np.fill_diagonal(self.coeffs, 0)
|
||||||
|
self.X = X
|
||||||
|
Y = X.dot(self.coeffs) + self.biases
|
||||||
|
return Y
|
||||||
|
|
||||||
|
def dF(self, dY):
|
||||||
|
dX = dY.dot(self.coeffs.T)
|
||||||
|
self.dcoeffs[:] = self.X.T.dot(dY)
|
||||||
|
self.dbiases[:] = dY.sum(0, keepdims=True)
|
||||||
|
np.fill_diagonal(self.dcoeffs, 0)
|
||||||
|
return dX
|
||||||
|
|
||||||
|
# Models {{{1
|
||||||
|
|
||||||
|
class Model:
|
||||||
|
def __init__(self, x, y, unsafe=False):
|
||||||
|
assert isinstance(x, Layer), x
|
||||||
|
assert isinstance(y, Layer), y
|
||||||
|
self.x = x
|
||||||
|
self.y = y
|
||||||
|
self.ordered_nodes = self.traverse([], self.y)
|
||||||
|
self.make_weights()
|
||||||
|
for node in self.ordered_nodes:
|
||||||
|
node.unsafe = unsafe
|
||||||
|
|
||||||
|
def make_weights(self):
|
||||||
|
self.param_count = 0
|
||||||
|
for node in self.ordered_nodes:
|
||||||
|
if node.size is not None:
|
||||||
|
self.param_count += node.size
|
||||||
|
self.W = np.zeros(self.param_count, dtype=nf)
|
||||||
|
self.dW = np.zeros(self.param_count, dtype=nf)
|
||||||
|
|
||||||
|
offset = 0
|
||||||
|
for node in self.ordered_nodes:
|
||||||
|
if node.size is not None:
|
||||||
|
end = offset + node.size
|
||||||
|
node.init(self.W[offset:end], self.dW[offset:end])
|
||||||
|
offset += node.size
|
||||||
|
|
||||||
|
def traverse(self, nodes, node):
|
||||||
|
if node == self.x:
|
||||||
|
return [node]
|
||||||
|
for parent in node.parents:
|
||||||
|
if parent not in nodes:
|
||||||
|
new_nodes = self.traverse(nodes, parent)
|
||||||
|
for new_node in new_nodes:
|
||||||
|
if new_node not in nodes:
|
||||||
|
nodes.append(new_node)
|
||||||
|
if nodes:
|
||||||
|
nodes.append(node)
|
||||||
|
return nodes
|
||||||
|
|
||||||
|
def forward(self, X):
|
||||||
|
lut = dict()
|
||||||
|
input_node = self.ordered_nodes[0]
|
||||||
|
output_node = self.ordered_nodes[-1]
|
||||||
|
lut[input_node] = input_node.multi(np.expand_dims(X, 0))
|
||||||
|
for node in self.ordered_nodes[1:]:
|
||||||
|
lut[node] = node.forward(lut)
|
||||||
|
return lut[output_node]
|
||||||
|
|
||||||
|
def backward(self, error):
|
||||||
|
lut = dict()
|
||||||
|
input_node = self.ordered_nodes[0]
|
||||||
|
output_node = self.ordered_nodes[-1]
|
||||||
|
lut[output_node] = output_node.dmulti(np.expand_dims(error, 0))
|
||||||
|
for node in reversed(self.ordered_nodes[:-1]):
|
||||||
|
lut[node] = node.backward(lut)
|
||||||
|
#return lut[input_node] # meaningless value
|
||||||
|
return self.dW
|
||||||
|
|
||||||
|
def load_weights(self, fn):
|
||||||
|
# seemingly compatible with keras' Dense layers.
|
||||||
|
# ignores any non-Dense layer types.
|
||||||
|
# TODO: assert file actually exists
|
||||||
|
import h5py
|
||||||
|
f = h5py.File(fn)
|
||||||
|
weights = {}
|
||||||
|
def visitor(name, obj):
|
||||||
|
if isinstance(obj, h5py.Dataset):
|
||||||
|
weights[name.split('/')[-1]] = nfa(obj[:])
|
||||||
|
f.visititems(visitor)
|
||||||
|
f.close()
|
||||||
|
|
||||||
|
denses = [node for node in self.ordered_nodes if isinstance(node, Dense)]
|
||||||
|
for i in range(len(denses)):
|
||||||
|
a, b = i, i + 1
|
||||||
|
b_name = "dense_{}".format(b)
|
||||||
|
# TODO: write a Dense method instead of assigning directly
|
||||||
|
denses[a].coeffs[:] = weights[b_name+'_W']
|
||||||
|
denses[a].biases[:] = np.expand_dims(weights[b_name+'_b'], 0)
|
||||||
|
|
||||||
|
def save_weights(self, fn, overwrite=False):
|
||||||
|
import h5py
|
||||||
|
f = h5py.File(fn, 'w')
|
||||||
|
|
||||||
|
denses = [node for node in self.ordered_nodes if isinstance(node, Dense)]
|
||||||
|
for i in range(len(denses)):
|
||||||
|
a, b = i, i + 1
|
||||||
|
b_name = "dense_{}".format(b)
|
||||||
|
# TODO: write a Dense method instead of assigning directly
|
||||||
|
grp = f.create_group(b_name)
|
||||||
|
data = grp.create_dataset(b_name+'_W', denses[a].coeffs.shape, dtype=nf)
|
||||||
|
data[:] = denses[a].coeffs
|
||||||
|
data = grp.create_dataset(b_name+'_b', denses[a].biases.shape, dtype=nf)
|
||||||
|
data[:] = denses[a].biases
|
||||||
|
|
||||||
|
f.close()
|
||||||
|
|
||||||
|
# Rituals {{{1
|
||||||
|
|
||||||
|
class Ritual: # i'm just making up names at this point
|
||||||
|
def __init__(self, learner=None, loss=None, mloss=None):
|
||||||
|
self.learner = learner if learner is not None else Learner(Optimizer())
|
||||||
|
self.loss = loss if loss is not None else Squared()
|
||||||
|
self.mloss = mloss if mloss is not None else loss
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
self.learner.reset(optim=True)
|
||||||
|
|
||||||
|
def measure(self, residual):
|
||||||
|
return self.mloss.mean(residual)
|
||||||
|
|
||||||
|
def derive(self, residual):
|
||||||
|
return self.loss.dmean(residual)
|
||||||
|
|
||||||
|
def learn(self, inputs, outputs):
|
||||||
|
predicted = self.model.forward(inputs)
|
||||||
|
residual = predicted - outputs
|
||||||
|
self.model.backward(self.derive(residual))
|
||||||
|
return residual
|
||||||
|
|
||||||
|
def update(self):
|
||||||
|
self.learner.optim.update(self.model.dW, self.model.W)
|
||||||
|
|
||||||
|
def prepare(self, model):
|
||||||
|
self.en = 0
|
||||||
|
self.bn = 0
|
||||||
|
self.model = model
|
||||||
|
|
||||||
|
def train_batched(self, inputs, outputs, batch_size, return_losses=False):
|
||||||
|
self.en += 1
|
||||||
|
cumsum_loss = 0
|
||||||
|
batch_count = inputs.shape[0] // batch_size
|
||||||
|
losses = []
|
||||||
|
for b in range(batch_count):
|
||||||
|
self.bn += 1
|
||||||
|
bi = b * batch_size
|
||||||
|
batch_inputs = inputs[ bi:bi+batch_size]
|
||||||
|
batch_outputs = outputs[bi:bi+batch_size]
|
||||||
|
|
||||||
|
if self.learner.per_batch:
|
||||||
|
self.learner.batch(b / batch_count)
|
||||||
|
|
||||||
|
residual = self.learn(batch_inputs, batch_outputs)
|
||||||
|
self.update()
|
||||||
|
|
||||||
|
batch_loss = self.measure(residual)
|
||||||
|
if np.isnan(batch_loss):
|
||||||
|
raise Exception("nan")
|
||||||
|
cumsum_loss += batch_loss
|
||||||
|
if return_losses:
|
||||||
|
losses.append(batch_loss)
|
||||||
|
avg_loss = cumsum_loss / batch_count
|
||||||
|
if return_losses:
|
||||||
|
return avg_loss, losses
|
||||||
|
else:
|
||||||
|
return avg_loss
|
||||||
|
|
||||||
|
# Learners {{{1
|
||||||
|
|
||||||
|
class Learner:
|
||||||
|
per_batch = False
|
||||||
|
|
||||||
|
def __init__(self, optim, epochs=100, rate=None):
|
||||||
|
assert isinstance(optim, Optimizer)
|
||||||
|
self.optim = optim
|
||||||
|
self.start_rate = optim.alpha if rate is None else float(rate)
|
||||||
|
self.epochs = int(epochs)
|
||||||
|
self.reset()
|
||||||
|
|
||||||
|
def reset(self, optim=False):
|
||||||
|
self.started = False
|
||||||
|
self.epoch = 0
|
||||||
|
if optim:
|
||||||
|
self.optim.reset()
|
||||||
|
|
||||||
|
@property
|
||||||
|
def epoch(self):
|
||||||
|
return self._epoch
|
||||||
|
|
||||||
|
@epoch.setter
|
||||||
|
def epoch(self, new_epoch):
|
||||||
|
self._epoch = int(new_epoch)
|
||||||
|
self.rate = self.rate_at(self._epoch)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def rate(self):
|
||||||
|
return self.optim.alpha
|
||||||
|
|
||||||
|
@rate.setter
|
||||||
|
def rate(self, new_rate):
|
||||||
|
self.optim.alpha = new_rate
|
||||||
|
|
||||||
|
def rate_at(self, epoch):
|
||||||
|
return self.start_rate
|
||||||
|
|
||||||
|
def next(self):
|
||||||
|
# prepares the next epoch. returns whether or not to continue training.
|
||||||
|
if self.epoch + 1 >= self.epochs:
|
||||||
|
return False
|
||||||
|
if self.started:
|
||||||
|
self.epoch += 1
|
||||||
|
else:
|
||||||
|
self.started = True
|
||||||
|
self.epoch = self.epoch # poke property setter just in case
|
||||||
|
return True
|
||||||
|
|
||||||
|
def batch(self, progress): # TODO: rename
|
||||||
|
# interpolates rates between epochs.
|
||||||
|
# unlike epochs, we do not store batch number as a state.
|
||||||
|
# i.e. calling next() will not respect progress.
|
||||||
|
assert 0 <= progress <= 1
|
||||||
|
self.rate = self.rate_at(self._epoch + progress)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def final_rate(self):
|
||||||
|
return self.rate_at(self.epochs - 1)
|
||||||
|
|
||||||
|
class AnnealingLearner(Learner):
|
||||||
|
def __init__(self, optim, epochs=100, rate=None, halve_every=10):
|
||||||
|
self.halve_every = float(halve_every)
|
||||||
|
self.anneal = 0.5**(1/self.halve_every)
|
||||||
|
super().__init__(optim, epochs, rate)
|
||||||
|
|
||||||
|
def rate_at(self, epoch):
|
||||||
|
return self.start_rate * self.anneal**epoch
|
||||||
|
|
||||||
|
def cosmod(x):
|
||||||
|
# plot: https://www.desmos.com/calculator/hlgqmyswy2
|
||||||
|
return (1 + np.cos((x % 1) * np.pi)) / 2
|
||||||
|
|
||||||
|
class SGDR(Learner):
|
||||||
|
# Stochastic Gradient Descent with Restarts
|
||||||
|
# paper: https://arxiv.org/abs/1608.03983
|
||||||
|
# NOTE: this is missing a couple features.
|
||||||
|
|
||||||
|
per_batch = True
|
||||||
|
|
||||||
|
def __init__(self, optim, epochs=100, rate=None,
|
||||||
|
restarts=0, restart_decay=0.5, callback=None,
|
||||||
|
expando=None):
|
||||||
|
self.restart_epochs = int(epochs)
|
||||||
|
self.decay = float(restart_decay)
|
||||||
|
self.restarts = int(restarts)
|
||||||
|
self.restart_callback = callback
|
||||||
|
# TODO: rename expando to something not insane
|
||||||
|
self.expando = expando if expando is not None else lambda i: 1
|
||||||
|
|
||||||
|
self.splits = []
|
||||||
|
epochs = 0
|
||||||
|
for i in range(0, self.restarts + 1):
|
||||||
|
split = epochs + int(self.restart_epochs * self.expando(i))
|
||||||
|
self.splits.append(split)
|
||||||
|
epochs = split
|
||||||
|
super().__init__(optim, epochs, rate)
|
||||||
|
|
||||||
|
def split_num(self, epoch):
|
||||||
|
shit = [0] + self.splits # hack
|
||||||
|
for i in range(0, len(self.splits)):
|
||||||
|
if epoch < self.splits[i]:
|
||||||
|
sub_epoch = epoch - shit[i]
|
||||||
|
next_restart = self.splits[i] - shit[i]
|
||||||
|
return i, sub_epoch, next_restart
|
||||||
|
raise Exception('this should never happen.')
|
||||||
|
|
||||||
|
def rate_at(self, epoch):
|
||||||
|
restart, sub_epoch, next_restart = self.split_num(epoch)
|
||||||
|
x = sub_epoch / next_restart
|
||||||
|
return self.start_rate * self.decay**restart * cosmod(x)
|
||||||
|
|
||||||
|
def next(self):
|
||||||
|
if not super().next():
|
||||||
|
return False
|
||||||
|
restart, sub_epoch, next_restart = self.split_num(self.epoch)
|
||||||
|
if restart > 0 and sub_epoch == 0:
|
||||||
|
if self.restart_callback is not None:
|
||||||
|
self.restart_callback(restart)
|
||||||
|
return True
|
Loading…
Reference in New Issue
Block a user