parent a8952bebf5
commit 166644023e
2 changed files with 97 additions and 16 deletions

optim_nn.py  (65 changes)

@@ -132,6 +132,38 @@ class DenseOneLess(Dense):
         np.fill_diagonal(self.dcoeffs, 0)
         return dX
 
+class CosineDense(Dense):
+    # paper: https://arxiv.org/abs/1702.05870
+    # another implementation: https://github.com/farizrahman4u/keras-contrib/pull/36
+    # the paper doesn't mention bias,
+    # so we treat bias as an additional weight with a constant input of 1.
+    # this is correct in Dense layers, so i hope it's correct here too.
+
+    eps = 1e-4
+
+    def F(self, X):
+        self.X = X
+        self.X_norm = np.sqrt(np.square(X).sum(-1, keepdims=True) \
+                              + 1 + self.eps)
+        self.W_norm = np.sqrt(np.square(self.coeffs).sum(0, keepdims=True) \
+                              + np.square(self.biases) + self.eps)
+        self.dot = X.dot(self.coeffs) + self.biases
+        Y = self.dot / (self.X_norm * self.W_norm)
+        return Y
+
+    def dF(self, dY):
+        ddot = dY / self.X_norm / self.W_norm
+        dX_norm = -(dY * self.dot / self.W_norm).sum(-1, keepdims=True) / self.X_norm**2
+        dW_norm = -(dY * self.dot / self.X_norm).sum( 0, keepdims=True) / self.W_norm**2
+
+        self.dcoeffs[:] = self.X.T.dot(ddot) \
+                          + dW_norm / self.W_norm * self.coeffs
+        self.dbiases[:] = ddot.sum(0, keepdims=True) \
+                          + dW_norm / self.W_norm * self.biases
+        dX = ddot.dot(self.coeffs.T) + dX_norm / self.X_norm * self.X
+
+        return dX
+
 # Rituals {{{1
 
 def stochastic_multiply(W, gamma=0.5, allow_negation=True):
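
Note on the new CosineDense layer: with the bias treated as an extra weight seen against a constant input of 1, F computes the cosine similarity between each augmented input row and each augmented weight column, which is what the linked paper describes. The + 1 inside X_norm and the np.square(self.biases) inside W_norm are exactly the squared norms of those augmentations. A standalone numpy sketch (not part of the commit; eps dropped for clarity) that checks this reading:

    import numpy as np

    np.random.seed(42)
    X = np.random.randn(5, 3)   # batch of 5 inputs, 3 features
    W = np.random.randn(3, 4)   # 3 inputs -> 4 outputs
    b = np.random.randn(4)      # one bias per output

    # forward pass as written in CosineDense.F, with eps omitted
    X_norm = np.sqrt(np.square(X).sum(-1, keepdims=True) + 1)
    W_norm = np.sqrt(np.square(W).sum(0, keepdims=True) + np.square(b))
    Y = (X.dot(W) + b) / (X_norm * W_norm)

    # the same thing via explicit augmentation: a constant 1 appended to X,
    # and the bias appended as an extra row of W
    Xa = np.hstack([X, np.ones((5, 1))])
    Wa = np.vstack([W, b[None, :]])
    cos = Xa.dot(Wa) / (np.linalg.norm(Xa, axis=1, keepdims=True)
                        * np.linalg.norm(Wa, axis=0, keepdims=True))
    assert np.allclose(Y, cos)
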
@@ -300,7 +332,6 @@ def _mr_onelesssum(y, width, depth, block, multi, activation, style, FC, d):
         y = merger
     else:
         y = z
-    #y = y.feed(LayerNorm())
     return y
 
 _mr_styles = dict(
@@ -312,6 +343,11 @@ _mr_styles = dict(
 def multiresnet(x, width, depth, block=2, multi=1,
                 activation=Relu, style='batchless',
                 init=init_he_normal):
+    if style == 'cossim':
+        style = 'batchless'
+        DenseClass = CosineDense
+    else:
+        DenseClass = Dense
     if style not in _mr_styles:
         raise Exception('unknown resnet style', style)
 
@@ -320,7 +356,7 @@ def multiresnet(x, width, depth, block=2, multi=1,
 
     for d in range(depth):
         size = width
-        FC = lambda: Dense(size, init)
+        FC = lambda: DenseClass(size, init)
 
         if last_size != size:
             y = y.feed(FC())
@@ -433,8 +469,10 @@ def optim_from_config(config):
         mu = np.exp(-1/d2)
         optim = RMSprop(mu=mu)
     elif config.optim == 'sgd':
-        if config.momentum != 0:
-            optim = Momentum(mu=config.momentum, nesterov=config.nesterov)
+        d1 = config.optim_decay1 if 'optim_decay1' in config else 0
+        if d1 > 0:
+            b1 = np.exp(-1/d1)
+            optim = Momentum(mu=b1, nesterov=config.nesterov)
         else:
             optim = Optimizer()
     else:
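
The pattern above (and in the RMSprop branch) turns a decay horizon d, given in epochs, into a decay coefficient exp(-1/d): after d applications of that coefficient the old contribution has shrunk by a factor of e, so d acts as a time constant. A quick standalone check (not part of the commit) using the optim_decay1/optim_decay2 defaults that appear later in run():

    import numpy as np

    d1, d2 = 2, 100        # optim_decay1, optim_decay2 defaults
    b1 = np.exp(-1/d1)     # ~0.6065
    mu = np.exp(-1/d2)     # ~0.9900
    print(b1**d1, mu**d2)  # both ~0.3679, i.e. 1/e
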
@@ -453,6 +491,7 @@ def learner_from_config(config, optim, rscb):
     elif config.learner == 'anneal':
         learner = AnnealingLearner(optim, epochs=config.epochs, rate=config.learn,
                                    halve_every=config.learn_halve_every)
+        log("final learning rate", "{:10.8f}".format(learner.final_rate))
     elif config.learner == 'dumb':
         learner = DumbLearner(optim, epochs=config.epochs, rate=config.learn,
                               halve_every=config.learn_halve_every,
@@ -462,7 +501,6 @@ def learner_from_config(config, optim, rscb):
         log("final learning rate", "{:10.8f}".format(learner.final_rate))
     elif config.learner == 'sgd':
         learner = Learner(optim, epochs=config.epochs, rate=config.learn)
-        log("final learning rate", "{:10.8f}".format(learner.final_rate))
     else:
         raise Exception('unknown learner', config.learner)
 
@@ -559,11 +597,10 @@ def run(program, args=None):
         parallel_style = 'onelesssum',
         activation = 'gelu',
 
-        optim = 'adam',
-        optim_decay1 = 2, # given in epochs (optional)
-        optim_decay2 = 100, # given in epochs (optional)
-        momentum = 0.90, # only used with SGD
-        nesterov = True, # only used with SGD or Adam
+        optim = 'adam', # note: most features only implemented for Adam
+        optim_decay1 = 2, # first momentum given in epochs (optional)
+        optim_decay2 = 100, # second momentum given in epochs (optional)
+        nesterov = True,
         batch_size = 64,
 
         # learning parameters
@@ -571,7 +608,7 @@ def run(program, args=None):
         learn = 1e-2,
         epochs = 24,
         learn_halve_every = 16, # only used with anneal/dumb
-        restarts = 2,
+        restarts = 8,
         restart_decay = 0.25, # only used with SGDR
         expando = lambda i: 24 * i,
 
@@ -585,9 +622,9 @@ def run(program, args=None):
 
         # logging/output
         log10_loss = True, # personally, i'm sick of looking linear loss values!
-        #fancy_logs = True, # unimplemented
+        #fancy_logs = True, # unimplemented (can't turn it off yet)
 
-        problem = 3,
+        problem = 2,
         compare = (
             # best results for ~10,000 parameters
             # training/validation pairs for each problem (starting from problem 0):
@@ -595,7 +632,7 @@ def run(program, args=None):
             (7.577717e-04, 1.255284e-03),
             # 1080 epochs on these...
             (1.790511e-07, 2.785208e-07),
-            (2.233277e-08, 3.580281e-08),
+            ( 10**-7.774, 10**-7.626),
            (5.266719e-07, 5.832677e-06), # overfitting? bad valid set?
         ),
 
second changed file:

@@ -33,6 +33,14 @@ def init_he_uniform(size, ins, outs):
     s = np.sqrt(6 / ins)
     return np.random.uniform(-s, s, size=size)
 
+def init_glorot_normal(size, ins, outs):
+    s = np.sqrt(2 / (ins + outs))
+    return np.random.normal(0, s, size=size)
+
+def init_glorot_uniform(size, ins, outs):
+    s = np.sqrt(6 / (ins + outs))
+    return np.random.uniform(-s, s, size=size)
+
 # Loss functions {{{1
 
 class Loss:
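
For reference, these are the Glorot (Xavier) initializations from Glorot & Bengio (2010). Both variants target the same weight variance, 2 / (ins + outs): the normal one directly, and the uniform one through var(U(-s, s)) = s**2 / 3 with s = sqrt(6 / (ins + outs)). A standalone check (not part of the commit):

    import numpy as np

    ins, outs = 300, 100
    target = 2 / (ins + outs)     # 0.005

    w_normal = np.random.normal(0, np.sqrt(2 / (ins + outs)), size=1000000)
    s = np.sqrt(6 / (ins + outs))
    w_uniform = np.random.uniform(-s, s, size=1000000)

    print(target, w_normal.var(), w_uniform.var())   # all roughly 0.005
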
@@ -162,8 +170,6 @@ class Adam(Optimizer):
     # * Adam == RMSprop when
     # Adam.b1 == 0
     # Adam.b2 == RMSprop.mu
-    # Adam.b1_t == 0
-    # Adam.b2_t == 0
 
     def __init__(self, alpha=0.002, b1=0.9, b2=0.999, eps=1e-8):
         self.b1 = _f(b1) # decay term
@@ -373,6 +379,20 @@ class Input(Layer):
         #self.dY = dY
         return np.zeros_like(dY)
 
+class Reshape(Layer):
+    def __init__(self, new_shape):
+        super().__init__()
+        self.shape = tuple(new_shape)
+        self.output_shape = self.shape
+
+    def F(self, X):
+        self.batch_size = X.shape[0]
+        return X.reshape(self.batch_size, *self.output_shape)
+
+    def dF(self, dY):
+        assert dY.shape[0] == self.batch_size
+        return dY.reshape(self.batch_size, *self.input_shape)
+
 class Affine(Layer):
     def __init__(self, a=1, b=0):
         super().__init__()
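
A minimal round-trip sketch for the new Reshape layer (not part of the commit). It assumes Layer() needs no constructor arguments here and that the framework normally fills in input_shape when the layer is wired into a model, so that attribute is set by hand:

    import numpy as np

    r = Reshape((28, 28))
    r.input_shape = (784,)         # normally set when the layer is connected

    x = np.random.randn(64, 784)
    y = r.F(x)                     # shape (64, 28, 28)
    dx = r.dF(np.ones_like(y))     # gradient flows back as shape (64, 784)
    assert y.shape == (64, 28, 28) and dx.shape == (64, 784)
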
@@ -666,6 +686,30 @@ class Ritual: # i'm just making up names at this point
             return avg_loss, losses
         return avg_loss
 
+    def test_batched(self, inputs, outputs, batch_size, return_losses=False):
+        cumsum_loss = _0
+        batch_count = inputs.shape[0] // batch_size
+        losses = []
+        assert inputs.shape[0] % batch_size == 0, \
+            "inputs is not evenly divisible by batch_size" # TODO: lift this restriction
+        for b in range(batch_count):
+            bi = b * batch_size
+            batch_inputs  = inputs[ bi:bi+batch_size]
+            batch_outputs = outputs[bi:bi+batch_size]
+
+            predicted = self.model.forward(batch_inputs)
+
+            batch_loss = self.measure(predicted, batch_outputs)
+            if np.isnan(batch_loss):
+                raise Exception("nan")
+            cumsum_loss += batch_loss
+            if return_losses:
+                losses.append(batch_loss)
+        avg_loss = cumsum_loss / _f(batch_count)
+        if return_losses:
+            return avg_loss, losses
+        return avg_loss
+
 # Learners {{{1
 
 class Learner:
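
Hypothetical usage of test_batched (the ritual, valid_inputs, and valid_outputs names are assumed from context, not part of the commit). Note the divisibility restriction in the assert: 12800 % 64 == 0 is fine, but 60000 % 64 == 32 would raise.

    avg_loss, losses = ritual.test_batched(valid_inputs, valid_outputs,
                                            batch_size=64, return_losses=True)
    log("validation loss", "{:12.6e}".format(avg_loss))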