commit 4ac04baa1f (parent 389bde3cdb)
2 changed files with 114 additions and 119 deletions

optim_nn.py (139 changed lines)
@@ -10,7 +10,9 @@ from optim_nn_core import *
 from optim_nn_core import _check, _f
 
 import sys
-lament = lambda *args, **kwargs: print(*args, file=sys.stderr, **kwargs)
+
+def lament(*args, **kwargs):
+    print(*args, file=sys.stderr, **kwargs)
 
 def log(left, right):
     lament("{:>20}: {}".format(left, right))
@@ -160,7 +162,8 @@ class NoisyRitual(Ritual):
 
 class DumbLearner(AnnealingLearner):
     # this is my own awful contraption. it's not really "SGD with restarts".
-    def __init__(self, optim, epochs=100, rate=None, halve_every=10, restarts=0, restart_advance=20, callback=None):
+    def __init__(self, optim, epochs=100, rate=None, halve_every=10,
+                 restarts=0, restart_advance=20, callback=None):
         self.restart_epochs = int(epochs)
         self.restarts = int(restarts)
         self.restart_advance = float(restart_advance)
@@ -183,35 +186,22 @@ class DumbLearner(AnnealingLearner):
         self.restart_callback(restart)
         return True
 
-def multiresnet(x, width, depth, block=2, multi=1,
-                activation=Relu, style='batchless',
-                init=init_he_normal):
-    y = x
-    last_size = x.output_shape[0]
-
-    FC = lambda size: Dense(size, init)
-    #FC = lambda size: DenseOneLess(size, init)
-
-    for d in range(depth):
-        size = width
-
-        if last_size != size:
-            y = y.feed(Dense(size, init))
-
-        if style == 'batchless':
-            skip = y
-            merger = Sum()
-            skip.feed(merger)
-            z_start = skip.feed(activation())
-            for i in range(multi):
-                z = z_start
-                for i in range(block):
-                    if i > 0:
-                        z = z.feed(activation())
-                    z = z.feed(FC(size))
-                z.feed(merger)
-            y = merger
-        elif style == 'onelesssum':
-            # this is my own awful contraption.
-            is_last = d + 1 == depth
-            needs_sum = not is_last or multi > 1
+def _mr_batchless(y, width, depth, block, multi, activation, style, FC, d):
+    skip = y
+    merger = Sum()
+    skip.feed(merger)
+    z_start = skip.feed(activation())
+    for _ in range(multi):
+        z = z_start
+        for j in range(block):
+            if j > 0:
+                z = z.feed(activation())
+            z = z.feed(FC())
+        z.feed(merger)
+    y = merger
+    return y
+
+def _mr_onelesssum(y, width, depth, block, multi, activation, style, FC, d):
+    # this is my own awful contraption.
+    is_last = d + 1 == depth
+    needs_sum = not is_last or multi > 1
@@ -221,21 +211,43 @@ def multiresnet(x, width, depth, block=2, multi=1,
-            if not is_last:
-                skip.feed(merger)
-            z_start = skip.feed(activation())
-            for i in range(multi):
-                z = z_start
-                for i in range(block):
-                    if i > 0:
-                        z = z.feed(activation())
-                    z = z.feed(FC(size))
-                if needs_sum:
-                    z.feed(merger)
-            if needs_sum:
-                y = merger
-            else:
-                y = z
-        else:
-            raise Exception('unknown resnet style', style)
-
-        last_size = size
-
-    return y
+    if not is_last:
+        skip.feed(merger)
+    z_start = skip.feed(activation())
+    for _ in range(multi):
+        z = z_start
+        for j in range(block):
+            if j > 0:
+                z = z.feed(activation())
+            z = z.feed(FC())
+        if needs_sum:
+            z.feed(merger)
+    if needs_sum:
+        y = merger
+    else:
+        y = z
+    return y
+
+_mr_styles = dict(
+    batchless=_mr_batchless,
+    onelesssum=_mr_onelesssum,
+)
+
+def multiresnet(x, width, depth, block=2, multi=1,
+                activation=Relu, style='batchless',
+                init=init_he_normal):
+    if style not in _mr_styles:
+        raise Exception('unknown resnet style', style)
+
+    y = x
+    last_size = x.output_shape[0]
+
+    for d in range(depth):
+        size = width
+        FC = lambda: Dense(size, init)
+
+        if last_size != size:
+            y = y.feed(FC())
+
+        y = _mr_styles[style](y, width, depth, block, multi, activation, style, FC, d)
+
+        last_size = size
+
+    return y
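A minimal sketch of the dispatch pattern the refactor above introduces: per-style helper functions selected from a dict, with the shared per-depth bookkeeping kept in the caller. The builders here are trivial stand-ins, not the real Dense/Sum layer graph.

def _style_a(y, d):        # stand-in for _mr_batchless
    return y + d

def _style_b(y, d):        # stand-in for _mr_onelesssum
    return y * (d + 1)

_styles = dict(a=_style_a, b=_style_b)

def build(y, depth=2, style='a'):
    if style not in _styles:
        raise Exception('unknown style', style)
    for d in range(depth):
        y = _styles[style](y, d)
    return y

print(build(1, depth=3, style='a'))  # 1 + 0 + 1 + 2 == 4
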
@@ -260,17 +272,17 @@ def normalize_data(data, mean=None, std=None):
 def toy_data(train_samples, valid_samples, problem=2):
     total_samples = train_samples + valid_samples
 
+    nod = normalize_data # shorthand to keep a sane indentation
+
     if problem == 0:
-        from ml.cie_mlp_data import rgbcompare, input_samples, output_samples, \
-                                    inputs, outputs, valid_inputs, valid_outputs, \
-                                    x_scale, y_scale
+        from ml.cie_mlp_data import inputs, outputs, valid_inputs, valid_outputs
         inputs, outputs = _f(inputs), _f(outputs)
         valid_inputs, valid_outputs = _f(valid_inputs), _f(valid_outputs)
 
-        normalize_data(inputs, 127.5, 73.9)
-        normalize_data(outputs, 44.8, 21.7)
-        normalize_data(valid_inputs, 127.5, 73.9)
-        normalize_data(valid_outputs, 44.8, 21.7)
+        nod(inputs, 127.5, 73.9)
+        nod(outputs, 44.8, 21.7)
+        nod(valid_inputs, 127.5, 73.9)
+        nod(valid_outputs, 44.8, 21.7)
 
     elif problem == 1:
         from sklearn.datasets import make_friedman1
@@ -278,8 +290,8 @@ def toy_data(train_samples, valid_samples, problem=2):
         inputs, outputs = _f(inputs), _f(outputs)
         outputs = np.expand_dims(outputs, -1)
 
-        normalize_data(inputs, 0.5, 1/np.sqrt(12))
-        normalize_data(outputs, 14.4, 4.9)
+        nod(inputs, 0.5, 1/np.sqrt(12))
+        nod(outputs, 14.4, 4.9)
 
     elif problem == 2:
         from sklearn.datasets import make_friedman2
@@ -287,11 +299,11 @@ def toy_data(train_samples, valid_samples, problem=2):
         inputs, outputs = _f(inputs), _f(outputs)
         outputs = np.expand_dims(outputs, -1)
 
-        normalize_data(inputs,
+        nod(inputs,
             [5.00e+01, 9.45e+02, 5.01e-01, 5.98e+00],
             [2.89e+01, 4.72e+02, 2.89e-01, 2.87e+00])
 
-        normalize_data(outputs, [482], [380])
+        nod(outputs, [482], [380])
 
     elif problem == 3:
         from sklearn.datasets import make_friedman3
@@ -299,11 +311,11 @@ def toy_data(train_samples, valid_samples, problem=2):
         inputs, outputs = _f(inputs), _f(outputs)
         outputs = np.expand_dims(outputs, -1)
 
-        normalize_data(inputs,
+        nod(inputs,
            [4.98e+01, 9.45e+02, 4.99e-01, 6.02e+00],
            [2.88e+01, 4.73e+02, 2.90e-01, 2.87e+00])
 
-        normalize_data(outputs, [1.32327931], [0.31776295])
+        nod(outputs, [1.32327931], [0.31776295])
 
     else:
         raise Exception("unknown toy data set", problem)
@@ -341,9 +353,6 @@ def model_from_config(config, input_features, output_features, callbacks):
 
     #
 
-    # FIXME: unused variable
-    training = config.epochs > 0 and config.restarts >= 0
-
     if config.fn_load is not None:
         log('loading weights', config.fn_load)
         model.load_weights(config.fn_load)
@@ -390,7 +399,8 @@ def model_from_config(config, input_features, output_features, callbacks):
     elif config.learner == 'dumb':
         learner = DumbLearner(optim, epochs=config.epochs, rate=config.learn,
                               halve_every=config.learn_halve_every,
-                              restarts=config.restarts, restart_advance=config.learn_restart_advance,
+                              restarts=config.restarts,
+                              restart_advance=config.learn_restart_advance,
                               callback=rscb)
         log("final learning rate", "{:10.8f}".format(learner.final_rate))
     elif config.learner == 'sgd':
@@ -430,11 +440,12 @@ def model_from_config(config, input_features, output_features, callbacks):
 
     #
 
-    return model, learner, ritual, (loss, mloss)
+    return model, learner, ritual
 
 # main program {{{1
 
-def run(program, args=[]):
+def run(program, args=None):
+    args = args if args else []
 
     np.random.seed(42069)
 
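The run() signature change above replaces a mutable default argument; a short illustration of the pitfall (and of the "args if args else []" idiom) using hypothetical functions:

def bad(x, acc=[]):       # one shared list object for every call
    acc.append(x)
    return acc

def good(x, acc=None):    # fresh list per call unless one is passed in
    acc = acc if acc else []
    acc.append(x)
    return acc

print(bad(1), bad(2))     # [1, 2] [1, 2]
print(good(1), good(2))   # [1] [2]
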
@@ -469,7 +480,7 @@ def run(program, args=[]):
         epochs = 24,
         restarts = 2,
         restart_decay = 0.25, # only used with SGDR
-        expando = lambda i: i + 1,
+        expando = lambda i: 24 * i,
 
         # misc
         init = 'he_normal',
@@ -484,15 +495,17 @@ def run(program, args=[]):
            # training/validation pairs for each problem (starting from problem 0):
            #(5.08e-05, 6.78e-05),
            (7.577717e-04, 1.255284e-03),
-            (3.032806e-06, 3.963775e-06),
-            (3.676451e-07, 4.495362e-07),
-            (1.854613e-05, 1.623881e-05)
+            # 1080 epochs on these...
+            (1.790511e-07, 2.785208e-07),
+            (2.233277e-08, 3.580281e-08),
+            (5.266719e-07, 5.832677e-06), # overfitting? bad valid set?
        ),
 
        unsafe = True, # aka gotta go fast mode
    )
 
-    for k in ['parallel_style', 'activation', 'optim', 'learner', 'init', 'loss', 'mloss', 'ritual']:
+    for k in ['parallel_style', 'activation', 'optim', 'learner',
+              'init', 'loss', 'mloss', 'ritual']:
        config[k] = config[k].lower()
 
    config.pprint()
@@ -507,18 +520,14 @@ def run(program, args=[]):
 
     callbacks = Dummy()
 
-    model, learner, ritual, (loss, mloss) = \
+    model, learner, ritual = \
         model_from_config(config, input_features, output_features, callbacks)
 
     # Model Information
 
-    if 0:
-        node_names = ' '.join([str(node) for node in model.ordered_nodes])
-        log('{} nodes'.format(len(model.ordered_nodes)), node_names)
-    else:
-        for node in model.ordered_nodes:
-            children = [str(n) for n in node.children]
-            if len(children) > 0:
-                sep = '->'
-                print(str(node)+sep+('\n'+str(node)+sep).join(children))
+    for node in model.ordered_nodes:
+        children = [str(n) for n in node.children]
+        if children:
+            sep = '->'
+            print(str(node)+sep+('\n'+str(node)+sep).join(children))
     log('parameters', model.param_count)
@@ -534,10 +543,8 @@ def run(program, args=[]):
         predicted = model.forward(inputs)
         err = ritual.measure(predicted, outputs)
         log(name + " loss", "{:12.6e}".format(err))
-        # TODO: print logarithmic difference as it might be more meaningful
-        # (fewer results stuck around -99%)
         if comparison:
-            log("improvement", "{:+7.2f}%".format((comparison / err - 1) * 100))
+            log("improvement", "10**({:+7.4f}) times".format(np.log10(comparison / err)))
         return err
 
     train_err = print_error("train",
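The improvement metric above switches from a relative percentage to a log-scale ratio, which reads the same way whether the error shrinks or grows by orders of magnitude (percentages bottom out near -99%). A quick comparison of the two formats with made-up loss values:

import numpy as np

comparison, err = 3.0e-06, 1.8e-07  # previous loss vs. current loss (made up)
print("{:+7.2f}%".format((comparison / err - 1) * 100))
# old format prints roughly +1566.67%
print("10**({:+7.4f}) times".format(np.log10(comparison / err)))
# new format prints roughly 10**(+1.2218) times
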
@@ -551,10 +558,13 @@ def run(program, args=[]):
 
     callbacks.restart = measure_error
 
-    measure_error()
+    training = config.epochs > 0 and config.restarts >= 0
+
+    if training:
+        measure_error()
 
     ritual.prepare(model)
-    while learner.next():
+    while training and learner.next():
         indices = np.arange(inputs.shape[0])
         np.random.shuffle(indices)
         shuffled_inputs = inputs[indices]
@@ -573,11 +583,11 @@ def run(program, args=[]):
 
     measure_error()
 
-    if config.fn_save is not None:
+    if training and config.fn_save is not None:
         log('saving weights', config.fn_save)
         model.save_weights(config.fn_save, overwrite=True)
 
-    if config.log_fn is not None:
+    if training and config.log_fn is not None:
         log('saving losses', config.log_fn)
         np.savez_compressed(config.log_fn,
             batch_losses=np.array(batch_losses, dtype=_f),
@@ -592,5 +602,4 @@ def run(program, args=[]):
 # run main program {{{1
 
 if __name__ == '__main__':
-    import sys
     sys.exit(run(sys.argv[0], sys.argv[1:]))

optim_nn_core.py
@@ -48,19 +48,20 @@ class CategoricalCrossentropy(Loss):
         # TODO: assert dimensionality and p > 0 (if not self.unsafe?)
         p = np.clip(p, self.eps, 1 - self.eps)
         f = np.sum(-y * np.log(p) - (1 - y) * np.log(1 - p), axis=-1)
-        return np.mean(f, axis=-1)
+        return np.mean(f)
 
     def dF(self, p, y):
         p = np.clip(p, self.eps, 1 - self.eps)
         df = (p - y) / (p * (1 - p))
-        return df / y.shape[-1]
+        return df / len(y)
 
 class ResidualLoss(Loss):
     def F(self, p, y): # mean
         return np.mean(self.f(p - y))
 
     def dF(self, p, y): # dmean
-        return self.df(p - y) / y.shape[-1]
+        ret = self.df(p - y) / len(y)
+        return ret
 
 class Squared(ResidualLoss):
     def f(self, r):
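Note that both gradient methods above now divide by len(y), the leading (batch) dimension, instead of y.shape[-1], the feature dimension; for 2-D targets those are different numbers. A tiny check of the distinction:

import numpy as np

y = np.zeros((32, 3))  # batch of 32 targets with 3 output features each
print(y.shape[-1])     # 3  -- old divisor (features)
print(len(y))          # 32 -- new divisor (batch size)
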
@@ -80,7 +81,7 @@ class Absolute(ResidualLoss):
 
 class Optimizer:
     def __init__(self, alpha=0.1):
-        self.alpha = _f(alpha)
+        self.alpha = _f(alpha) # learning rate
         self.reset()
 
     def reset(self):
@@ -97,12 +98,11 @@ class Optimizer:
 
 class Momentum(Optimizer):
     def __init__(self, alpha=0.01, lamb=0, mu=0.9, nesterov=False):
-        self.alpha = _f(alpha) # learning rate
         self.lamb = _f(lamb) # weight decay
         self.mu = _f(mu) # momentum
         self.nesterov = bool(nesterov)
 
-        self.reset()
+        super().__init__(alpha)
 
     def reset(self):
         self.dWprev = None
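The optimizer subclasses above stop assigning alpha themselves and delegate to Optimizer.__init__, which stores the learning rate and then calls reset(); any field a subclass's reset() relies on therefore has to be set before the super().__init__ call. A minimal sketch of the pattern with stand-in classes (not the real optimizers):

class Base:
    def __init__(self, alpha=0.1):
        self.alpha = float(alpha)   # learning rate
        self.reset()

    def reset(self):
        pass

class Child(Base):
    def __init__(self, alpha=0.01, mu=0.9):
        self.mu = float(mu)         # assigned first so reset() can use it
        super().__init__(alpha)     # sets alpha, then calls reset()

    def reset(self):
        self.accum = 0.0 * self.mu

c = Child()
print(c.alpha, c.mu, c.accum)       # 0.01 0.9 0.0
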
@@ -116,7 +116,6 @@ class Momentum(Optimizer):
         self.dWprev[:] = V
         if self.nesterov: # TODO: is this correct? looks weird
             return self.mu * V - self.alpha * (dW + W * self.lamb)
-        else:
-            return V
+        return V
 
 class RMSprop(Optimizer):
@@ -127,7 +126,6 @@ class RMSprop(Optimizer):
     # RMSprop.mu == 1
 
     def __init__(self, alpha=0.0001, mu=0.99, eps=1e-8):
-        self.alpha = _f(alpha) # learning rate
         self.mu = _f(mu) # decay term
         self.eps = _f(eps)
 
@@ -138,7 +136,7 @@ class RMSprop(Optimizer):
         # an input decays to 1/e its original amplitude over 99.5 epochs.
         # (this is from DSP, so how relevant it is in SGD is debatable)
 
-        self.reset()
+        super().__init__(alpha)
 
     def reset(self):
         self.g = None
@@ -168,14 +166,13 @@ class Adam(Optimizer):
     # Adam.b2_t == 0
 
     def __init__(self, alpha=0.001, b1=0.9, b2=0.999, b1_t=0.9, b2_t=0.999, eps=1e-8):
-        self.alpha = _f(alpha) # learning rate
         self.b1 = _f(b1) # decay term
         self.b2 = _f(b2) # decay term
         self.b1_t_default = _f(b1_t) # decay term power t
         self.b2_t_default = _f(b2_t) # decay term power t
         self.eps = _f(eps)
 
-        self.reset()
+        super().__init__(alpha)
 
     def reset(self):
         self.mt = None
@@ -249,14 +246,7 @@ class Layer:
     def dmulti(self, dB):
         if len(dB) == 1:
             return self.dF(dB[0])
-        else:
-            dX = None
-            for dY in dB:
-                if dX is None:
-                    dX = self.dF(dY)
-                else:
-                    dX += self.dF(dY)
-            return dX
+        return sum((self.dF(dY) for dY in dB))
 
     # general utility methods:
 
@@ -267,10 +257,7 @@ class Layer:
         if shape is None:
             return False
         self.input_shape = shape
-        if np.all(self.input_shape == parent.output_shape):
-            return True
-        else:
-            return False
+        return np.all(self.input_shape == parent.output_shape)
 
     def feed(self, child):
         if not child.compatible(self):
@@ -288,7 +275,7 @@ class Layer:
 
     def forward(self, lut):
         if not self.unsafe:
-            assert len(self.parents) > 0, self
+            assert self.parents, self
         B = []
         for parent in self.parents:
             # TODO: skip over irrelevant nodes (if any)
@@ -303,7 +290,7 @@ class Layer:
 
     def backward(self, lut):
         if not self.unsafe:
-            assert len(self.children) > 0, self
+            assert self.children, self
         dB = []
         for child in self.children:
             # TODO: skip over irrelevant nodes (if any)
@@ -643,7 +630,6 @@ class Ritual: # i'm just making up names at this point
         avg_loss = cumsum_loss / _f(batch_count)
         if return_losses:
             return avg_loss, losses
-        else:
-            return avg_loss
+        return avg_loss
 
 # Learners {{{1
@@ -734,12 +720,12 @@ class SGDR(Learner):
         self.restarts = int(restarts)
         self.restart_callback = callback
         # TODO: rename expando to something not insane
-        self.expando = expando if expando is not None else lambda i: 1
+        self.expando = expando if expando is not None else lambda i: i
 
         self.splits = []
         epochs = 0
         for i in range(0, self.restarts + 1):
-            split = epochs + int(self.restart_epochs * self.expando(i))
+            split = epochs + self.restart_epochs + int(self.expando(i))
             self.splits.append(split)
             epochs = split
         super().__init__(optim, epochs, rate)
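The split calculation above changes from scaling the base cycle length by expando(i) to adding expando(i) epochs on top of it (the default expando also changes from a constant 1 to the identity). A rough check of the schedule this produces, assuming restart_epochs=24, restarts=2, and the config's expando = lambda i: 24 * i:

restart_epochs = 24
restarts = 2
expando = lambda i: 24 * i

splits = []
epochs = 0
for i in range(restarts + 1):
    split = epochs + restart_epochs + int(expando(i))  # new, additive form
    splits.append(split)
    epochs = split

print(splits)  # [24, 72, 144] -> cycles of 24, 48, and 72 epochs
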