This commit is contained in:
parent b5a6bb0f19
commit 8fc1c198b4
1 changed file with 90 additions and 25 deletions

optim_nn.py (+90, -25)
@@ -12,6 +12,7 @@ from scipy.special import expit as sigmoid
 
 # used for numbering layers like Keras:
 from collections import defaultdict
+_layer_counters = defaultdict(lambda: 0)
 
 # Initializations
 
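For reference, a minimal sketch of how a defaultdict counter like this can back Keras-style layer names; the kerasish_name helper below is hypothetical and not part of the diff:

from collections import defaultdict

_layer_counters = defaultdict(lambda: 0)

def kerasish_name(kind):
    # hypothetical helper: produces names like "dense_1", "dense_2", ...
    _layer_counters[kind] += 1
    return '{}_{}'.format(kind.lower(), _layer_counters[kind])

print(kerasish_name('Dense'))  # dense_1
print(kerasish_name('Dense'))  # dense_2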
@@ -142,8 +143,6 @@ class Adam(Optimizer):
 
 # Abstract Layers
 
-_layer_counters = defaultdict(lambda: 0)
-
 class Layer:
     def __init__(self):
         self.parents = []
@@ -297,7 +296,6 @@ class Affine(Layer):
 
 class Sigmoid(Layer): # aka Logistic
     def F(self, X):
-        from scipy.special import expit as sigmoid
         self.sig = sigmoid(X)
         return self.sig
 
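As a standalone sanity check of the logistic layer above, independent of the Layer API: the forward value and the derivative a backward pass would typically use, sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x)):

import numpy as np
from scipy.special import expit as sigmoid

X = np.linspace(-4, 4, 5)
sig = sigmoid(X)
print(sig)             # cached as self.sig in the layer above
print(sig * (1 - sig)) # derivative used on the backward pass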
@@ -322,6 +320,7 @@ class Relu(Layer):
 
 class Elu(Layer):
+    # paper: https://arxiv.org/abs/1511.07289
 
     def __init__(self, alpha=1):
         super().__init__()
         self.alpha = nf(alpha)
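For context, a plain-NumPy sketch of the ELU nonlinearity referenced above (alpha as in the constructor; the class's own F/dF are not shown in this hunk):

import numpy as np

def elu(X, alpha=1.0):
    # ELU: x for x >= 0, alpha * (exp(x) - 1) otherwise.
    return np.where(X >= 0, X, alpha * (np.exp(X) - 1))

print(elu(np.array([-2.0, -0.5, 0.0, 1.5])))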
@@ -337,6 +336,7 @@ class Elu(Layer):
 class GeluApprox(Layer):
+    # paper: https://arxiv.org/abs/1606.08415
     # plot: https://www.desmos.com/calculator/ydzgtccsld
 
     def F(self, X):
         self.a = 1.704 * X
         self.sig = sigmoid(self.a)
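The approximation the class name refers to, in isolation: x * sigmoid(1.704 * x), matching the cached self.a and self.sig above:

import numpy as np
from scipy.special import expit as sigmoid

def gelu_approx(X):
    # sigmoid approximation of GELU
    return X * sigmoid(1.704 * X)

print(gelu_approx(np.array([-2.0, 0.0, 2.0])))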
@@ -407,7 +407,10 @@ class DenseOneLess(Dense):
         np.fill_diagonal(self.dcoeffs, 0)
         return dX
 
-class LayerNorm(Layer): # TODO: inherit Affine instead?
+class LayerNorm(Layer):
+    # paper: https://arxiv.org/abs/1607.06450
+    # my implementation may be incorrect.
+
     def __init__(self, eps=1e-3, axis=-1):
         super().__init__()
         self.eps = nf(eps)
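A minimal layer-normalization sketch for comparison, normalizing along one axis with the same eps default; any learned gain or bias the class might add is omitted, and this is not claimed to match the author's implementation:

import numpy as np

def layer_norm(X, eps=1e-3, axis=-1):
    # normalize each sample to zero mean / unit variance along `axis`
    mean = X.mean(axis=axis, keepdims=True)
    var = X.var(axis=axis, keepdims=True)
    return (X - mean) / np.sqrt(var + eps)

print(layer_norm(np.array([[1.0, 2.0, 3.0],
                           [10.0, 20.0, 40.0]])))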
@@ -556,13 +559,17 @@ class Ritual: # i'm just making up names at this point
         self.learner.optim.update(self.model.dW, self.model.W)
 
     def prepare(self, model):
+        self.en = 0
+        self.bn = 0
         self.model = model
 
     def train_batched(self, inputs, outputs, batch_size, return_losses=False):
+        self.en += 1
         cumsum_loss = 0
         batch_count = inputs.shape[0] // batch_size
         losses = []
         for b in range(batch_count):
+            self.bn += 1
             bi = b * batch_size
             batch_inputs  = inputs[ bi:bi+batch_size]
             batch_outputs = outputs[bi:bi+batch_size]
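The new bookkeeping in isolation: en counts calls to train_batched (epochs), bn counts batches across the whole run; a toy loop with the same slicing, assuming nothing else about the class:

import numpy as np

inputs = np.arange(10).reshape(10, 1)
batch_size, en, bn = 4, 0, 0
en += 1
for b in range(inputs.shape[0] // batch_size):
    bn += 1
    bi = b * batch_size
    print(en, bn, inputs[bi:bi+batch_size].ravel())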
@@ -587,6 +594,7 @@ class Ritual: # i'm just making up names at this point
 
 def stochastic_multiply(W, gamma=0.5, allow_negation=True):
+    # paper: https://arxiv.org/abs/1606.01981
 
     assert W.ndim == 1, W.ndim
     assert 0 < gamma < 1, gamma
     size = len(W)
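The body of stochastic_multiply is not shown in this hunk, so the following is only a rough guess at the idea (random per-weight scaling between gamma and 1/gamma with occasional sign flips); the scaling distribution and the 5% negation probability are invented for illustration:

import numpy as np

def stochastic_multiply_sketch(W, gamma=0.5, allow_negation=True):
    # scale each weight by a random factor in [gamma, 1/gamma];
    # optionally flip a few signs. both choices are assumptions.
    size = len(W)
    factors = np.exp(np.random.uniform(np.log(gamma), -np.log(gamma), size=size))
    if allow_negation:
        factors[np.random.rand(size) < 0.05] *= -1
    W *= factors

W = np.ones(8)
stochastic_multiply_sketch(W)
print(W)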
@@ -632,7 +640,28 @@ class StochMRitual(Ritual):
         for layer in self.model.ordered_nodes:
             if isinstance(layer, Dense):
                 np.clip(layer.W, -layer.std * f, layer.std * f, out=layer.W)
                 # np.clip(layer.W, -1, 1, out=layer.W)
 
+class NoisyRitual(Ritual):
+    def __init__(self, learner=None, loss=None, mloss=None,
+                 input_noise=0, output_noise=0, gradient_noise=0):
+        self.input_noise = nf(input_noise)   # TODO: implement
+        self.output_noise = nf(output_noise) # TODO: implement
+        self.gradient_noise = nf(gradient_noise)
+        super().__init__(learner, loss, mloss)
+
+    def update(self):
+        # gradient noise paper: https://arxiv.org/abs/1511.06807
+        if self.gradient_noise > 0:
+            size = len(self.model.dW)
+            gamma = 0.55
+            s = self.gradient_noise / (1 + self.bn) ** gamma
+            # experiments:
+            #s = np.sqrt(self.learner.rate)
+            #s = np.square(self.learner.rate)
+            #s = self.learner.rate / self.en
+            self.model.dW += np.random.normal(0, s, size=size)
+        super().update()
+
 class Learner:
     per_batch = False
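The annealed noise scale used in NoisyRitual.update(), sigma = gradient_noise / (1 + bn)**0.55, evaluated for a few batch numbers, plus the way it perturbs a toy gradient vector:

import numpy as np

gradient_noise, gamma = 0.01, 0.55
for bn in (1, 10, 100, 1000):
    print(bn, gradient_noise / (1 + bn)**gamma)  # noise shrinks as training progresses

dW = np.zeros(5)
dW += np.random.normal(0, gradient_noise / (1 + 1)**gamma, size=len(dW))
print(dW)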
@@ -733,28 +762,46 @@ def cosmod(x):
 class SGDR(Learner):
     # Stochastic Gradient Descent with Restarts
-    # NOTE: this is not a complete implementation.
+    # paper: https://arxiv.org/abs/1608.03983
+    # NOTE: this is missing a couple features.
 
     per_batch = True
 
-    def __init__(self, optim, epochs=100, rate=None, restarts=0, restart_decay=0.5, callback=None):
+    def __init__(self, optim, epochs=100, rate=None,
+                 restarts=0, restart_decay=0.5, callback=None,
+                 expando=None):
         self.restart_epochs = int(epochs)
         self.decay = float(restart_decay)
         self.restarts = int(restarts)
         self.restart_callback = callback
-        epochs = self.restart_epochs * (self.restarts + 1)
+        # TODO: rename expando to something not insane
+        self.expando = expando if expando is not None else lambda i: 1
+
+        self.splits = []
+        epochs = 0
+        for i in range(0, self.restarts + 1):
+            split = epochs + int(self.restart_epochs * self.expando(i))
+            self.splits.append(split)
+            epochs = split
         super().__init__(optim, epochs, rate)
 
+    def split_num(self, epoch):
+        shit = [0] + self.splits # hack
+        for i in range(0, len(self.splits)):
+            if epoch < self.splits[i]:
+                sub_epoch = epoch - shit[i]
+                next_restart = self.splits[i] - shit[i]
+                return i, sub_epoch, next_restart
+        raise Exception('this should never happen.')
+
     def rate_at(self, epoch):
-        sub_epoch = epoch % self.restart_epochs
-        x = sub_epoch / self.restart_epochs
-        restart = epoch // self.restart_epochs
+        restart, sub_epoch, next_restart = self.split_num(epoch)
+        x = sub_epoch / next_restart
         return self.start_rate * self.decay**restart * cosmod(x)
 
     def next(self):
         if not super().next():
             return False
-        sub_epoch = self.epoch % self.restart_epochs
-        restart = self.epoch // self.restart_epochs
+        restart, sub_epoch, next_restart = self.split_num(self.epoch)
         if restart > 0 and sub_epoch == 0:
             if self.restart_callback is not None:
                 self.restart_callback(restart)
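A worked example of the new restart schedule, assuming cosmod() is the usual half-cosine ramp from 1 down to 0 (its body is not shown in this hunk), using the defaults this commit sets later (epochs=12, restarts=2, expando = lambda i: i + 1, restart_decay=1):

import numpy as np

def cosmod(x):
    # assumed shape: 1 at x=0 falling to 0 at x=1
    return (1 + np.cos(np.pi * x)) / 2

restart_epochs, restarts, decay, start_rate = 12, 2, 1.0, 1e-2
expando = lambda i: i + 1

splits, epochs = [], 0
for i in range(restarts + 1):
    epochs += int(restart_epochs * expando(i))
    splits.append(epochs)
print(splits)  # [12, 36, 72]: each restart cycle is longer than the last

def split_num(epoch):
    prev = [0] + splits
    for i in range(len(splits)):
        if epoch < splits[i]:
            return i, epoch - prev[i], splits[i] - prev[i]
    raise Exception('this should never happen.')

for epoch in (0, 6, 12, 35, 71):
    restart, sub_epoch, next_restart = split_num(epoch)
    print(epoch, start_rate * decay**restart * cosmod(sub_epoch / next_restart))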
@@ -789,6 +836,7 @@ def multiresnet(x, width, depth, block=2, multi=1,
                 z.feed(merger)
             y = merger
         elif style == 'onelesssum':
             # this is my own awful contraption.
             is_last = d + 1 == depth
+            needs_sum = not is_last or multi > 1
             skip = y
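A quick check of the needs_sum flag above for a few (d, depth, multi) combinations:

for depth in (1, 3):
    for multi in (1, 2):
        for d in range(depth):
            is_last = d + 1 == depth
            needs_sum = not is_last or multi > 1
            print(dict(d=d, depth=depth, multi=multi, needs_sum=needs_sum))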
@@ -845,16 +893,17 @@ def run(program, args=[]):
 
         optim = 'adam',
         nesterov = False, # only used with SGD or Adam
-        momentum = 0.33, # only used with SGD
+        momentum = 0.50, # only used with SGD
 
         # learning parameters
-        learner = 'SGDR',
+        learner = 'sgdr',
         learn = 1e-2,
-        epochs = 24,
-        restarts = 2,
-        learn_decay = 0.25, # only used with SGDR
         learn_halve_every = 16, # unused with SGDR
         learn_restart_advance = 16, # unused with SGDR
+        epochs = 12,
+        restarts = 2,
+        restart_decay = 1, # only used with SGDR
+        expando = lambda i: i + 1,
 
         # misc
         batch_size = 64,
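What these defaults imply for the SGDR learner built from them, using the split computation added above: cycles of 12, 24 and 36 epochs, 72 epochs total, with restart_decay = 1 keeping the peak rate the same after each restart.

epochs, restarts = 12, 2
expando = lambda i: i + 1
print(sum(int(epochs * expando(i)) for i in range(restarts + 1)))  # 72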
@@ -866,10 +915,13 @@ def run(program, args=[]):
         train_compare = 0.0000508,
         valid_compare = 0.0000678,
 
-        ritual = None,
+        ritual = 'default',
     )
 
-    config.pprint()
+    for k in ['parallel_style', 'optim', 'learner', 'ritual']:
+        config[k] = config[k].lower()
+
+    #config.pprint()
 
     # toy CIE-2000 data
     from ml.cie_mlp_data import rgbcompare, input_samples, output_samples, \
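The normalization step in isolation, with a plain dict standing in for the Config object (whose implementation is not part of this hunk):

config = {'parallel_style': 'OneLessSum', 'optim': 'Adam',
          'learner': 'SGDR', 'ritual': 'Default'}
for k in ['parallel_style', 'optim', 'learner', 'ritual']:
    config[k] = config[k].lower()
print(config)  # every selector is lowercase, matching the comparisons below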
@@ -933,17 +985,27 @@ def run(program, args=[]):
 
     #
 
-    if config.learner == 'SGDR':
+    if config.learner == 'sgdr':
+        expando = config.expando if 'expando' in config else None
         learner = SGDR(optim, epochs=config.epochs, rate=config.learn,
-                       restart_decay=config.learn_decay, restarts=config.restarts,
-                       callback=rscb)
+                       restart_decay=config.restart_decay, restarts=config.restarts,
+                       callback=rscb, expando=expando)
         # final learning rate isn't of interest here; it's gonna be close to 0.
-    else:
+        log('total epochs:', learner.epochs)
+    elif config.learner == 'anneal':
         learner = AnnealingLearner(optim, epochs=config.epochs, rate=config.learn,
                                    halve_every=config.learn_halve_every)
+    elif config.learner == 'dumb':
+        learner = DumbLearner(optim, epochs=config.epochs, rate=config.learn,
+                              halve_every=config.learn_halve_every,
+                              restarts=config.restarts, restart_advance=config.learn_restart_advance,
+                              callback=rscb)
         log("final learning rate", "{:10.8f}".format(learner.final_rate))
+    elif config.learner == 'sgd':
+        learner = Learner(optim, epochs=config.epochs, rate=config.learn)
+        log("final learning rate", "{:10.8f}".format(learner.final_rate))
+    else:
+        raise Exception('unknown learner', config.learner)
 
     #
 
@@ -961,10 +1023,13 @@ def run(program, args=[]):
     loss = lookup_loss(config.loss)
     mloss = lookup_loss(config.mloss) if config.mloss else loss
 
-    if config.ritual == None:
+    if config.ritual == 'default':
         ritual = Ritual(learner=learner, loss=loss, mloss=mloss)
     elif config.ritual == 'stochm':
         ritual = StochMRitual(learner=learner, loss=loss, mloss=mloss)
+    elif config.ritual == 'noisy':
+        ritual = NoisyRitual(learner=learner, loss=loss, mloss=mloss,
+                             gradient_noise=0.01)
     else:
         raise Exception('unknown ritual', config.ritual)
 