commit 42a66d4d6c
parent 3106495704

2 changed files with 111 additions and 39 deletions

optim_nn.py (65 changed lines)
@@ -68,9 +68,7 @@ class LayerNorm(Layer):
         return shape

     def init(self, W, dW):
-        # TODO: move this little bit into super(), also assert against self.size
-        self.W = W
-        self.dW = dW
+        super().init(W, dW)

         f = self.features

@@ -95,7 +93,6 @@ class LayerNorm(Layer):
         length = dY.shape[0]

         if self.affine:
-            # Y = gamma * Xnorm + beta
             dXnorm = dY * self.gamma
             self.dgamma[:] = (dY * self.Xnorm).sum(0)
             self.dbeta[:] = dY.sum(0)
@@ -332,14 +329,22 @@ def multiresnet(x, width, depth, block=2, multi=1,
 inits = dict(he_normal=init_he_normal, he_uniform=init_he_uniform)
 activations = dict(sigmoid=Sigmoid, tanh=Tanh, relu=Relu, elu=Elu, gelu=GeluApprox)

+def prettyize(data):
+    if isinstance(data, np.ndarray):
+        s = ', '.join(('{:8.2e}'.format(n) for n in data))
+        s = '[' + s + ']'
+    else:
+        s = '{:8.2e}'.format(data)
+    return s
+
 def normalize_data(data, mean=None, std=None):
     # in-place
     if mean is None or std is None:
         mean = np.mean(data, axis=0)
         std = np.std(data, axis=0)
-        # TODO: construct function call string for copy-paste convenience
-        lament('mean:', mean)
-        lament('std: ', std)
+        mean_str = prettyize(mean)
+        std_str = prettyize(std)
+        lament('nod(...,\n {},\n {})'.format(mean_str, std_str))
         sys.exit(1)
     data -= _f(mean)
     data /= _f(std)
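
The added prettyize helper resolves the old TODO above: instead of printing two raw arrays, normalize_data now emits a ready-to-paste call string containing the computed mean and std. A quick check of the formatting it produces, as a hedged sketch assuming plain NumPy and the prettyize definition from this hunk:

import numpy as np

print(prettyize(np.array([1.5, 0.25])))  # -> [1.50e+00, 2.50e-01]
print(prettyize(3.0))                    # -> 3.00e+00
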
@@ -410,12 +415,12 @@ def toy_data(train_samples, valid_samples, problem=2):

 def optim_from_config(config):
     if config.optim == 'adam':
-        assert not config.nesterov, "unimplemented"
         d1 = config.optim_decay1 if 'optim_decay1' in config else 9.5
         d2 = config.optim_decay2 if 'optim_decay2' in config else 999.5
         b1 = np.exp(-1/d1)
         b2 = np.exp(-1/d2)
-        optim = Adam(b1=b1, b1_t=b1, b2=b2, b2_t=b2)
+        o = Nadam if config.nesterov else Adam
+        optim = o(b1=b1, b2=b2)
     elif config.optim in ('rms', 'rmsprop'):
         d2 = config.optim_decay2 if 'optim_decay2' in config else 99.5
         mu = np.exp(-1/d2)
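
The decay constants here are converted into exponential decay rates via b = exp(-1/d). A sanity check of the fallback values (a sketch, assuming plain NumPy; not part of the source):

import numpy as np

print(np.exp(-1 / 9.5))    # ~0.900, Adam's customary beta1
print(np.exp(-1 / 999.5))  # ~0.999, Adam's customary beta2
print(np.exp(-1 / 99.5))   # ~0.990, the rms/rmsprop fallback below
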
@@ -550,14 +555,15 @@ def run(program, args=None):
         optim = 'adam',
         optim_decay1 = 2, # given in epochs (optional)
         optim_decay2 = 100, # given in epochs (optional)
-        momentum = 0.50, # only used with SGD
-        nesterov = False, # only used with SGD or Adam
+        momentum = 0.90, # only used with SGD
+        nesterov = True, # only used with SGD or Adam
         batch_size = 64,

         # learning parameters
         learner = 'sgdr',
         learn = 1e-2,
         epochs = 24,
+        learn_halve_every = 16, # only used with anneal/dumb
         restarts = 2,
         restart_decay = 0.25, # only used with SGDR
         expando = lambda i: 24 * i,
@@ -569,8 +575,9 @@ def run(program, args=None):
         ritual = 'default',
         restart_optim = False, # restarts also reset internal state of optimizer
         warmup = True,
+        log10_loss = True, # personally, i'm sick of looking linear loss values!

-        problem = 2,
+        problem = 3,
         compare = (
             # best results for ~10,000 parameters
             # training/validation pairs for each problem (starting from problem 0):
@@ -592,7 +599,6 @@ def run(program, args=None):
     config.pprint()

     # Toy Data {{{2
-    # (our model is probably complete overkill for this, so TODO: better data)

     (inputs, outputs), (valid_inputs, valid_outputs) = \
         toy_data(2**14, 2**11, problem=config.problem)
@@ -624,8 +630,11 @@ def run(program, args=None):
         predicted = model.forward(inputs)
         err = ritual.measure(predicted, outputs)
         log(name + " loss", "{:12.6e}".format(err))
-        if comparison:
-            log("improvement", "10**({:+7.4f}) times".format(np.log10(comparison / err)))
+        if config.log10_loss:
+            log(name + " log10-loss", "{:+6.3f}".format(np.log10(err)))
+        elif comparison:
+            fmt = "10**({:+7.4f}) times"
+            log("improvement", fmt.format(np.log10(comparison / err)))
         return err

     train_err = print_error("train",
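
With log10_loss enabled, print_error reports the base-10 exponent of the loss rather than its linear value, which makes order-of-magnitude progress easier to scan. A small worked example of the formatting (plain arithmetic, not from the source):

import numpy as np

err = 3.2e-5
print("{:+6.3f}".format(np.log10(err)))  # -> -4.495
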
@@ -645,11 +654,19 @@ def run(program, args=None):

     if training and config.warmup:
         log("warming", "up")
-        ritual.train_batched(
-            np.random.normal(size=inputs.shape),
-            np.random.normal(size=outputs.shape),
-            config.batch_size)
-        ritual.reset()
+
+        # use plain SGD in warmup to prevent (or possibly cause?) numeric issues
+        temp_optim = learner.optim
+        learner.optim = Optimizer(alpha=0.01)
+
+        for _ in range(2):
+            ritual.train_batched(
+                np.random.normal(size=inputs.shape),
+                np.random.normal(size=outputs.shape),
+                config.batch_size)
+            ritual.reset()
+
+        learner.optim = temp_optim

     if training:
         measure_error()
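
The warmup now temporarily swaps in the bare Optimizer (plain SGD) for the random-noise passes, which the commit's own comment justifies as heading off numeric issues, and restores the configured optimizer afterwards. A slightly more defensive variant of the same save/swap/restore pattern, as a hedged sketch (warm_up is a hypothetical stand-in for the two train_batched passes above, not a function in the source):

temp_optim = learner.optim
learner.optim = Optimizer(alpha=0.01)  # plain SGD while feeding random noise
try:
    warm_up()                          # hypothetical: the noise passes + ritual.reset()
finally:
    learner.optim = temp_optim         # restore the configured optimizer even on error
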
@@ -668,8 +685,12 @@ def run(program, args=None):

         #log("learning rate", "{:10.8f}".format(learner.rate))
         #log("average loss", "{:11.7f}".format(avg_loss))
-        fmt = "epoch {:4.0f}, rate {:10.8f}, loss {:12.6e}"
-        log("info", fmt.format(learner.epoch + 1, learner.rate, avg_loss))
+        if config.log10_loss:
+            fmt = "epoch {:4.0f}, rate {:10.8f}, log10-loss {:+6.3f}"
+            log("info", fmt.format(learner.epoch + 1, learner.rate, np.log10(avg_loss)))
+        else:
+            fmt = "epoch {:4.0f}, rate {:10.8f}, loss {:12.6e}"
+            log("info", fmt.format(learner.epoch + 1, learner.rate, avg_loss))

     measure_error()

@@ -97,25 +97,24 @@ class Optimizer:
 # https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/optimizers/optimizer.h

 class Momentum(Optimizer):
-    def __init__(self, alpha=0.01, lamb=0, mu=0.9, nesterov=False):
-        self.lamb = _f(lamb) # weight decay
+    def __init__(self, alpha=0.01, mu=0.9, nesterov=False):
         self.mu = _f(mu) # momentum
         self.nesterov = bool(nesterov)

         super().__init__(alpha)

     def reset(self):
-        self.dWprev = None
+        self.Vprev = None

     def compute(self, dW, W):
-        if self.dWprev is None:
-            #self.dWprev = np.zeros_like(dW)
-            self.dWprev = np.copy(dW)
+        if self.Vprev is None:
+            self.Vprev = np.copy(dW)

-        V = self.mu * self.dWprev - self.alpha * (dW + W * self.lamb)
-        self.dWprev[:] = V
-        if self.nesterov: # TODO: is this correct? looks weird
-            return self.mu * V - self.alpha * (dW + W * self.lamb)
+        V = self.mu * self.Vprev - self.alpha * dW
+        self.Vprev[:] = V
+        if self.nesterov:
+            return self.mu * V - self.alpha * dW
+
         return V

 class RMSprop(Optimizer):
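
The rewrite renames the velocity buffer to Vprev, drops the lamb weight-decay term, and keeps the Nesterov branch but applies it to the freshly updated velocity. The same update extracted into a standalone function for clarity (a sketch operating elementwise on NumPy arrays; not part of the source):

def momentum_step(V, dW, alpha=0.01, mu=0.9, nesterov=False):
    # V: running velocity (the class above seeds it with a copy of the first gradient)
    V_new = mu * V - alpha * dW
    if nesterov:
        # look-ahead: momentum applied to the updated velocity, plus a fresh gradient step
        return V_new, mu * V_new - alpha * dW
    return V_new, V_new  # classical momentum: the velocity itself is the step

# e.g. V, step = momentum_step(V, dW, nesterov=True); W += step
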
@@ -154,6 +153,7 @@ class RMSprop(Optimizer):
         return -self.alpha * dW / np.sqrt(self.g + self.eps)

 class Adam(Optimizer):
+    # paper: https://arxiv.org/abs/1412.6980
     # Adam generalizes* RMSprop, and
     # adds a decay term to the regular (non-squared) delta, and
     # does some decay-gain voodoo. (i guess it's compensating
@@ -165,11 +165,11 @@ class Adam(Optimizer):
     # Adam.b1_t == 0
     # Adam.b2_t == 0

-    def __init__(self, alpha=0.001, b1=0.9, b2=0.999, b1_t=0.9, b2_t=0.999, eps=1e-8):
+    def __init__(self, alpha=0.002, b1=0.9, b2=0.999, eps=1e-8):
         self.b1 = _f(b1) # decay term
         self.b2 = _f(b2) # decay term
-        self.b1_t_default = _f(b1_t) # decay term power t
-        self.b2_t_default = _f(b2_t) # decay term power t
+        self.b1_t_default = _f(b1) # decay term power t
+        self.b2_t_default = _f(b2) # decay term power t
         self.eps = _f(eps)

         super().__init__(alpha)
@@ -197,6 +197,53 @@ class Adam(Optimizer):
         return -self.alpha * (self.mt / (1 - self.b1_t)) \
             / np.sqrt((self.vt / (1 - self.b2_t)) + self.eps)

+class Nadam(Optimizer):
+    # paper: https://arxiv.org/abs/1412.6980
+    # paper: http://cs229.stanford.edu/proj2015/054_report.pdf
+    # TODO; double-check this implementation. also actually read the damn paper.
+    # lifted from https://github.com/fchollet/keras/blob/5d38b04/keras/optimizers.py#L530
+    # lifted from https://github.com/jpilaul/IFT6266_project/blob/master/Models/Algo_Momentum.py
+
+    def __init__(self, alpha=0.002, b1=0.9, b2=0.999, eps=1e-8):
+        self.b1 = _f(b1) # decay term
+        self.b2 = _f(b2) # decay term
+        self.eps = _f(eps)
+
+        super().__init__(alpha)
+
+    def reset(self):
+        self.mt = None
+        self.vt = None
+        self.t = 0
+        self.sched = 1
+
+    def compute(self, dW, W):
+        self.t += 1
+
+        if self.mt is None:
+            self.mt = np.zeros_like(dW)
+        if self.vt is None:
+            self.vt = np.zeros_like(dW)
+
+        ut0 = self.b1 * (1 - 0.5 * 0.96**(self.t + 0))
+        ut1 = self.b1 * (1 - 0.5 * 0.96**(self.t + 1))
+
+        sched0 = self.sched * ut0
+        sched1 = self.sched * ut0 * ut1
+        self.sched = sched0
+
+        gp = dW / (1 - sched0)
+
+        self.mt[:] = self.b1 * self.mt + (1 - self.b1) * dW
+        self.vt[:] = self.b2 * self.vt + (1 - self.b2) * np.square(dW)
+
+        mtp = self.mt / (1 - sched1)
+        vtp = self.vt / (1 - self.b2**self.t)
+
+        mt_bar = (1 - ut0) * gp + ut1 * mtp
+
+        return -self.alpha * mt_bar / (np.sqrt(vtp) + self.eps)
+
 # Abstract Layers {{{1

 class Layer:
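
With the optim_from_config change earlier in this commit, Nadam is what nesterov=True now selects under optim='adam'. Constructing it directly looks roughly like this (a hedged sketch; it assumes, as the other optimizers here suggest, that reset() runs before the first compute() and that compute() returns a delta for the caller to add to the weights):

optim = Nadam(alpha=0.002, b1=0.9, b2=0.999)
optim.reset()                 # initializes mt, vt, t, and the momentum schedule
delta = optim.compute(dW, W)  # -> -alpha * mt_bar / (sqrt(vtp) + eps)
W += delta                    # assumed convention: the caller applies the returned step
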
@@ -273,6 +320,12 @@ class Layer:
     def validate_output(self, Y):
         assert Y.shape[1:] == self.output_shape, (str(self), Y.shape[1:], self.output_shape)

+    def init(self, W, dW):
+        assert W.ndim == 1 and W.shape[0] == self.size, W.shape
+        assert dW.ndim == 1 and dW.shape[0] == self.size, dW.shape
+        self.W = W
+        self.dW = dW
+
     def forward(self, lut):
         if not self.unsafe:
             assert self.parents, self
@@ -430,10 +483,10 @@ class Dense(Layer):
         return shape

     def init(self, W, dW):
+        super().init(W, dW)
+
         ins, outs = self.input_shape[0], self.output_shape[0]

-        self.W = W
-        self.dW = dW
         self.coeffs = self.W[:self.nW].reshape(ins, outs)
         self.biases = self.W[self.nW:].reshape(1, outs)
         self.dcoeffs = self.dW[:self.nW].reshape(ins, outs)
@@ -507,12 +560,10 @@ class Model:

     def backward(self, error):
         lut = dict()
-        input_node = self.ordered_nodes[0]
         output_node = self.ordered_nodes[-1]
         lut[output_node] = output_node.dmulti(np.expand_dims(error, 0))
         for node in reversed(self.ordered_nodes[:-1]):
             lut[node] = node.backward(lut)
-        #return lut[input_node] # meaningless value
         return self.dW

     def load_weights(self, fn):