Connor Olding 2017-02-17 22:53:44 -08:00
parent 3106495704
commit 42a66d4d6c
2 changed files with 111 additions and 39 deletions

View file

@@ -68,9 +68,7 @@ class LayerNorm(Layer):
         return shape

     def init(self, W, dW):
-        # TODO: move this little bit into super(), also assert against self.size
-        self.W = W
-        self.dW = dW
+        super().init(W, dW)

         f = self.features
@@ -95,7 +93,6 @@ class LayerNorm(Layer):
         length = dY.shape[0]

         if self.affine:
-            # Y = gamma * Xnorm + beta
             dXnorm = dY * self.gamma
             self.dgamma[:] = (dY * self.Xnorm).sum(0)
             self.dbeta[:] = dY.sum(0)
@@ -332,14 +329,22 @@ def multiresnet(x, width, depth, block=2, multi=1,
 inits = dict(he_normal=init_he_normal, he_uniform=init_he_uniform)
 activations = dict(sigmoid=Sigmoid, tanh=Tanh, relu=Relu, elu=Elu, gelu=GeluApprox)

+def prettyize(data):
+    if isinstance(data, np.ndarray):
+        s = ', '.join(('{:8.2e}'.format(n) for n in data))
+        s = '[' + s + ']'
+    else:
+        s = '{:8.2e}'.format(data)
+    return s
+
 def normalize_data(data, mean=None, std=None):
     # in-place
     if mean is None or std is None:
         mean = np.mean(data, axis=0)
         std = np.std(data, axis=0)
-        # TODO: construct function call string for copy-paste convenience
-        lament('mean:', mean)
-        lament('std: ', std)
+        mean_str = prettyize(mean)
+        std_str = prettyize(std)
+        lament('nod(...,\n {},\n {})'.format(mean_str, std_str))
         sys.exit(1)
     data -= _f(mean)
     data /= _f(std)
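
For reference, a minimal standalone sketch of what the new lament output is meant to look like; `nod` here is assumed to be whatever alias the printed string refers to (presumably normalize_data), and the sample numbers are made up:

    import numpy as np

    def prettyize(data):
        # same formatting idea as above: a copy-pasteable literal
        if isinstance(data, np.ndarray):
            return '[' + ', '.join('{:8.2e}'.format(n) for n in data) + ']'
        return '{:8.2e}'.format(data)

    mean = np.array([0.1234, -5.678])
    std = np.array([1.0, 2.5])
    print('nod(...,\n {},\n {})'.format(prettyize(mean), prettyize(std)))
    # nod(...,
    #  [1.23e-01, -5.68e+00],
    #  [1.00e+00, 2.50e+00])

This replaces the old 'mean:'/'std:' dump with something that can be pasted straight back into a call, which is what the removed TODO asked for.
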
@@ -410,12 +415,12 @@ def toy_data(train_samples, valid_samples, problem=2):
 def optim_from_config(config):
     if config.optim == 'adam':
-        assert not config.nesterov, "unimplemented"
         d1 = config.optim_decay1 if 'optim_decay1' in config else 9.5
         d2 = config.optim_decay2 if 'optim_decay2' in config else 999.5
         b1 = np.exp(-1/d1)
         b2 = np.exp(-1/d2)
-        optim = Adam(b1=b1, b1_t=b1, b2=b2, b2_t=b2)
+        o = Nadam if config.nesterov else Adam
+        optim = o(b1=b1, b2=b2)
     elif config.optim in ('rms', 'rmsprop'):
         d2 = config.optim_decay2 if 'optim_decay2' in config else 99.5
         mu = np.exp(-1/d2)
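
The exp(-1/d) mapping lets the decay terms be specified on an epoch-like timescale; note that the fallback values 9.5 and 999.5 land almost exactly on Adam's customary b1 = 0.9 and b2 = 0.999, while the config's 2 and 100 give much shorter memories. A quick standalone check (values computed independently of the repo):

    import numpy as np

    for d in (9.5, 999.5, 2.0, 100.0):
        print('d = {:6.1f} -> beta = exp(-1/d) = {:.5f}'.format(d, np.exp(-1 / d)))
    # d =    9.5 -> beta = exp(-1/d) = 0.90009
    # d =  999.5 -> beta = exp(-1/d) = 0.99900
    # d =    2.0 -> beta = exp(-1/d) = 0.60653
    # d =  100.0 -> beta = exp(-1/d) = 0.99005
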
@@ -550,14 +555,15 @@ def run(program, args=None):
         optim = 'adam',
         optim_decay1 = 2, # given in epochs (optional)
         optim_decay2 = 100, # given in epochs (optional)
-        momentum = 0.50, # only used with SGD
-        nesterov = False, # only used with SGD or Adam
+        momentum = 0.90, # only used with SGD
+        nesterov = True, # only used with SGD or Adam
         batch_size = 64,

         # learning parameters
         learner = 'sgdr',
         learn = 1e-2,
         epochs = 24,
+        learn_halve_every = 16, # only used with anneal/dumb
         restarts = 2,
         restart_decay = 0.25, # only used with SGDR
         expando = lambda i: 24 * i,
@@ -569,8 +575,9 @@ def run(program, args=None):
         ritual = 'default',
         restart_optim = False, # restarts also reset internal state of optimizer
         warmup = True,
-        problem = 2,
+        log10_loss = True, # personally, i'm sick of looking at linear loss values!
+        problem = 3,

         compare = (
             # best results for ~10,000 parameters
             # training/validation pairs for each problem (starting from problem 0):
@@ -592,7 +599,6 @@ def run(program, args=None):
     config.pprint()

     # Toy Data {{{2
-    # (our model is probably complete overkill for this, so TODO: better data)
     (inputs, outputs), (valid_inputs, valid_outputs) = \
         toy_data(2**14, 2**11, problem=config.problem)
@@ -624,8 +630,11 @@ def run(program, args=None):
         predicted = model.forward(inputs)
         err = ritual.measure(predicted, outputs)
         log(name + " loss", "{:12.6e}".format(err))
-        if comparison:
-            log("improvement", "10**({:+7.4f}) times".format(np.log10(comparison / err)))
+        if config.log10_loss:
+            log(name + " log10-loss", "{:+6.3f}".format(np.log10(err)))
+        elif comparison:
+            fmt = "10**({:+7.4f}) times"
+            log("improvement", fmt.format(np.log10(comparison / err)))
         return err

     train_err = print_error("train",
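
For a sense of the two output formats, a tiny standalone example (the error values below are invented, not from a run):

    import numpy as np

    err, comparison = 1.234e-05, 3.7e-05
    print("{:+6.3f}".format(np.log10(err)))                           # -4.909
    print("10**({:+7.4f}) times".format(np.log10(comparison / err)))  # 10**(+0.4769) times
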
@@ -645,12 +654,20 @@ def run(program, args=None):
     if training and config.warmup:
         log("warming", "up")

-        ritual.train_batched(
-            np.random.normal(size=inputs.shape),
-            np.random.normal(size=outputs.shape),
-            config.batch_size)
-        ritual.reset()
+        # use plain SGD in warmup to prevent (or possibly cause?) numeric issues
+        temp_optim = learner.optim
+        learner.optim = Optimizer(alpha=0.01)
+
+        for _ in range(2):
+            ritual.train_batched(
+                np.random.normal(size=inputs.shape),
+                np.random.normal(size=outputs.shape),
+                config.batch_size)
+            ritual.reset()
+
+        learner.optim = temp_optim

     if training:
         measure_error()
@@ -668,6 +685,10 @@ def run(program, args=None):
             #log("learning rate", "{:10.8f}".format(learner.rate))
             #log("average loss", "{:11.7f}".format(avg_loss))
-            fmt = "epoch {:4.0f}, rate {:10.8f}, loss {:12.6e}"
-            log("info", fmt.format(learner.epoch + 1, learner.rate, avg_loss))
+            if config.log10_loss:
+                fmt = "epoch {:4.0f}, rate {:10.8f}, log10-loss {:+6.3f}"
+                log("info", fmt.format(learner.epoch + 1, learner.rate, np.log10(avg_loss)))
+            else:
+                fmt = "epoch {:4.0f}, rate {:10.8f}, loss {:12.6e}"
+                log("info", fmt.format(learner.epoch + 1, learner.rate, avg_loss))

View file

@@ -97,25 +97,24 @@ class Optimizer:
 # https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/optimizers/optimizer.h
 class Momentum(Optimizer):
-    def __init__(self, alpha=0.01, lamb=0, mu=0.9, nesterov=False):
-        self.lamb = _f(lamb) # weight decay
+    def __init__(self, alpha=0.01, mu=0.9, nesterov=False):
         self.mu = _f(mu) # momentum
         self.nesterov = bool(nesterov)
         super().__init__(alpha)

     def reset(self):
-        self.dWprev = None
+        self.Vprev = None

     def compute(self, dW, W):
-        if self.dWprev is None:
-            #self.dWprev = np.zeros_like(dW)
-            self.dWprev = np.copy(dW)
+        if self.Vprev is None:
+            self.Vprev = np.copy(dW)

-        V = self.mu * self.dWprev - self.alpha * (dW + W * self.lamb)
-        self.dWprev[:] = V
-        if self.nesterov: # TODO: is this correct? looks weird
-            return self.mu * V - self.alpha * (dW + W * self.lamb)
+        V = self.mu * self.Vprev - self.alpha * dW
+        self.Vprev[:] = V
+        if self.nesterov:
+            return self.mu * V - self.alpha * dW
         return V

 class RMSprop(Optimizer):
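
The removed TODO questioned the Nesterov branch; the cleaned-up version matches one common "lookahead" reformulation of Nesterov momentum (velocity updated as in classic momentum, but the applied step is mu*V - alpha*dW). A minimal standalone sketch of that rule on f(w) = 0.5*w**2, with illustrative names and constants only:

    def momentum_step(w, v, mu=0.9, alpha=0.1, nesterov=False):
        g = w                                         # gradient of 0.5 * w**2
        v = mu * v - alpha * g                        # classic momentum velocity
        dw = mu * v - alpha * g if nesterov else v    # Nesterov lookahead form
        return w + dw, v

    w, v = 1.0, 0.0
    for _ in range(200):
        w, v = momentum_step(w, v, nesterov=True)
    print(w)  # a very small value: the iterate has spiraled into the minimum at 0
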
class RMSprop(Optimizer): class RMSprop(Optimizer):
@ -154,6 +153,7 @@ class RMSprop(Optimizer):
return -self.alpha * dW / np.sqrt(self.g + self.eps) return -self.alpha * dW / np.sqrt(self.g + self.eps)
class Adam(Optimizer): class Adam(Optimizer):
# paper: https://arxiv.org/abs/1412.6980
# Adam generalizes* RMSprop, and # Adam generalizes* RMSprop, and
# adds a decay term to the regular (non-squared) delta, and # adds a decay term to the regular (non-squared) delta, and
# does some decay-gain voodoo. (i guess it's compensating # does some decay-gain voodoo. (i guess it's compensating
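
The "decay-gain voodoo" the comment alludes to is zero-initialization bias correction: the running moments start at zero, so for small t they underestimate the gradient statistics by a factor of (1 - b**t), and dividing by that factor undoes it. A one-line check of the mechanism (not repo code):

    b1, g = 0.9, 2.5
    m1 = (1 - b1) * g                                  # first moment after one step, from m0 = 0
    print(round(m1, 6), round(m1 / (1 - b1**1), 6))    # 0.25 2.5 -- the correction recovers g exactly
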
@@ -165,11 +165,11 @@ class Adam(Optimizer):
     # Adam.b1_t == 0
     # Adam.b2_t == 0

-    def __init__(self, alpha=0.001, b1=0.9, b2=0.999, b1_t=0.9, b2_t=0.999, eps=1e-8):
+    def __init__(self, alpha=0.002, b1=0.9, b2=0.999, eps=1e-8):
         self.b1 = _f(b1) # decay term
         self.b2 = _f(b2) # decay term
-        self.b1_t_default = _f(b1_t) # decay term power t
-        self.b2_t_default = _f(b2_t) # decay term power t
+        self.b1_t_default = _f(b1) # decay term power t
+        self.b2_t_default = _f(b2) # decay term power t
         self.eps = _f(eps)
         super().__init__(alpha)
@@ -197,6 +197,53 @@ class Adam(Optimizer):
         return -self.alpha * (self.mt / (1 - self.b1_t)) \
             / np.sqrt((self.vt / (1 - self.b2_t)) + self.eps)

+class Nadam(Optimizer):
+    # paper: https://arxiv.org/abs/1412.6980
+    # paper: http://cs229.stanford.edu/proj2015/054_report.pdf
+    # TODO: double-check this implementation. also actually read the damn paper.
+    # lifted from https://github.com/fchollet/keras/blob/5d38b04/keras/optimizers.py#L530
+    # lifted from https://github.com/jpilaul/IFT6266_project/blob/master/Models/Algo_Momentum.py
+
+    def __init__(self, alpha=0.002, b1=0.9, b2=0.999, eps=1e-8):
+        self.b1 = _f(b1) # decay term
+        self.b2 = _f(b2) # decay term
+        self.eps = _f(eps)
+
+        super().__init__(alpha)
+
+    def reset(self):
+        self.mt = None
+        self.vt = None
+        self.t = 0
+        self.sched = 1
+
+    def compute(self, dW, W):
+        self.t += 1
+
+        if self.mt is None:
+            self.mt = np.zeros_like(dW)
+        if self.vt is None:
+            self.vt = np.zeros_like(dW)
+
+        ut0 = self.b1 * (1 - 0.5 * 0.96**(self.t + 0))
+        ut1 = self.b1 * (1 - 0.5 * 0.96**(self.t + 1))
+
+        sched0 = self.sched * ut0
+        sched1 = self.sched * ut0 * ut1
+        self.sched = sched0
+
+        gp = dW / (1 - sched0)
+
+        self.mt[:] = self.b1 * self.mt + (1 - self.b1) * dW
+        self.vt[:] = self.b2 * self.vt + (1 - self.b2) * np.square(dW)
+
+        mtp = self.mt / (1 - sched1)
+        vtp = self.vt / (1 - self.b2**self.t)
+
+        mt_bar = (1 - ut0) * gp + ut1 * mtp
+
+        return -self.alpha * mt_bar / (np.sqrt(vtp) + self.eps)
+
 # Abstract Layers {{{1

 class Layer:
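
Given the TODO above, here is a standalone replay of the same arithmetic on f(w) = 0.5*w**2 (so the gradient is w) as a quick sanity check that the update behaves like an optimizer; the loop and constants are illustrative, not part of the commit:

    import numpy as np

    alpha, b1, b2, eps = 0.002, 0.9, 0.999, 1e-8
    w, mt, vt, sched = 1.0, 0.0, 0.0, 1.0

    for t in range(1, 1001):
        dW = w                                   # gradient of 0.5 * w**2 at the current point
        ut0 = b1 * (1 - 0.5 * 0.96**(t + 0))
        ut1 = b1 * (1 - 0.5 * 0.96**(t + 1))
        sched0 = sched * ut0
        sched1 = sched * ut0 * ut1
        sched = sched0
        gp = dW / (1 - sched0)
        mt = b1 * mt + (1 - b1) * dW
        vt = b2 * vt + (1 - b2) * dW**2
        mtp = mt / (1 - sched1)
        vtp = vt / (1 - b2**t)
        mt_bar = (1 - ut0) * gp + ut1 * mtp
        w += -alpha * mt_bar / (np.sqrt(vtp) + eps)

    print(w)  # far closer to the minimum at 0 than the starting value of 1.0
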
@@ -273,6 +320,12 @@ class Layer:
     def validate_output(self, Y):
         assert Y.shape[1:] == self.output_shape, (str(self), Y.shape[1:], self.output_shape)

+    def init(self, W, dW):
+        assert W.ndim == 1 and W.shape[0] == self.size, W.shape
+        assert dW.ndim == 1 and dW.shape[0] == self.size, dW.shape
+        self.W = W
+        self.dW = dW
+
     def forward(self, lut):
         if not self.unsafe:
             assert self.parents, self
@@ -430,10 +483,10 @@ class Dense(Layer):
         return shape

     def init(self, W, dW):
+        super().init(W, dW)
+
         ins, outs = self.input_shape[0], self.output_shape[0]
-        self.W = W
-        self.dW = dW
         self.coeffs = self.W[:self.nW].reshape(ins, outs)
         self.biases = self.W[self.nW:].reshape(1, outs)
         self.dcoeffs = self.dW[:self.nW].reshape(ins, outs)
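
The same bookkeeping move shows up here as in LayerNorm: the flat-parameter checks and storage now live in Layer.init, and subclasses call super().init(W, dW) before carving out their own views. A cut-down illustration of the pattern (class names here are hypothetical, not from the repo):

    import numpy as np

    class TinyLayer:
        def __init__(self, size):
            self.size = size

        def init(self, W, dW):
            # the base class owns the shape checks and the flat views
            assert W.ndim == 1 and W.shape[0] == self.size, W.shape
            assert dW.ndim == 1 and dW.shape[0] == self.size, dW.shape
            self.W, self.dW = W, dW

    class TinyDense(TinyLayer):
        def __init__(self, ins, outs):
            super().__init__(ins * outs + outs)
            self.ins, self.outs, self.nW = ins, outs, ins * outs

        def init(self, W, dW):
            super().init(W, dW)
            self.coeffs = self.W[:self.nW].reshape(self.ins, self.outs)
            self.biases = self.W[self.nW:].reshape(1, self.outs)

    layer = TinyDense(3, 2)
    W, dW = np.zeros(layer.size), np.zeros(layer.size)
    layer.init(W, dW)    # coeffs/biases are views into the shared flat buffer
    layer.coeffs[:] = 1.0
    print(W[:6])         # [1. 1. 1. 1. 1. 1.] -- writes go through the view
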
@@ -507,12 +560,10 @@ class Model:
     def backward(self, error):
         lut = dict()
-        input_node = self.ordered_nodes[0]
         output_node = self.ordered_nodes[-1]
         lut[output_node] = output_node.dmulti(np.expand_dims(error, 0))
         for node in reversed(self.ordered_nodes[:-1]):
             lut[node] = node.backward(lut)
-        #return lut[input_node] # meaningless value
         return self.dW

     def load_weights(self, fn):