From 42a66d4d6cb538c4f698c2877e619a04ec56fdb2 Mon Sep 17 00:00:00 2001
From: Connor Olding
Date: Fri, 17 Feb 2017 22:53:44 -0800
Subject: [PATCH] .

---
 optim_nn.py      | 65 +++++++++++++++++++++++-------------
 optim_nn_core.py | 85 ++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 111 insertions(+), 39 deletions(-)

diff --git a/optim_nn.py b/optim_nn.py
index 655c478..0c87e1f 100644
--- a/optim_nn.py
+++ b/optim_nn.py
@@ -68,9 +68,7 @@ class LayerNorm(Layer):
         return shape
 
     def init(self, W, dW):
-        # TODO: move this little bit into super(), also assert against self.size
-        self.W = W
-        self.dW = dW
+        super().init(W, dW)
 
         f = self.features
 
@@ -95,7 +93,6 @@ class LayerNorm(Layer):
         length = dY.shape[0]
 
         if self.affine:
-            # Y = gamma * Xnorm + beta
             dXnorm = dY * self.gamma
             self.dgamma[:] = (dY * self.Xnorm).sum(0)
             self.dbeta[:] = dY.sum(0)
@@ -332,14 +329,22 @@ def multiresnet(x, width, depth, block=2, multi=1,
 inits = dict(he_normal=init_he_normal, he_uniform=init_he_uniform)
 activations = dict(sigmoid=Sigmoid, tanh=Tanh, relu=Relu, elu=Elu, gelu=GeluApprox)
 
+def prettyize(data):
+    if isinstance(data, np.ndarray):
+        s = ', '.join(('{:8.2e}'.format(n) for n in data))
+        s = '[' + s + ']'
+    else:
+        s = '{:8.2e}'.format(data)
+    return s
+
 def normalize_data(data, mean=None, std=None):
     # in-place
     if mean is None or std is None:
         mean = np.mean(data, axis=0)
         std = np.std(data, axis=0)
-        # TODO: construct function call string for copy-paste convenience
-        lament('mean:', mean)
-        lament('std: ', std)
+        mean_str = prettyize(mean)
+        std_str = prettyize(std)
+        lament('nod(...,\n {},\n {})'.format(mean_str, std_str))
         sys.exit(1)
     data -= _f(mean)
     data /= _f(std)
@@ -410,12 +415,12 @@ def toy_data(train_samples, valid_samples, problem=2):
 
 def optim_from_config(config):
     if config.optim == 'adam':
-        assert not config.nesterov, "unimplemented"
         d1 = config.optim_decay1 if 'optim_decay1' in config else 9.5
         d2 = config.optim_decay2 if 'optim_decay2' in config else 999.5
         b1 = np.exp(-1/d1)
         b2 = np.exp(-1/d2)
-        optim = Adam(b1=b1, b1_t=b1, b2=b2, b2_t=b2)
+        o = Nadam if config.nesterov else Adam
+        optim = o(b1=b1, b2=b2)
     elif config.optim in ('rms', 'rmsprop'):
         d2 = config.optim_decay2 if 'optim_decay2' in config else 99.5
         mu = np.exp(-1/d2)
@@ -550,14 +555,15 @@
         optim = 'adam',
         optim_decay1 = 2, # given in epochs (optional)
         optim_decay2 = 100, # given in epochs (optional)
-        momentum = 0.50, # only used with SGD
-        nesterov = False, # only used with SGD or Adam
+        momentum = 0.90, # only used with SGD
+        nesterov = True, # only used with SGD or Adam
         batch_size = 64,
 
         # learning parameters
         learner = 'sgdr',
         learn = 1e-2,
         epochs = 24,
+        learn_halve_every = 16, # only used with anneal/dumb
         restarts = 2,
         restart_decay = 0.25, # only used with SGDR
         expando = lambda i: 24 * i,
@@ -569,8 +575,9 @@
         ritual = 'default',
         restart_optim = False, # restarts also reset internal state of optimizer
         warmup = True,
+        log10_loss = True, # personally, i'm sick of looking at linear loss values!
 
-        problem = 2,
+        problem = 3,
         compare = (
             # best results for ~10,000 parameters
             # training/validation pairs for each problem (starting from problem 0):
@@ -592,7 +599,6 @@ def run(program, args=None):
     config.pprint()
 
     # Toy Data {{{2
-    # (our model is probably complete overkill for this, so TODO: better data)
     (inputs, outputs), (valid_inputs, valid_outputs) = \
       toy_data(2**14, 2**11, problem=config.problem)
 
@@ -624,8 +630,11 @@
         predicted = model.forward(inputs)
         err = ritual.measure(predicted, outputs)
         log(name + " loss", "{:12.6e}".format(err))
-        if comparison:
-            log("improvement", "10**({:+7.4f}) times".format(np.log10(comparison / err)))
+        if config.log10_loss:
+            log(name + " log10-loss", "{:+6.3f}".format(np.log10(err)))
+        elif comparison:
+            fmt = "10**({:+7.4f}) times"
+            log("improvement", fmt.format(np.log10(comparison / err)))
         return err
 
     train_err = print_error("train",
@@ -645,11 +654,19 @@
     if training and config.warmup:
         log("warming", "up")
-        ritual.train_batched(
-            np.random.normal(size=inputs.shape),
-            np.random.normal(size=outputs.shape),
-            config.batch_size)
-        ritual.reset()
+
+        # use plain SGD in warmup to prevent (or possibly cause?) numeric issues
+        temp_optim = learner.optim
+        learner.optim = Optimizer(alpha=0.01)
+
+        for _ in range(2):
+            ritual.train_batched(
+                np.random.normal(size=inputs.shape),
+                np.random.normal(size=outputs.shape),
+                config.batch_size)
+            ritual.reset()
+
+        learner.optim = temp_optim
 
     if training:
         measure_error()
 
@@ -668,8 +685,12 @@
 
         #log("learning rate", "{:10.8f}".format(learner.rate))
         #log("average loss", "{:11.7f}".format(avg_loss))
-        fmt = "epoch {:4.0f}, rate {:10.8f}, loss {:12.6e}"
-        log("info", fmt.format(learner.epoch + 1, learner.rate, avg_loss))
+        if config.log10_loss:
+            fmt = "epoch {:4.0f}, rate {:10.8f}, log10-loss {:+6.3f}"
+            log("info", fmt.format(learner.epoch + 1, learner.rate, np.log10(avg_loss)))
+        else:
+            fmt = "epoch {:4.0f}, rate {:10.8f}, loss {:12.6e}"
+            log("info", fmt.format(learner.epoch + 1, learner.rate, avg_loss))
 
     measure_error()
 
diff --git a/optim_nn_core.py b/optim_nn_core.py
index f314db5..176959c 100644
--- a/optim_nn_core.py
+++ b/optim_nn_core.py
@@ -97,25 +97,24 @@ class Optimizer:
 # https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/optimizers/optimizer.h
 
 class Momentum(Optimizer):
-    def __init__(self, alpha=0.01, lamb=0, mu=0.9, nesterov=False):
-        self.lamb = _f(lamb) # weight decay
+    def __init__(self, alpha=0.01, mu=0.9, nesterov=False):
         self.mu = _f(mu) # momentum
         self.nesterov = bool(nesterov)
 
         super().__init__(alpha)
 
     def reset(self):
-        self.dWprev = None
+        self.Vprev = None
 
     def compute(self, dW, W):
-        if self.dWprev is None:
-            #self.dWprev = np.zeros_like(dW)
-            self.dWprev = np.copy(dW)
+        if self.Vprev is None:
+            self.Vprev = np.copy(dW)
+
+        V = self.mu * self.Vprev - self.alpha * dW
+        self.Vprev[:] = V
+        if self.nesterov:
+            return self.mu * V - self.alpha * dW
 
-        V = self.mu * self.dWprev - self.alpha * (dW + W * self.lamb)
-        self.dWprev[:] = V
-        if self.nesterov: # TODO: is this correct? looks weird
-            return self.mu * V - self.alpha * (dW + W * self.lamb)
         return V
 
 class RMSprop(Optimizer):
@@ -154,6 +153,7 @@
         return -self.alpha * dW / np.sqrt(self.g + self.eps)
 
 class Adam(Optimizer):
+    # paper: https://arxiv.org/abs/1412.6980
     # Adam generalizes* RMSprop, and
     # adds a decay term to the regular (non-squared) delta, and
     # does some decay-gain voodoo. (i guess it's compensating
@@ -165,11 +165,11 @@ class Adam(Optimizer):
     # Adam.b1_t == 0
     # Adam.b2_t == 0
 
-    def __init__(self, alpha=0.001, b1=0.9, b2=0.999, b1_t=0.9, b2_t=0.999, eps=1e-8):
+    def __init__(self, alpha=0.002, b1=0.9, b2=0.999, eps=1e-8):
         self.b1 = _f(b1) # decay term
         self.b2 = _f(b2) # decay term
-        self.b1_t_default = _f(b1_t) # decay term power t
-        self.b2_t_default = _f(b2_t) # decay term power t
+        self.b1_t_default = _f(b1) # decay term power t
+        self.b2_t_default = _f(b2) # decay term power t
         self.eps = _f(eps)
 
         super().__init__(alpha)
@@ -197,6 +197,53 @@
         return -self.alpha * (self.mt / (1 - self.b1_t)) \
                / np.sqrt((self.vt / (1 - self.b2_t)) + self.eps)
 
+class Nadam(Optimizer):
+    # paper: https://arxiv.org/abs/1412.6980
+    # paper: http://cs229.stanford.edu/proj2015/054_report.pdf
+    # TODO: double-check this implementation. also actually read the damn paper.
+    # lifted from https://github.com/fchollet/keras/blob/5d38b04/keras/optimizers.py#L530
+    # lifted from https://github.com/jpilaul/IFT6266_project/blob/master/Models/Algo_Momentum.py
+
+    def __init__(self, alpha=0.002, b1=0.9, b2=0.999, eps=1e-8):
+        self.b1 = _f(b1) # decay term
+        self.b2 = _f(b2) # decay term
+        self.eps = _f(eps)
+
+        super().__init__(alpha)
+
+    def reset(self):
+        self.mt = None
+        self.vt = None
+        self.t = 0
+        self.sched = 1
+
+    def compute(self, dW, W):
+        self.t += 1
+
+        if self.mt is None:
+            self.mt = np.zeros_like(dW)
+        if self.vt is None:
+            self.vt = np.zeros_like(dW)
+
+        ut0 = self.b1 * (1 - 0.5 * 0.96**(self.t + 0))
+        ut1 = self.b1 * (1 - 0.5 * 0.96**(self.t + 1))
+
+        sched0 = self.sched * ut0
+        sched1 = self.sched * ut0 * ut1
+        self.sched = sched0
+
+        gp = dW / (1 - sched0)
+
+        self.mt[:] = self.b1 * self.mt + (1 - self.b1) * dW
+        self.vt[:] = self.b2 * self.vt + (1 - self.b2) * np.square(dW)
+
+        mtp = self.mt / (1 - sched1)
+        vtp = self.vt / (1 - self.b2**self.t)
+
+        mt_bar = (1 - ut0) * gp + ut1 * mtp
+
+        return -self.alpha * mt_bar / (np.sqrt(vtp) + self.eps)
+
 # Abstract Layers {{{1
 
 class Layer:
@@ -273,6 +320,12 @@
     def validate_output(self, Y):
         assert Y.shape[1:] == self.output_shape, (str(self), Y.shape[1:], self.output_shape)
 
+    def init(self, W, dW):
+        assert W.ndim == 1 and W.shape[0] == self.size, W.shape
+        assert dW.ndim == 1 and dW.shape[0] == self.size, dW.shape
+        self.W = W
+        self.dW = dW
+
     def forward(self, lut):
         if not self.unsafe:
             assert self.parents, self
@@ -430,10 +483,10 @@
         return shape
 
     def init(self, W, dW):
+        super().init(W, dW)
+
         ins, outs = self.input_shape[0], self.output_shape[0]
 
-        self.W = W
-        self.dW = dW
         self.coeffs = self.W[:self.nW].reshape(ins, outs)
         self.biases = self.W[self.nW:].reshape(1, outs)
         self.dcoeffs = self.dW[:self.nW].reshape(ins, outs)
@@ -507,12 +560,10 @@ class Model:
 
     def backward(self, error):
         lut = dict()
-        input_node = self.ordered_nodes[0]
         output_node = self.ordered_nodes[-1]
         lut[output_node] = output_node.dmulti(np.expand_dims(error, 0))
         for node in reversed(self.ordered_nodes[:-1]):
             lut[node] = node.backward(lut)
-        #return lut[input_node] # meaningless value
         return self.dW
 
     def load_weights(self, fn):
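A quick way to sanity-check the Nadam addition (still flagged TODO above) is to distill Nadam.compute() into a standalone function and run it on a toy quadratic. The sketch below is illustrative and not part of the patch: it assumes only NumPy, and the helper name nadam_step plus the toy objective are invented for this check.

    import numpy as np

    def nadam_step(dW, mt, vt, t, sched, alpha=0.002, b1=0.9, b2=0.999, eps=1e-8):
        # one Nadam update on a flat gradient vector, mirroring Nadam.compute() above;
        # returns (delta, mt, vt, t, sched) so the caller can carry the state forward
        t += 1
        ut0 = b1 * (1 - 0.5 * 0.96**(t + 0))   # momentum schedule mu_t
        ut1 = b1 * (1 - 0.5 * 0.96**(t + 1))   # momentum schedule mu_{t+1}
        sched0 = sched * ut0
        sched1 = sched * ut0 * ut1
        sched = sched0

        gp = dW / (1 - sched0)                       # bias-corrected gradient
        mt = b1 * mt + (1 - b1) * dW                 # first moment estimate
        vt = b2 * vt + (1 - b2) * np.square(dW)      # second moment estimate
        mtp = mt / (1 - sched1)
        vtp = vt / (1 - b2**t)
        mt_bar = (1 - ut0) * gp + ut1 * mtp          # Nesterov-style interpolation

        return -alpha * mt_bar / (np.sqrt(vtp) + eps), mt, vt, t, sched

    # toy check: minimize f(w) = 0.5 * ||w||^2, whose gradient is simply w
    w = np.array([1.0, -2.0, 3.0])
    mt, vt, t, sched = np.zeros_like(w), np.zeros_like(w), 0, 1.0
    for _ in range(500):
        delta, mt, vt, t, sched = nadam_step(w, mt, vt, t, sched)
        w += delta
    print(np.linalg.norm(w))  # should be noticeably smaller than the starting norm of ~3.74

The function carries exactly the state that Nadam.reset() initializes (mt, vt, t, sched) and repeats the schedule and bias-correction arithmetic of compute(), so a shrinking norm here is a cheap regression check against the Keras-style references cited in the class comments.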