# Extracted from patch db65fbdd62a93e2cb55c63718793b337457bf696
# ("add Neumann optimizer", Connor Olding, Fri, 12 Jan 2018).
# In onn.py this class sits after the PowerSign optimizer and before the
# "Nonparametric Layers" section (AlphaDropout).

class Neumann(Optimizer):
    # paper: https://arxiv.org/abs/1712.03298
    # NOTE: this implementation is missing resetting as described in the paper.
    #       resetting is totally disabled for now (reset_period is 0).
    # NOTE: this implementation does not use vanilla SGD for its first epochs.
    #       you should do this yourself if you need it.
    #       it seems like using a Learner like SineCLR makes this unnecessary.
    #
    # unlike optimizers that expose compute(), this one only supports
    # update(), which modifies the weights in-place.

    def __init__(self, lr=0.01):
        """Set up the fixed hyperparameters; lr is forwarded to Optimizer."""
        self.alpha = _f(1e-7)  # cubic regularization strength.
        self.beta = _f(1e-5)  # repulsive. NOTE: multiplied by dW.size later.
        self.gamma = _f(0.99)  # EMA, or 1-pole low-pass parameter. same thing.
        # momentum is ∝ (in the shape of) 1 - 1/(1 + t),
        # rescaled to rise from mu_min towards mu_max.
        self.mu_min = _f(0.5)
        self.mu_max = _f(0.9)
        self.reset_period = 0  # TODO

        # presumably the base constructor stores lr and calls reset();
        # update() relies on self.lr, self.mt, self.vt and self.t existing.
        super().__init__(lr)

    def reset(self):
        """Forget all accumulated state; buffers are reallocated lazily."""
        # NOTE: mt and vt are different than the pair in Adam-like optimizers.
        self.mt = None  # momentum accumulator.
        self.vt = None  # weight accumulator (moving average of W).
        self.t = 0  # number of update() calls so far.

    def compute(self, dW, W):
        """Unsupported: the step depends on, and mutates, W directly.

        Raises:
            NotImplementedError: always. (a subclass of Exception, so
                existing handlers keep working.)
        """
        raise NotImplementedError(
            "compute() is not available for this Optimizer.")

    def update(self, dW, W):
        """Apply one Neumann step to W in-place.

        dW is the gradient with respect to W; W is modified in-place.
        nothing is returned.
        """
        self.t += 1

        if self.mt is None:
            self.mt = np.zeros_like(dW)
        if self.vt is None:
            self.vt = np.zeros_like(dW)

        # disabled while reset_period is 0. when enabled, a reset step only
        # re-seeds the momentum and returns without touching W or vt.
        if self.reset_period > 0 and (self.t - 1) % self.reset_period == 0:
            self.mt = -self.lr * dW
            return

        # momentum schedule: 0 at t == 1, approaching 1 as t grows...
        mu = _1 - _1/_f(self.t)  # the + 1 is implicit.
        # ...then mapped linearly onto [mu_min, mu_max).
        mu = self.mu_min + mu * (self.mu_max - self.mu_min)

        # offset of the weights from their moving average.
        delta = W - self.vt
        delta_norm_squared = np.square(delta).sum()

        dt = dW
        # when W coincides with vt (e.g. zero-initialized parameters on the
        # first step) the regularizers have no direction to act along and
        # would otherwise divide by zero, permanently filling W with NaNs.
        # fall back to the plain gradient in that case.
        if delta_norm_squared > 0:
            delta_norm = np.sqrt(delta_norm_squared)

            alpha = self.alpha
            beta = self.beta * dW.size

            # cubic term pulls W back towards vt when they drift far apart;
            # repulsive term pushes W away from vt when they are very close.
            cubic_reg = alpha * delta_norm_squared
            repulsive_reg = beta / delta_norm_squared
            dt = dW + (cubic_reg - repulsive_reg) * (delta / delta_norm)

        self.mt = mu * self.mt - self.lr * dt

        W += mu * self.mt - self.lr * dt
        # exponential moving average of the (already updated) weights.
        self.vt = W + self.gamma * (self.vt - W)