# optim/onn/optimizer.py (644 lines, 18 KiB, Python)

import numpy as np
from .float import _f, _0, _1
from .optimizer_base import *
from .utility import *
def filter_gradients(accum, grads, param):
    # blends grads into the running buffer accum and returns accum.
    # NOTE: accum is modified in-place; the returned object is accum itself.
    # the meaning of param:
    #   param == 0:  accum becomes a copy of grads.
    #   param == 1:  accum is left untouched.
    #   param == -1: grads are summed into accum without any decay.
    #   param < 0:   accum is scaled by -param and then grads are added,
    #                i.e. an accumulator with decay; nonunity gain at DC.
    #   otherwise:   a simple one-pole low-pass filter; unity gain at DC.

    if param == 0:
        accum[:] = grads
        return accum

    if param == 1:
        return accum

    if param < 0:
        # skip the multiply entirely when there is no decay to apply.
        if param != -1:
            accum *= -param
        accum += grads
        return accum

    blend = 1 - param
    accum += blend * (grads - accum)
    return accum
class Momentum(Optimizer):
    # gradient descent with a momentum (velocity) buffer.
    # mu scales the previous velocity; nesterov selects the look-ahead form.

    def __init__(self, lr=0.01, mu=0.9, nesterov=False):
        # hyperparameters are assigned before the base constructor runs.
        # NOTE(review): presumably Optimizer.__init__ calls reset(); confirm.
        self.mu = _f(mu)
        self.nesterov = bool(nesterov)

        super().__init__(lr)

    def reset(self):
        # the velocity buffer is lazily allocated on the first compute().
        self.accum = None

        super().reset()

    def compute(self, dW, W):
        # returns the step for gradient dW. W is not used here.
        if self.accum is None:
            self.accum = np.zeros_like(dW)

        velocity = self.accum
        velocity[:] = velocity * self.mu + dW

        if not self.nesterov:
            return -self.lr * velocity

        # look-ahead: apply the momentum once more on top of the gradient.
        return -self.lr * (velocity * self.mu + dW)
class Adadelta(Optimizer):
    # paper: https://arxiv.org/abs/1212.5701
    # keeps two running averages: squared gradients (g)
    # and squared updates (x).

    def __init__(self, lr=1.0, mu=0.95, eps=1e-8):
        self.mu = _f(mu)  # decay of both running averages.
        self.eps = _f(eps)

        super().__init__(lr)

    def reset(self):
        # both buffers are lazily allocated on the first compute().
        self.g = None
        self.x = None

        super().reset()

    def compute(self, dW, W):
        # returns the step for gradient dW. W is not used here.
        if self.g is None:
            self.g = np.zeros_like(dW)
        if self.x is None:
            self.x = np.zeros_like(dW)

        # (mu - 1) * (avg - new) is the same as (1 - mu) * (new - avg).
        neg_rate = self.mu - 1

        self.g += neg_rate * (self.g - np.square(dW))
        delta = dW * np.sqrt(self.x + self.eps) / (np.sqrt(self.g) + self.eps)
        self.x += neg_rate * (self.x - np.square(delta))

        return -self.lr * delta
class RMSpropCentered(Optimizer):
    # referenced TensorFlow/PyTorch.
    # paper: https://arxiv.org/abs/1308.0850
    # RMSprop that normalizes by an estimate of the gradient's variance
    # (mean of squares minus square of mean) and applies momentum on top.

    def __init__(self, lr=1e-4, aleph=0.95, momentum=0.9, eps=1e-8):
        self.aleph = _f(aleph)  # decay of the running mean and mean-square.
        self.momentum = _f(momentum)  # decay of the step buffer.
        self.eps = _f(eps)

        super().__init__(lr)

    def reset(self):
        # all buffers are lazily allocated on the first compute().
        self.g = None  # allocated but never read by compute().
        self.mt = None  # running mean of gradients.
        self.vt = None  # running mean of squared gradients.
        self.delta = None  # momentum-smoothed step.

        super().reset()

    def compute(self, dW, W):
        # returns the step for gradient dW. W is not used here.
        if self.g is None:
            self.g = np.zeros_like(dW)
        if self.mt is None:
            self.mt = np.zeros_like(dW)
        if self.vt is None:
            self.vt = np.zeros_like(dW)
        if self.delta is None:
            self.delta = np.zeros_like(dW)

        self.mt += (1 - self.aleph) * (dW - self.mt)
        self.vt += (1 - self.aleph) * (np.square(dW) - self.vt)

        # vt - mt^2 should never be negative in exact arithmetic, but
        # rounding can push it slightly below zero (e.g. when gradients
        # are nearly constant). clamp it so the sqrt can't produce NaN.
        variance = np.maximum(self.vt - np.square(self.mt), 0)

        # PyTorch has the epsilon outside of the sqrt,
        # TensorFlow and the paper have it within.
        # in onn, we generally do it outside, as this seems to work better.
        temp = dW / (np.sqrt(variance) + self.eps)

        # TensorFlow does it this way:
        self.delta[:] = self.momentum * self.delta + self.lr * temp
        return -self.delta
        # PyTorch does it this way:
        # self.delta[:] = self.momentum * self.delta + temp
        # return -self.lr * self.delta
        # they are equivalent only when LR is constant, which it might not be.
class Nadam(Optimizer):
    # paper: https://arxiv.org/abs/1412.6980
    # paper: http://cs229.stanford.edu/proj2015/054_report.pdf
    # TODO: double-check this implementation. also read the damn paper.
    # lifted from:
    # https://github.com/fchollet/keras/blob/5d38b04/keras/optimizers.py#L530
    # https://github.com/jpilaul/IFT6266_project/blob/master/Models/Algo_Momentum.py

    def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8):
        self.b1 = _f(b1)  # decay term
        self.b2 = _f(b2)  # decay term
        self.eps = _f(eps)

        super().__init__(lr)

    def reset(self):
        self.mt = None  # running mean of gradients.
        self.vt = None  # running mean of squared gradients.
        self.t = 0  # step counter.
        self.sched = 1  # running product of the momentum schedule.

        super().reset()

    def compute(self, dW, W):
        # returns the step for gradient dW. W is not used here.
        self.t += 1

        if self.mt is None:
            self.mt = np.zeros_like(dW)
        if self.vt is None:
            self.vt = np.zeros_like(dW)

        # scheduled momentum for this step and for the next one.
        mu_now = self.b1 * (1 - 0.5 * 0.96**self.t)
        mu_next = self.b1 * (1 - 0.5 * 0.96**(self.t + 1))

        # running products of the schedule, up to now and up to next.
        sched_now = self.sched * mu_now
        sched_next = self.sched * mu_now * mu_next
        self.sched = sched_now

        grad_hat = dW / (1 - sched_now)

        self.mt += (1 - self.b1) * (dW - self.mt)
        self.vt += (1 - self.b2) * (np.square(dW) - self.vt)

        m_hat = self.mt / (1 - sched_next)
        v_hat = self.vt / (1 - self.b2**self.t)

        # mix the corrected gradient with the corrected momentum.
        m_bar = (1 - mu_now) * grad_hat + mu_next * m_hat

        return -self.lr * m_bar / (np.sqrt(v_hat) + self.eps)
class FTML(Optimizer):
    # paper: http://www.cse.ust.hk/~szhengac/papers/icml17.pdf
    # author's implementation: https://github.com/szhengac/optim/commit/923555e

    def __init__(self, lr=0.0025, b1=0.6, b2=0.999, eps=1e-8):
        self.iterations = _0  # not read anywhere in this class.
        self.b1 = _f(b1)  # decay term
        self.b2 = _f(b2)  # decay term
        self.eps = _f(eps)

        super().__init__(lr)

    def reset(self):
        # the array buffers are lazily allocated on the first compute().
        self.dt1 = None  # the previous step's dt.
        self.dt = None
        self.vt = None
        self.zt = None
        self.b1_t = _1  # b1 to the power of t.
        self.b2_t = _1  # b2 to the power of t.

        super().reset()

    def compute(self, dW, W):
        # returns a step that already has W subtracted from it (see the end).
        for name in ("dt1", "dt", "vt", "zt"):
            if getattr(self, name) is None:
                setattr(self, name, np.zeros_like(dW))

        # NOTE: we could probably rewrite these equations to avoid this copy.
        self.dt1[:] = self.dt

        self.b1_t *= self.b1
        self.b2_t *= self.b2

        # hardly an elegant solution.
        # lr is used as a divisor below, so it is kept at least eps.
        lr = max(self.lr, self.eps)

        # same as Adam's vt.
        self.vt[:] = self.b2 * self.vt + (1 - self.b2) * dW * dW

        # you can factor "inner" out of Adam as well.
        inner = np.sqrt(self.vt / (1 - self.b2_t)) + self.eps
        self.dt[:] = (1 - self.b1_t) / lr * inner

        sigma_t = self.dt - self.b1 * self.dt1

        # Adam's mt minus the sigma term.
        self.zt[:] = self.b1 * self.zt + (1 - self.b1) * dW - sigma_t * W

        # subtract by weights to avoid having to override self.update.
        return -self.zt / self.dt - W
class MomentumClip(Optimizer):
    # momentum with gradient clipping: gradients whose L2 norm exceeds
    # clip are rescaled to roughly that norm before being accumulated.

    def __init__(self, lr=0.01, mu=0.9, nesterov=False, clip=1.0, debug=False):
        self.mu = _f(mu)
        self.clip = _f(clip)  # maximum allowed gradient norm.
        self.nesterov = bool(nesterov)
        self.debug = bool(debug)  # report whenever clipping happens.

        super().__init__(lr)

    def reset(self):
        # the velocity buffer is lazily allocated on the first compute().
        self.accum = None

        super().reset()

    def compute(self, dW, W):
        # returns the step for gradient dW. W is not used here.
        # NOTE: dW itself is left untouched; clipping works on a copy,
        #       so the caller's gradient buffer is never rescaled.
        if self.accum is None:
            self.accum = np.zeros_like(dW)

        total_norm = np.linalg.norm(dW)
        # the small constant avoids dividing by zero for all-zero gradients.
        clip_scale = self.clip / (total_norm + 1e-6)

        if clip_scale < 1:
            if self.debug:
                lament("clipping gradients; norm: {:10.5f}".format(total_norm))
            # keep the gradient's dtype, as the in-place multiply used to.
            dW = (dW * clip_scale).astype(dW.dtype, copy=False)

        self.accum[:] = self.accum * self.mu + dW

        if self.nesterov:
            return -self.lr * (self.accum * self.mu + dW)
        else:
            return -self.lr * self.accum
class AddSign(Optimizer):
    # paper: https://arxiv.org/abs/1709.07417
    # scales the gradient by alpha plus the agreement (+1, 0, or -1)
    # between the signs of the gradient and of its momentum buffer.

    def __init__(self, lr=0.01, mu=0.9, alpha=1):
        self.mu = _f(mu)  # decay of the momentum buffer.
        self.alpha = _f(alpha)

        super().__init__(lr)

    def reset(self):
        # the momentum buffer is lazily allocated on the first compute().
        self.accum = None

        super().reset()

    def compute(self, dW, W):
        # returns the step for gradient dW. W is not used here.
        if self.accum is None:
            self.accum = np.zeros_like(dW)

        self.accum[:] = self.accum * self.mu + dW

        agreement = np.sign(dW) * np.sign(self.accum)
        # agreement *= decay

        scale = self.alpha + agreement
        return -self.lr * dW * scale
class PowerSign(Optimizer):
    # paper: https://arxiv.org/abs/1709.07417
    # scales the gradient by alpha raised to the agreement (+1, 0, or -1)
    # between the signs of the gradient and of its momentum buffer.

    def __init__(self, lr=0.01, mu=0.9, alpha=np.e):
        self.mu = _f(mu)  # decay of the momentum buffer.
        self.alpha = _f(alpha)
        # when alpha is (close to) e, np.exp is used instead of np.power.
        self.use_exp = np.isclose(self.alpha, _f(np.e))

        super().__init__(lr)

    def reset(self):
        # the momentum buffer is lazily allocated on the first compute().
        self.accum = None

        super().reset()

    def compute(self, dW, W):
        # returns the step for gradient dW. W is not used here.
        if self.accum is None:
            self.accum = np.zeros_like(dW)

        self.accum[:] = self.accum * self.mu + dW

        agreement = np.sign(dW) * np.sign(self.accum)
        # agreement *= decay

        if not self.use_exp:
            return -self.lr * dW * np.power(self.alpha, agreement)
        return -self.lr * dW * np.exp(agreement)
class Neumann(Optimizer):
    # paper: https://arxiv.org/abs/1712.03298
    # NOTE: this implementation omits resetting as described in the paper.
    #       resetting is totally disabled for now.
    # NOTE: this implementation does not use vanilla SGD for its first epochs.
    #       you can do this yourself if you really want to.
    #       it seems to be enough to use a slow-starting Learner like SineCLR.
    # NOTE: unlike most optimizers in this file, this one updates W in-place
    #       through update(); compute() deliberately raises.

    def __init__(self, lr=0.01, delta=1.0,
                 alpha=1e-7, beta=1e-5, gamma=0.99, mu_min=0.5, mu_max=0.9):
        self.delta = _f(delta)  # delta-time.
        self.alpha = _f(alpha)  # cubic.
        self.beta = _f(beta)  # repulsive. NOTE: multiplied by len(dW) later.
        self.gamma = _f(gamma)  # EMA, or 1-pole low-pass parameter. same thing.
        # momentum is in the shape of 1 - 1/(1 + t)
        self.mu_min = _f(mu_min)
        self.mu_max = _f(mu_max)
        self.reset_period = 0  # TODO

        super().__init__(lr)

    def reset(self):
        # NOTE: mt and vt are different than the pair in Adam-like optimizers.
        self.mt = None  # momentum accumulator.
        self.vt = None  # weight accumulator.
        self.t = 0

        super().reset()

    def compute(self, dW, W):
        raise Exception("compute() is not available for this Optimizer.")

    def update(self, dW, W):
        # applies one step to W in-place, given its gradient dW.
        # returns nothing.
        if self.mt is None:
            self.mt = np.zeros_like(dW)
        if self.vt is None:
            self.vt = np.zeros_like(dW)

        # NOTE: this branch never runs while reset_period is 0.
        #       it also returns before t is advanced. (part of the TODO.)
        if self.reset_period > 0 and (self.t - 1) % self.reset_period == 0:
            self.mt = -self.lr * dW
            return

        # momentum quantity:
        # 1 - 1/(1 + t) ramps from 0 towards 1 as t grows;
        # interpolate with it from mu_min up towards mu_max.
        mu = _1 - _1/_f(self.t + _1)
        mu = (self.mu_max - self.mu_min) * mu + self.mu_min

        self.t += self.delta

        # change in smoothed weights:
        # NOTE: if W equals vt exactly, the norm below is zero
        #       and the regularization terms are undefined (NaN).
        delta = W - self.vt
        delta_norm_squared = np.square(delta).sum()
        delta_norm = np.sqrt(delta_norm_squared)

        # regularization terms: (push and pull)
        cubic_reg = self.alpha * delta_norm_squared
        repulsive_reg = self.beta * dW.size / delta_norm_squared
        dt = dW + (cubic_reg - repulsive_reg) * (delta / delta_norm)

        # Richardson iteration disguised as plain momentum:
        self.mt = mu * self.mt - self.lr * dt
        # this is only a good approximation for small ||self.lr * self.mt||.

        # update weights and moving average:
        W += mu * self.mt - self.lr * dt  # essentially Nesterov momentum.
        self.vt = W + self.gamma * (self.vt - W)
class Adamax(Optimizer):
    # paper: https://arxiv.org/abs/1412.6980 (section 7.1, "AdaMax")
    # a variant of Adam that normalizes by a decaying running maximum
    # of absolute gradients instead of a running mean of their squares.

    def __init__(self, lr=0.001, b1=0.9, b2=0.999, debias=False, eps=1e-8):
        self.b1 = _f(b1)  # decay term
        self.b2 = _f(b2)  # decay term
        self.b1_t_default = _f(b1)  # decay term power t
        self.b2_t_default = _f(b2)  # decay term power t
        self.debias = bool(debias)
        self.eps = _f(eps)

        super().__init__(lr)

    def reset(self):
        # the array buffers are lazily allocated on the first compute().
        self.mt = None  # running mean of gradients.
        self.vt = None  # decaying running maximum of absolute gradients.
        self.b1_t = self.b1_t_default
        self.b2_t = self.b2_t_default

        super().reset()

    def compute(self, dW, W):
        # returns the step for gradient dW. W is not used here.
        if self.mt is None:
            self.mt = np.zeros_like(dW)
        if self.vt is None:
            self.vt = np.zeros_like(dW)

        mt = filter_gradients(self.mt, dW, self.b1)

        # store the running maximum back into the buffer;
        # otherwise it would stay zero and b2 would have no effect.
        self.vt[:] = np.maximum(self.b2 * self.vt, np.abs(dW))
        vt = self.vt

        if self.debias:
            # these divisions create new arrays; the buffers are untouched.
            if self.b1_t != 1:
                mt = mt / (1 - self.b1_t)
            if self.b2_t != 1:
                vt = vt / (1 - self.b2_t)

        # decay gain.
        self.b1_t *= self.b1
        self.b2_t *= self.b2

        return -self.lr * mt / (vt + self.eps)
class Adamlike(Optimizer):
    # this generalizes a lot of algorithms that
    # either subsets or supersets the Adam optimizer.
    # refer to the subclasses for details.
    #
    # b1 and b2 are passed to filter_gradients as-is.
    # power is the exponent applied to vt in the denominator.
    # debias divides mt and vt by (1 - decay**t).
    # runmax normalizes by a running element-wise maximum of vt.
    # yogi swaps the filtering of vt for a sign-based update.

    # these defaults match Adam's.
    def __init__(self, lr=0.001, b1=0.9, b2=0.999, power=1/2,
                 debias=True, runmax=False, yogi=False, eps=1e-8):
        self.b1 = _f(b1)  # decay term
        self.b2 = _f(b2)  # decay term
        self.b1_t_default = _f(b1)  # decay term power t
        self.b2_t_default = _f(b2)  # decay term power t
        self.power = _f(power)
        self.debias = bool(debias)
        self.runmax = bool(runmax)
        self.yogi = bool(yogi)
        self.eps = _f(eps)

        super().__init__(lr)

    def reset(self):
        # the array buffers are lazily allocated on the first compute().
        self.mt = None
        self.vt = None
        self.vtmax = None  # only allocated when runmax is enabled.
        self.b1_t = self.b1_t_default
        self.b2_t = self.b2_t_default

        super().reset()

    def compute(self, dW, W):
        # returns the step for gradient dW. W is not used here.
        if self.mt is None:
            self.mt = np.zeros_like(dW)
        if self.vt is None:
            self.vt = np.zeros_like(dW)
        if self.vtmax is None and self.runmax:
            self.vtmax = np.zeros_like(dW)

        # keep local references of mt and vt to simplify
        # implementing all the variations of Adam later.
        mt = filter_gradients(self.mt, dW, self.b1)

        if self.yogi:
            g2 = np.square(dW)
            vt = self.vt
            # in-place, so self.vt is updated too.
            vt -= (1 - self.b2) * np.sign(vt - g2) * g2
        else:
            vt = filter_gradients(self.vt, np.square(dW), self.b2)

        if self.runmax:
            self.vtmax[:] = np.maximum(vt, self.vtmax)
            vt = self.vtmax

        if self.debias:
            # these divisions create new arrays; the buffers are untouched.
            if self.b1_t != 1:
                mt = mt / (1 - self.b1_t)
            if self.b2_t != 1:
                vt = vt / (1 - self.b2_t)

        # self.power went through _f in __init__, so compare it against
        # constants that went through _f as well. a plain 1/3 would never
        # match if _f rounds to a narrower float type.
        if self.power == 0:
            delta = mt
        elif self.power == 1:
            delta = mt / (vt + self.eps)
        elif self.power == _f(1/2):  # TODO: is this actually faster?
            delta = mt / (np.sqrt(vt) + self.eps)
        elif self.power == _f(1/3):  # TODO: is this actually faster?
            delta = mt / (np.cbrt(vt) + self.eps)
        else:
            delta = mt / (vt**self.power + self.eps)

        if self.debias:
            # decay gain.
            self.b1_t *= self.b1
            self.b2_t *= self.b2

        return -self.lr * delta
class Adagrad(Adamlike):
    # paper: https://web.stanford.edu/~jduchi/projects/DuchiHaSi11.pdf
    # configured so that mt is simply the current gradient (b1 = 0)
    # and vt is a plain running sum of squared gradients (b2 = -1).

    def __init__(self, lr=0.01, eps=1e-8):
        config = dict(b1=0.0, b2=-1.0, power=1/2,
                      debias=False, runmax=False)
        super().__init__(lr=lr, eps=eps, **config)

    # g is an alias of vt.

    @property
    def g(self):
        return self.vt

    @g.setter
    def g(self, value):
        self.vt = value
class RMSprop(Adamlike):
    # slides: http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
    # configured so that mt is simply the current gradient (b1 = 0)
    # and vt is a low-pass filter of squared gradients (b2 = mu).

    def __init__(self, lr=0.001, mu=0.99, eps=1e-8):
        config = dict(b1=0.0, power=1/2, debias=False, runmax=False)
        super().__init__(lr=lr, b2=mu, eps=eps, **config)

    # mu is an alias of b2.

    @property
    def mu(self):
        return self.b2

    @mu.setter
    def mu(self, value):
        self.b2 = value

    # g is an alias of vt.

    @property
    def g(self):
        return self.vt

    @g.setter
    def g(self, value):
        self.vt = value
class Adam(Adamlike):
    # paper: https://arxiv.org/abs/1412.6980
    # Adam generalizes RMSprop, and
    # adds a decay term to the regular (non-squared) delta, and performs
    # debiasing to compensate for the filtered deltas starting from zero.

    def __init__(self, lr=0.001, b1=0.9, b2=0.999,
                 debias=True, eps=1e-8):
        # square-root scaling, without any of the optional variations.
        variant = dict(power=1/2, runmax=False, yogi=False)
        super().__init__(lr=lr, b1=b1, b2=b2,
                         debias=debias, eps=eps, **variant)
class Yogi(Adamlike):
    # paper: https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization.pdf
    # based on Adam. this changes the filtering for vt.

    def __init__(self, lr=0.01, b1=0.9, b2=0.999,
                 debias=True, eps=1e-3):
        # the same as Adam, except for the yogi flag (and the defaults).
        variant = dict(power=1/2, runmax=False, yogi=True)
        super().__init__(lr=lr, b1=b1, b2=b2,
                         debias=debias, eps=eps, **variant)
class AMSgrad(Adamlike):
    # paper: https://openreview.net/forum?id=ryQu7f-RZ
    # based on Adam. this simply adds a running element-wise maximum to vt.

    def __init__(self, lr=0.001, b1=0.9, b2=0.999,
                 debias=True, eps=1e-8):
        # the same as Adam, except for the runmax flag.
        variant = dict(power=1/2, runmax=True, yogi=False)
        super().__init__(lr=lr, b1=b1, b2=b2,
                         debias=debias, eps=eps, **variant)
class Padam(Adamlike):
    # paper: https://arxiv.org/abs/1806.06763
    # paper: https://arxiv.org/abs/1808.05671
    # based on AMSgrad. this configures the power of vt to be closer to zero.

    def __init__(self, lr=0.1, b1=0.9, b2=0.999,
                 power=1/8, debias=True, eps=1e-8):
        # the same as AMSgrad, except that power is left to the caller.
        variant = dict(runmax=True, yogi=False)
        super().__init__(lr=lr, b1=b1, b2=b2, power=power,
                         debias=debias, eps=eps, **variant)
class MSVAG(Optimizer):
    # paper: https://arxiv.org/abs/1705.07774
    # this is the variance-reducing aspect isolated from the rest of Adam.

    def __init__(self, lr=0.1, b=0.99):
        self.b = _f(b)  # decay shared by both running averages.

        super().__init__(lr=lr)

    def reset(self):
        # the array buffers are lazily allocated on the first compute().
        self.mt = None  # running mean of gradients.
        self.vt = None  # running mean of squared gradients.
        self.bt = self.b  # b to the power of t.

        super().reset()

    def compute(self, dW, W):
        # returns the step for gradient dW. W is not used here.
        if self.mt is None:
            self.mt = np.zeros_like(dW)
        if self.vt is None:
            self.vt = np.zeros_like(dW)

        mt = filter_gradients(self.mt, dW, self.b)
        vt = filter_gradients(self.vt, np.square(dW), self.b)

        # technically, rho should be 1 / (t + 1) when bt is 1,
        # but we don't keep track of t directly.
        rho = 1

        # debiasing:
        if self.bt != 1:
            remaining = 1 - self.bt
            mt = mt / remaining
            vt = vt / remaining
            rho = ((1 - self.b) * (1 + self.bt)) / ((1 + self.b) * remaining)

        # gamma scales the mean down where the variance estimate is large.
        gamma = 1
        if rho != 1:
            mt2 = np.square(mt)
            s = (vt - mt2) / (1 - rho)
            gamma = div0(mt2, mt2 + rho * s)

        self.bt *= self.b

        return -self.lr * (gamma * mt)
# alternative capitalizations of the class names above.

# the Ada* family:
AdaDelta = Adadelta
AdaGrad = Adagrad
AdaMax = Adamax

# the Adam variants:
AMSGrad = AMSgrad
NAdam = Nadam
PAdam = Padam

# the RMSprop variants:
RMSProp = RMSprop
RMSPropCentered = RMSpropCentered