import numpy as np

from .float import _f, _0, _1
from .optimizer_base import *
from .utility import *


def filter_gradients(accum, grads, param):
    # NOTE: this modifies accum in-place.
    # param > 0 acts as a simple one-pole low-pass filter, unity at DC.
    # param < 0 acts as an accumulator with a decay of -param, nonunity at DC.
    # param == 0 simply copies grads into accum.
    if param == 0:
        accum[:] = grads
    elif param < 0:
        if param != -1:
            accum *= -param
        accum += grads
    elif param == 1:
        pass
    else:
        accum += (1 - param) * (grads - accum)
    return accum
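

# a quick illustration of filter_gradients' three regimes (hypothetical
# values for the sake of example; not part of the original API):
#
#     accum = np.zeros(4, dtype=np.float32)
#     grads = np.ones(4, dtype=np.float32)
#     filter_gradients(accum, grads, 0.9)   # low-pass: accum becomes 0.1
#     filter_gradients(accum, grads, -0.5)  # accumulate: 0.5 * accum + grads
#     filter_gradients(accum, grads, 0.0)   # copy: accum becomes grads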


class Momentum(Optimizer):
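    # classical momentum: maintains V = mu * V - lr * dW and steps by V.
    # the Nesterov variant instead steps by mu * V - lr * dW computed with
    # the freshly-updated V, i.e. a momentum "look-ahead".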

    def __init__(self, lr=0.01, mu=0.9, nesterov=False):
        self.mu = _f(mu) # momentum
        self.nesterov = bool(nesterov)

        super().__init__(lr)

    def reset(self):
        self.Vprev = None

    def compute(self, dW, W):
        if self.Vprev is None:
            self.Vprev = np.copy(dW)

        V = self.mu * self.Vprev - self.lr * dW
        self.Vprev[:] = V

        if self.nesterov:
            return self.mu * V - self.lr * dW

        return V


class Adadelta(Optimizer):
    # paper: https://arxiv.org/abs/1212.5701
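    # keeps running averages of squared gradients and squared updates and
    # scales each step by RMS[delta] / RMS[grad], so the effective step
    # size adapts per-parameter.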

    def __init__(self, lr=1.0, mu=0.95, eps=1e-8):
        self.mu = _f(mu)
        self.eps = _f(eps)

        super().__init__(lr)

    def reset(self):
        self.g = None
        self.x = None

    def compute(self, dW, W):
        if self.g is None:
            self.g = np.zeros_like(dW)
        if self.x is None:
            self.x = np.zeros_like(dW)

        self.g += (self.mu - 1) * (self.g - np.square(dW))
        delta = dW * np.sqrt(self.x + self.eps) / (np.sqrt(self.g) + self.eps)
        self.x += (self.mu - 1) * (self.x - np.square(delta))
        return -self.lr * delta


class RMSpropCentered(Optimizer):
    # references the TensorFlow and PyTorch implementations.
    # paper: https://arxiv.org/abs/1308.0850v5
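    # "centered" means the squared running mean of gradients (mt) is
    # subtracted from the running mean of squared gradients (vt), so the
    # denominator estimates the gradient's variance rather than its raw
    # second moment.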

    def __init__(self, lr=1e-4, aleph=0.95, momentum=0.9, eps=1e-8):
        self.aleph = _f(aleph)
        self.momentum = _f(momentum)
        self.eps = _f(eps)

        super().__init__(lr)

    def reset(self):
        self.g = None
        self.mt = None
        self.vt = None
        self.delta = None

    def compute(self, dW, W):
        if self.g is None:
            self.g = np.zeros_like(dW)
        if self.mt is None:
            self.mt = np.zeros_like(dW)
        if self.vt is None:
            self.vt = np.zeros_like(dW)
        if self.delta is None:
            self.delta = np.zeros_like(dW)

        self.mt += (1 - self.aleph) * (dW - self.mt)
        self.vt += (1 - self.aleph) * (np.square(dW) - self.vt)

        # PyTorch has the epsilon outside of the sqrt,
        # TensorFlow and the paper have it within.
        # in onn, we generally do it outside, as this seems to work better.
        temp = dW / (np.sqrt(self.vt - np.square(self.mt)) + self.eps)

        # TensorFlow does it this way.
        self.delta[:] = self.momentum * self.delta + self.lr * temp
        return -self.delta

        # PyTorch does it this way.
        # self.delta[:] = self.momentum * self.delta + temp
        # return -self.lr * self.delta

        # they are equivalent only when LR is constant, which it might not be.


class Nadam(Optimizer):
    # paper: https://arxiv.org/abs/1412.6980
    # paper: http://cs229.stanford.edu/proj2015/054_report.pdf
    # TODO: double-check this implementation. also read the damn paper.
    # lifted from:
    # https://github.com/fchollet/keras/blob/5d38b04/keras/optimizers.py#L530
    # https://github.com/jpilaul/IFT6266_project/blob/master/Models/Algo_Momentum.py
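    # the ut terms implement a momentum warm-up schedule (cf. the second
    # paper): the effective momentum rises toward b1 as 0.96**t decays,
    # and sched tracks the running product of scheduled momenta for the
    # bias correction, in place of the fixed b1**t that plain Adam uses.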

    def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8):
        self.b1 = _f(b1) # decay term
        self.b2 = _f(b2) # decay term
        self.eps = _f(eps)

        super().__init__(lr)

    def reset(self):
        self.mt = None
        self.vt = None
        self.t = 0
        self.sched = 1

    def compute(self, dW, W):
        self.t += 1

        if self.mt is None:
            self.mt = np.zeros_like(dW)
        if self.vt is None:
            self.vt = np.zeros_like(dW)

        ut0 = self.b1 * (1 - 0.5 * 0.96**(self.t + 0))
        ut1 = self.b1 * (1 - 0.5 * 0.96**(self.t + 1))

        sched0 = self.sched * ut0
        sched1 = self.sched * ut0 * ut1
        self.sched = sched0

        gp = dW / (1 - sched0)

        self.mt += (1 - self.b1) * (dW - self.mt)
        self.vt += (1 - self.b2) * (np.square(dW) - self.vt)

        mtp = self.mt / (1 - sched1)
        vtp = self.vt / (1 - self.b2**self.t)

        mt_bar = (1 - ut0) * gp + ut1 * mtp

        return -self.lr * mt_bar / (np.sqrt(vtp) + self.eps)


class FTML(Optimizer):
    # paper: http://www.cse.ust.hk/~szhengac/papers/icml17.pdf
    # author's implementation: https://github.com/szhengac/optim/commit/923555e
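    # FTML ("Follow the Moving Leader") weighs recent losses more heavily
    # than older ones; the resulting update resembles Adam with a
    # time-varying per-parameter denominator (dt).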

    def __init__(self, lr=0.0025, b1=0.6, b2=0.999, eps=1e-8):
        self.iterations = _0
        self.b1 = _f(b1) # decay term
        self.b2 = _f(b2) # decay term
        self.eps = _f(eps)

        super().__init__(lr)

    def reset(self):
        self.dt1 = None
        self.dt = None
        self.vt = None
        self.zt = None
        self.b1_t = _1
        self.b2_t = _1

    def compute(self, dW, W):
        if self.dt1 is None:
            self.dt1 = np.zeros_like(dW)
        if self.dt is None:
            self.dt = np.zeros_like(dW)
        if self.vt is None:
            self.vt = np.zeros_like(dW)
        if self.zt is None:
            self.zt = np.zeros_like(dW)

        # NOTE: we could probably rewrite these equations to avoid this copy.
        self.dt1[:] = self.dt[:]

        self.b1_t *= self.b1
        self.b2_t *= self.b2

        # hardly an elegant solution.
        lr = max(self.lr, self.eps)

        # same as Adam's vt.
        self.vt[:] = self.b2 * self.vt + (1 - self.b2) * dW * dW

        # you can factor "inner" out of Adam as well.
        inner = np.sqrt(self.vt / (1 - self.b2_t)) + self.eps
        self.dt[:] = (1 - self.b1_t) / lr * inner

        sigma_t = self.dt - self.b1 * self.dt1

        # Adam's mt minus the sigma term.
        self.zt[:] = self.b1 * self.zt + (1 - self.b1) * dW - sigma_t * W

        # subtract the weights to avoid having to override self.update.
        return -self.zt / self.dt - W


class MomentumClip(Optimizer):
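    # momentum with per-update gradient clipping by global L2 norm.
    # note that unlike Momentum above, the learning rate is applied
    # outside the accumulator here.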

    def __init__(self, lr=0.01, mu=0.9, nesterov=False, clip=1.0, debug=False):
        self.mu = _f(mu)
        self.clip = _f(clip)
        self.nesterov = bool(nesterov)
        self.debug = bool(debug)

        super().__init__(lr)

    def reset(self):
        self.accum = None

    def compute(self, dW, W):
        if self.accum is None:
            self.accum = np.zeros_like(dW)

        total_norm = np.linalg.norm(dW)
        clip_scale = self.clip / (total_norm + 1e-6)
        if clip_scale < 1:
            if self.debug:
                lament("clipping gradients; norm: {:10.5f}".format(total_norm))
            dW *= clip_scale

        self.accum[:] = self.accum * self.mu + dW
        if self.nesterov:
            return -self.lr * (self.accum * self.mu + dW)
        else:
            return -self.lr * self.accum


class YellowFin(Optimizer):
    # paper: https://arxiv.org/abs/1706.03471
    # knowyourmeme: http://cs.stanford.edu/~zjian/project/YellowFin/
    # author's implementation:
    # https://github.com/JianGoForIt/YellowFin/blob/master/tuner_utils/yellowfin.py
    # code lifted:
    # https://gist.github.com/botev/f8b32c00eafee222e47393f7f0747666
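    # tunes lr and mu online from three debiased estimates: the curvature
    # range (h_min, h_max) over a window of squared gradient norms, the
    # gradient variance, and the distance to the optimum; get_lr_mu then
    # solves the paper's one-dimensional "SingleStep" problem for the pair.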

    def __init__(self, lr=0.1, mu=0.0, beta=0.999, window_size=20,
                 debias=True, clip=1.0):
        self.lr_default = _f(lr)
        self.mu_default = _f(mu)
        self.beta = _f(beta)
        self.window_size = int(window_size) # curv_win_width
        self.debias_enabled = bool(debias)
        self.clip = _f(clip)

        self.mu = _f(mu) # momentum
        super().__init__(lr)

    def reset(self):
        self.accum = None

        self.lr = self.lr_default
        self.mu = self.mu_default

        self.step = 0
        self.beta_t = self.beta

        self.curv_win = np.zeros([self.window_size, ], dtype=np.float32)

        self.h_min = None
        self.h_max = None

        self.g_lpf = 0
        # self.g_squared_lpf = 0
        self.g_norm_squared_lpf = 0
        self.g_norm_lpf = 0
        self.h_min_lpf = 0
        self.h_max_lpf = 0
        self.dist_lpf = 0
        self.lr_lpf = 0
        self.mu_lpf = 0

    def get_lr_mu(self):
        p = (np.square(self.dist_avg) * np.square(self.h_min)) \
            / (2 * self.g_var)
        w3 = p * (np.sqrt(0.25 + p / 27.0) - 0.5)
        w = np.power(w3, 1/3)
        y = w - p / (3 * w)
        sqrt_mu1 = y + 1

        sqrt_h_min = np.sqrt(self.h_min)
        sqrt_h_max = np.sqrt(self.h_max)
        sqrt_mu2 = (sqrt_h_max - sqrt_h_min) / (sqrt_h_max + sqrt_h_min)

        sqrt_mu = max(sqrt_mu1, sqrt_mu2)
        if sqrt_mu2 > sqrt_mu1:
            print('note: taking the dynamic-range calculation. '
                  'something may have exploded.')

        lr = np.square(1 - sqrt_mu) / self.h_min
        mu = np.square(sqrt_mu)
        return lr, mu

    def compute(self, dW, W):
        if self.accum is None:
            self.accum = np.zeros_like(dW)

        # TODO: prevent allocations everywhere by using [:].
        # assuming that really works. i haven't actually checked.

        total_norm = np.linalg.norm(dW)
        clip_scale = self.clip / (total_norm + 1e-6)
        if clip_scale < 1:
            # print("clipping gradients; norm: {:10.5f}".format(total_norm))
            dW *= clip_scale

        # fmt = 'W std: {:10.7f}e-3, dWstd: {:10.7f}e-3, V std: {:10.7f}e-3'
        # print(fmt.format(np.std(W), np.std(dW) * 100, np.std(V) * 100))

        b = self.beta
        m1b = 1 - self.beta
        debias = 1 / (1 - self.beta_t) if self.debias_enabled else 1

        g = dW
        g_squared = np.square(g)
        g_norm_squared = np.sum(g_squared)
        g_norm = np.sqrt(g_norm_squared)

        self.curv_win[self.step % self.window_size] = g_norm_squared
        valid_window = self.curv_win[:min(self.window_size, self.step + 1)]
        h_min_t = np.min(valid_window)
        h_max_t = np.max(valid_window)

        self.g_lpf = b * self.g_lpf + m1b * g
        # self.g_squared_lpf = b * self.g_squared_lpf + m1b * g_squared
        self.g_norm_squared_lpf = b * self.g_norm_squared_lpf \
            + m1b * g_norm_squared
        self.g_norm_lpf = b * self.g_norm_lpf + m1b * g_norm
        self.h_min_lpf = b * self.h_min_lpf + m1b * h_min_t
        self.h_max_lpf = b * self.h_max_lpf + m1b * h_max_t

        g_avg = debias * self.g_lpf
        # g_squared_avg = debias * self.g_squared_lpf
        g_norm_squared_avg = debias * self.g_norm_squared_lpf
        g_norm_avg = debias * self.g_norm_lpf
        self.h_min = debias * self.h_min_lpf
        self.h_max = debias * self.h_max_lpf
        assert self.h_max >= self.h_min

        dist = g_norm_avg / g_norm_squared_avg

        self.dist_lpf = b * self.dist_lpf + m1b * dist

        self.dist_avg = debias * self.dist_lpf

        self.g_var = g_norm_squared_avg - np.sum(np.square(g_avg))
        # equivalently:
        # self.g_var = np.sum(np.abs(g_squared_avg - np.square(g_avg)))

        if self.step > 0:
            lr_for_real, mu_for_real = self.get_lr_mu()
            self.mu_lpf = b * self.mu_lpf + m1b * mu_for_real
            self.lr_lpf = b * self.lr_lpf + m1b * lr_for_real
            self.mu = debias * self.mu_lpf
            self.lr = debias * self.lr_lpf

        self.accum[:] = self.accum * self.mu - self.lr * dW
        V = self.accum

        self.step += 1
        self.beta_t *= self.beta
        return V


class AddSign(Optimizer):
    # paper: https://arxiv.org/abs/1709.07417
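    # scales each gradient element by (alpha + sign(dW) * sign(accum)):
    # the step grows where the gradient agrees in sign with its running
    # average, and shrinks where it disagrees.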

    def __init__(self, lr=0.01, mu=0.9, alpha=1):
        self.mu = _f(mu)
        self.alpha = _f(alpha)

        super().__init__(lr)

    def reset(self):
        self.accum = None

    def compute(self, dW, W):
        if self.accum is None:
            self.accum = np.zeros_like(dW)

        self.accum[:] = self.accum * self.mu + dW

        signed = np.sign(dW) * np.sign(self.accum)
        # signed *= decay

        return -self.lr * dW * (self.alpha + signed)


class PowerSign(Optimizer):
    # paper: https://arxiv.org/abs/1709.07417
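    # like AddSign, but scales multiplicatively by alpha**(sign(dW) *
    # sign(accum)); with the default alpha = e this becomes exp(signed).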

    def __init__(self, lr=0.01, mu=0.9, alpha=np.e):
        self.mu = _f(mu)
        self.alpha = _f(alpha)
        self.use_exp = np.isclose(self.alpha, _f(np.e))

        super().__init__(lr)

    def reset(self):
        self.accum = None

    def compute(self, dW, W):
        if self.accum is None:
            self.accum = np.zeros_like(dW)

        self.accum[:] = self.accum * self.mu + dW

        signed = np.sign(dW) * np.sign(self.accum)
        # signed *= decay

        if self.use_exp:
            return -self.lr * dW * np.exp(signed)
        else:
            return -self.lr * dW * np.power(self.alpha, signed)


class Neumann(Optimizer):
    # paper: https://arxiv.org/abs/1712.03298
    # NOTE: this implementation omits resetting as described in the paper.
    # resetting is totally disabled for now.
    # NOTE: this implementation does not use vanilla SGD for its first epochs.
    # you can do this yourself if you really want to.
    # it seems to be enough to use a slow-starting Learner like SineCLR.
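    # the idea: approximate an inverse-Hessian-vector product with a
    # Neumann series (implicitly, via the momentum recurrence), plus
    # cubic and repulsive regularizers on the distance from a smoothed
    # copy of the weights.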

    def __init__(self, lr=0.01):
        self.alpha = _f(1e-7) # cubic.
        self.beta = _f(1e-5) # repulsive. NOTE: multiplied by dW.size later.
        self.gamma = _f(0.99) # EMA, or 1-pole low-pass parameter. same thing.
        # momentum is ∝ (in the shape of) 1 - 1/(1 + t)
        self.mu_min = _f(0.5)
        self.mu_max = _f(0.9)
        self.reset_period = 0 # TODO

        super().__init__(lr)

    def reset(self):
        # NOTE: mt and vt are different than the pair in Adam-like optimizers.
        self.mt = None # momentum accumulator.
        self.vt = None # weight accumulator.
        self.t = 0

    def compute(self, dW, W):
        raise Exception("compute() is not available for this Optimizer.")

    def update(self, dW, W):
        self.t += 1

        if self.mt is None:
            self.mt = np.zeros_like(dW)
        if self.vt is None:
            self.vt = np.zeros_like(dW)

        if self.reset_period > 0 and (self.t - 1) % self.reset_period == 0:
            self.mt = -self.lr * dW
            return

        # momentum quantity:
        mu = _1 - _1/_f(self.t) # the + 1 is implicit.
        # interpolate between mu_min and mu_max.
        mu = self.mu_min + mu * (self.mu_max - self.mu_min)

        # smoothed change in weights:
        delta = W - self.vt
        delta_norm_squared = np.square(delta).sum()
        delta_norm = np.sqrt(delta_norm_squared)

        # regularization terms: (push and pull)
        cubic_reg = self.alpha * delta_norm_squared
        repulsive_reg = self.beta * dW.size / delta_norm_squared
        dt = dW + (cubic_reg - repulsive_reg) * (delta / delta_norm)

        # plain momentum:
        self.mt = mu * self.mt - self.lr * dt

        # weights and accumulator:
        W += mu * self.mt - self.lr * dt
        self.vt = W + self.gamma * (self.vt - W)


class Adamlike(Optimizer):
    # this generalizes a lot of algorithms that are
    # either subsets or supersets of the Adam optimizer.
    # refer to the subclasses for details.
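    #
    # roughly: b1 and b2 choose the filters for the first and second
    # moments (via filter_gradients), power is the root applied to vt
    # (1/2 recovers Adam and RMSprop), debias compensates for the
    # zero-initialized filters, and runmax keeps an element-wise running
    # maximum of vt (as in AMSgrad).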

    # the arguments to init default to Adam's.
    def __init__(self, lr=0.001, b1=0.9, b2=0.999,
                 power=1/2, debias=True, runmax=False, eps=1e-8):
        self.b1 = _f(b1) # decay term
        self.b2 = _f(b2) # decay term
        self.b1_t_default = _f(b1) # decay term power t
        self.b2_t_default = _f(b2) # decay term power t
        self.power = _f(power)
        self.debias = bool(debias)
        self.runmax = bool(runmax)
        self.eps = _f(eps)

        super().__init__(lr)

    def reset(self):
        self.mt = None
        self.vt = None
        self.vtmax = None
        self.b1_t = self.b1_t_default
        self.b2_t = self.b2_t_default

    def compute(self, dW, W):
        if self.mt is None:
            self.mt = np.zeros_like(dW)
        if self.vt is None:
            self.vt = np.zeros_like(dW)
        if self.vtmax is None and self.runmax:
            self.vtmax = np.zeros_like(dW)

        # keep local references of mt and vt to simplify
        # implementing all the variations of Adam later.
        mt = filter_gradients(self.mt, dW, self.b1)
        vt = filter_gradients(self.vt, np.square(dW), self.b2)

        if self.runmax:
            self.vtmax[:] = np.maximum(vt, self.vtmax)
            vt = self.vtmax

        if self.debias:
            if self.b1_t != 1:
                mt = mt / (1 - self.b1_t)
            if self.b2_t != 1:
                vt = vt / (1 - self.b2_t)

        if self.power == 0:
            delta = mt
        elif self.power == 1:
            delta = mt / (vt + self.eps)
        elif self.power == 1/2: # TODO: is this actually faster?
            delta = mt / (np.sqrt(vt) + self.eps)
        elif self.power == 1/3: # TODO: is this actually faster?
            delta = mt / (np.cbrt(vt) + self.eps)
        else:
            delta = mt / (vt**self.power + self.eps)

        if self.debias:
            # decay gain.
            self.b1_t *= self.b1
            self.b2_t *= self.b2

        return -self.lr * delta


class Adagrad(Adamlike):
    # paper: https://web.stanford.edu/~jduchi/projects/DuchiHaSi11.pdf
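    # with b1=0.0 the first moment passes straight through, and b2=-1.0
    # turns filter_gradients into an un-decayed sum of squared gradients,
    # i.e. Adagrad's usual denominator.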

    def __init__(self, lr=0.01, eps=1e-8):
        super().__init__(lr=lr, b1=0.0, b2=-1.0,
                         power=1/2, debias=False, runmax=False, eps=eps)

    @property
    def g(self):
        return self.vt

    @g.setter
    def g(self, value):
        self.vt = value


class RMSprop(Adamlike):
    # slides: http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf

    def __init__(self, lr=0.001, mu=0.99, eps=1e-8):
        super().__init__(lr=lr, b1=0.0, b2=mu,
                         power=1/2, debias=False, runmax=False, eps=eps)

    @property
    def mu(self):
        return self.b2

    @mu.setter
    def mu(self, value):
        self.b2 = value

    @property
    def g(self):
        return self.vt

    @g.setter
    def g(self, value):
        self.vt = value


class Adam(Adamlike):
    # paper: https://arxiv.org/abs/1412.6980
    # Adam generalizes RMSprop: it adds a decay term to the regular
    # (non-squared) delta and performs debiasing to compensate for the
    # filtered deltas starting from zero.

    def __init__(self, lr=0.001, b1=0.9, b2=0.999,
                 debias=True, eps=1e-8):
        super().__init__(lr=lr, b1=b1, b2=b2,
                         power=1/2, debias=debias, runmax=False, eps=eps)


class AMSgrad(Adamlike):
    # paper: https://openreview.net/forum?id=ryQu7f-RZ
    # based on Adam. this simply adds a running element-wise maximum to vt.

    def __init__(self, lr=0.001, b1=0.9, b2=0.999,
                 debias=True, eps=1e-8):
        super().__init__(lr=lr, b1=b1, b2=b2,
                         power=1/2, debias=debias, runmax=True, eps=eps)


class Padam(Adamlike):
    # paper: https://arxiv.org/abs/1806.06763
    # paper: https://arxiv.org/abs/1808.05671
    # based on AMSgrad. this configures the power of vt to be closer to zero.

    def __init__(self, lr=0.1, b1=0.9, b2=0.999,
                 power=1/8, debias=True, eps=1e-8):
        super().__init__(lr=lr, b1=b1, b2=b2,
                         power=power, debias=debias, runmax=True, eps=eps)
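

# a minimal usage sketch of the Adam-family classes (illustrative only;
# assumes the Optimizer base class applies compute() by adding its return
# value to the weights, as FTML's final comment implies):
#
#     opt = Adam(lr=1e-3)
#     W = np.zeros(100, dtype=np.float32)  # flattened parameters.
#     dW = ...                             # gradient of the loss wrt W.
#     W += opt.compute(dW, W)              # one step.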