remove old versions of optimizers
parent 1e6887cdbd
commit 5fd2b7b546
1 changed file with 0 additions and 143 deletions
onn/optimizer.py (143 deletions)
@@ -45,23 +45,6 @@ class Momentum(Optimizer):
        return V


class Adagrad(Optimizer):
    def __init__(self, lr=0.01, eps=1e-8):
        self.eps = _f(eps)

        super().__init__(lr)

    def reset(self):
        self.g = None

    def compute(self, dW, W):
        if self.g is None:
            self.g = np.zeros_like(dW)

        self.g += np.square(dW)
        return -self.lr * dW / (np.sqrt(self.g) + self.eps)


class Adadelta(Optimizer):
    # paper: https://arxiv.org/abs/1212.5701

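As an aside on the removed Adagrad: g only ever accumulates squared gradients, so the effective step size lr / (sqrt(g) + eps) decays monotonically. A standalone numpy sketch of just that accumulation (illustrative only, not part of this diff):

    import numpy as np

    lr, eps = 0.01, 1e-8
    g = np.zeros(1)                  # running sum of squared gradients
    dW = np.array([0.5])             # constant dummy gradient

    for step in range(1, 4):
        g += np.square(dW)                     # same accumulation as Adagrad.compute
        delta = -lr * dW / (np.sqrt(g) + eps)
        print(step, delta)                     # ~ -0.0100, -0.0071, -0.0058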
@@ -87,39 +70,6 @@ class Adadelta(Optimizer):
        return -self.lr * delta


class RMSprop(Optimizer):
    # RMSprop generalizes* Adagrad, etc.

    # * RMSprop == Adagrad when
    # RMSprop.mu == 1

    def __init__(self, lr=1e-4, mu=0.99, eps=1e-8):
        self.mu = _f(mu) # decay term
        self.eps = _f(eps)

        # one might consider the following equation when specifying mu:
        # mu = e**(-1/t)
        # default: t = -1/ln(0.99) = ~99.5
        # therefore the default of mu=0.99 means
        # an input decays to 1/e its original amplitude over 99.5 batches.
        # (this is from DSP, so how relevant it is in SGD is debatable)

        super().__init__(lr)

    def reset(self):
        self.g = None

    def compute(self, dW, W):
        if self.g is None:
            self.g = np.zeros_like(dW)

        # basically apply a first-order low-pass filter to delta squared,
        self.g += (1 - self.mu) * (np.square(dW) - self.g)

        # and sqrt it to complete the running root-mean-square approximation.
        return -self.lr * dW / (np.sqrt(self.g) + self.eps)


class RMSpropCentered(Optimizer):
    # referenced TensorFlow/PyTorch.
    # paper: https://arxiv.org/abs/1308.0850v5
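The mu = e**(-1/t) note in the removed RMSprop can be checked directly: with the default mu = 0.99, the time constant is t = -1/ln(0.99) ≈ 99.5, and mu**t works out to exactly 1/e, i.e. an input decays to about 37% of its amplitude over roughly 99.5 batches. A quick standalone check (not part of this diff):

    import math

    mu = 0.99
    t = -1.0 / math.log(mu)       # ≈ 99.499, the "~99.5" from the comment
    print(t)
    print(mu ** t, 1 / math.e)    # both ≈ 0.3679, i.e. decayed to 1/e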
@@ -164,49 +114,6 @@ class RMSpropCentered(Optimizer):
        # they are equivalent only when LR is constant, which it might not be.


class Adam(Optimizer):
    # paper: https://arxiv.org/abs/1412.6980
    # Adam generalizes* RMSprop, and
    # adds a decay term to the regular (non-squared) delta, and performs
    # debiasing to compensate for the filtered deltas starting from zero.

    # * Adam == RMSprop when
    # Adam.b1 == 0
    # Adam.b2 == RMSprop.mu

    def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8):
        self.b1 = _f(b1) # decay term
        self.b2 = _f(b2) # decay term
        self.b1_t_default = _f(b1) # decay term power t
        self.b2_t_default = _f(b2) # decay term power t
        self.eps = _f(eps)

        super().__init__(lr)

    def reset(self):
        self.mt = None
        self.vt = None
        self.b1_t = self.b1_t_default
        self.b2_t = self.b2_t_default

    def compute(self, dW, W):
        if self.mt is None:
            self.mt = np.zeros_like(dW)
        if self.vt is None:
            self.vt = np.zeros_like(dW)

        # decay gain
        self.b1_t *= self.b1
        self.b2_t *= self.b2

        # filter
        self.mt += (1 - self.b1) * (dW - self.mt)
        self.vt += (1 - self.b2) * (np.square(dW) - self.vt)

        return -self.lr * (self.mt / (1 - self.b1_t)) \
               / (np.sqrt(self.vt / (1 - self.b2_t)) + self.eps)


class Nadam(Optimizer):
    # paper: https://arxiv.org/abs/1412.6980
    # paper: http://cs229.stanford.edu/proj2015/054_report.pdf
@@ -256,8 +163,6 @@ class Nadam(Optimizer):
        return -self.lr * mt_bar / (np.sqrt(vtp) + self.eps)


# more

class FTML(Optimizer):
    # paper: http://www.cse.ust.hk/~szhengac/papers/icml17.pdf
    # author's implementation: https://github.com/szhengac/optim/commit/923555e
@@ -592,54 +497,6 @@ class Neumann(Optimizer):
        self.vt = W + self.gamma * (self.vt - W)


class AMSgrad(Optimizer):
    # paper: https://openreview.net/forum?id=ryQu7f-RZ
    # based on Adam. this simply adds a running element-wise maximum to vt.

    def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8, debias=True):
        self.b1 = _f(b1) # decay term
        self.b2 = _f(b2) # decay term
        self.b1_t_default = _f(b1) # decay term power t
        self.b2_t_default = _f(b2) # decay term power t
        self.eps = _f(eps)
        self.debias = bool(debias)

        super().__init__(lr)

    def reset(self):
        self.mt = None
        self.vt = None
        self.vtmax = None
        self.b1_t = self.b1_t_default
        self.b2_t = self.b2_t_default

    def compute(self, dW, W):
        if self.mt is None:
            self.mt = np.zeros_like(dW)
        if self.vt is None:
            self.vt = np.zeros_like(dW)
        if self.vtmax is None:
            self.vtmax = np.zeros_like(dW)

        # filter
        self.mt += (1 - self.b1) * (dW - self.mt)
        self.vt += (1 - self.b2) * (np.square(dW) - self.vt)

        self.vtmax = np.maximum(self.vtmax, self.vt)

        if self.debias:
            ret = -self.lr * (self.mt / (1 - self.b1_t)) \
                  / (np.sqrt(self.vtmax / (1 - self.b2_t)) + self.eps)
        else:
            ret = -self.lr * self.mt / (np.sqrt(self.vtmax) + self.eps)

        # decay gain
        self.b1_t *= self.b1
        self.b2_t *= self.b2

        return ret


class Adamlike(Optimizer):
    # this generalizes a lot of algorithms that are
    # either subsets or supersets of the Adam optimizer.
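The one behavioural difference of the removed AMSgrad versus Adam is the running element-wise maximum: vtmax can only grow, so the denominator of the update never shrinks even when the squared-gradient estimate vt drops. A tiny illustration with made-up vt values (not part of this diff):

    import numpy as np

    vt_history = [np.array([0.2]), np.array([0.5]), np.array([0.1])]
    vtmax = np.zeros(1)

    for vt in vt_history:
        vtmax = np.maximum(vtmax, vt)  # running element-wise maximum, as in AMSgrad.compute
        print(vt, vtmax)               # vtmax holds at 0.5 even when vt falls back to 0.1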