diff --git a/onn/optimizer.py b/onn/optimizer.py
index 5dd6589..14d0d1e 100644
--- a/onn/optimizer.py
+++ b/onn/optimizer.py
@@ -45,23 +45,6 @@ class Momentum(Optimizer):
         return V
 
 
-class Adagrad(Optimizer):
-    def __init__(self, lr=0.01, eps=1e-8):
-        self.eps = _f(eps)
-
-        super().__init__(lr)
-
-    def reset(self):
-        self.g = None
-
-    def compute(self, dW, W):
-        if self.g is None:
-            self.g = np.zeros_like(dW)
-
-        self.g += np.square(dW)
-        return -self.lr * dW / (np.sqrt(self.g) + self.eps)
-
-
 class Adadelta(Optimizer):
     # paper: https://arxiv.org/abs/1212.5701
 
@@ -87,39 +70,6 @@ class Adadelta(Optimizer):
         return -self.lr * delta
 
 
-class RMSprop(Optimizer):
-    # RMSprop generalizes* Adagrad, etc.
-
-    # * RMSprop == Adagrad when
-    #   RMSprop.mu == 1
-
-    def __init__(self, lr=1e-4, mu=0.99, eps=1e-8):
-        self.mu = _f(mu)  # decay term
-        self.eps = _f(eps)
-
-        # one might consider the following equation when specifying mu:
-        # mu = e**(-1/t)
-        # default: t = -1/ln(0.99) = ~99.5
-        # therefore the default of mu=0.99 means
-        # an input decays to 1/e its original amplitude over 99.5 batches.
-        # (this is from DSP, so how relevant it is in SGD is debatable)
-
-        super().__init__(lr)
-
-    def reset(self):
-        self.g = None
-
-    def compute(self, dW, W):
-        if self.g is None:
-            self.g = np.zeros_like(dW)
-
-        # basically apply a first-order low-pass filter to delta squared,
-        self.g += (1 - self.mu) * (np.square(dW) - self.g)
-
-        # and sqrt it to complete the running root-mean-square approximation.
-        return -self.lr * dW / (np.sqrt(self.g) + self.eps)
-
-
 class RMSpropCentered(Optimizer):
     # referenced TensorFlow/PyTorch.
     # paper: https://arxiv.org/abs/1308.0850v5
@@ -164,49 +114,6 @@ class RMSpropCentered(Optimizer):
     # they are equivalent only when LR is constant, which it might not be.
 
 
-class Adam(Optimizer):
-    # paper: https://arxiv.org/abs/1412.6980
-    # Adam generalizes* RMSprop, and
-    # adds a decay term to the regular (non-squared) delta, and performs
-    # debiasing to compensate for the filtered deltas starting from zero.
-
-    # * Adam == RMSprop when
-    #   Adam.b1 == 0
-    #   Adam.b2 == RMSprop.mu
-
-    def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8):
-        self.b1 = _f(b1)  # decay term
-        self.b2 = _f(b2)  # decay term
-        self.b1_t_default = _f(b1)  # decay term power t
-        self.b2_t_default = _f(b2)  # decay term power t
-        self.eps = _f(eps)
-
-        super().__init__(lr)
-
-    def reset(self):
-        self.mt = None
-        self.vt = None
-        self.b1_t = self.b1_t_default
-        self.b2_t = self.b2_t_default
-
-    def compute(self, dW, W):
-        if self.mt is None:
-            self.mt = np.zeros_like(dW)
-        if self.vt is None:
-            self.vt = np.zeros_like(dW)
-
-        # decay gain
-        self.b1_t *= self.b1
-        self.b2_t *= self.b2
-
-        # filter
-        self.mt += (1 - self.b1) * (dW - self.mt)
-        self.vt += (1 - self.b2) * (np.square(dW) - self.vt)
-
-        return -self.lr * (self.mt / (1 - self.b1_t)) \
-            / (np.sqrt(self.vt / (1 - self.b2_t)) + self.eps)
-
-
 class Nadam(Optimizer):
     # paper: https://arxiv.org/abs/1412.6980
     # paper: http://cs229.stanford.edu/proj2015/054_report.pdf
@@ -256,8 +163,6 @@ class Nadam(Optimizer):
         return -self.lr * mt_bar / (np.sqrt(vtp) + self.eps)
 
 
-# more
-
 class FTML(Optimizer):
     # paper: http://www.cse.ust.hk/~szhengac/papers/icml17.pdf
     # author's implementation: https://github.com/szhengac/optim/commit/923555e
@@ -592,54 +497,6 @@ class Neumann(Optimizer):
         self.vt = W + self.gamma * (self.vt - W)
 
 
-class AMSgrad(Optimizer):
-    # paper: https://openreview.net/forum?id=ryQu7f-RZ
-    # based on Adam. this simply adds a running element-wise maximum to vt.
-
-    def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8, debias=True):
-        self.b1 = _f(b1)  # decay term
-        self.b2 = _f(b2)  # decay term
-        self.b1_t_default = _f(b1)  # decay term power t
-        self.b2_t_default = _f(b2)  # decay term power t
-        self.eps = _f(eps)
-        self.debias = bool(debias)
-
-        super().__init__(lr)
-
-    def reset(self):
-        self.mt = None
-        self.vt = None
-        self.vtmax = None
-        self.b1_t = self.b1_t_default
-        self.b2_t = self.b2_t_default
-
-    def compute(self, dW, W):
-        if self.mt is None:
-            self.mt = np.zeros_like(dW)
-        if self.vt is None:
-            self.vt = np.zeros_like(dW)
-        if self.vtmax is None:
-            self.vtmax = np.zeros_like(dW)
-
-        # filter
-        self.mt += (1 - self.b1) * (dW - self.mt)
-        self.vt += (1 - self.b2) * (np.square(dW) - self.vt)
-
-        self.vtmax = np.maximum(self.vtmax, self.vt)
-
-        if self.debias:
-            ret = -self.lr * (self.mt / (1 - self.b1_t)) \
-                / (np.sqrt(self.vtmax / (1 - self.b2_t)) + self.eps)
-        else:
-            ret = -self.lr * self.mt / (np.sqrt(self.vtmax) + self.eps)
-
-        # decay gain
-        self.b1_t *= self.b1
-        self.b2_t *= self.b2
-
-        return ret
-
-
 class Adamlike(Optimizer):
     # this generalizes a lot of algorithms that are
     # either subsets or supersets of the Adam optimizer.