diff --git a/onn.py b/onn.py
index 409f503..578fef8 100755
--- a/onn.py
+++ b/onn.py
@@ -108,13 +108,13 @@ class FTML(Optimizer):
     # paper: http://www.cse.ust.hk/~szhengac/papers/icml17.pdf
     # author's implementation: https://github.com/szhengac/optim/commit/923555e

-    def __init__(self, alpha=0.0025, b1=0.6, b2=0.999, eps=1e-8):
+    def __init__(self, lr=0.0025, b1=0.6, b2=0.999, eps=1e-8):
         self.iterations = _0
         self.b1 = _f(b1)  # decay term
         self.b2 = _f(b2)  # decay term
         self.eps = _f(eps)

-        super().__init__(alpha)
+        super().__init__(lr)

     def reset(self):
         self.dt1 = None
@@ -137,14 +137,14 @@ class FTML(Optimizer):
         self.b2_t *= self.b2

         # hardly an elegant solution.
-        alpha = max(self.alpha, self.eps)
+        lr = max(self.lr, self.eps)

         # same as Adam's vt.
         self.vt[:] = self.b2 * self.vt + (1 - self.b2) * dW * dW

         # you can factor out "inner" out of Adam as well.
         inner = np.sqrt(self.vt / (1 - self.b2_t)) + self.eps
-        self.dt[:] = (1 - self.b1_t) / alpha * inner
+        self.dt[:] = (1 - self.b1_t) / lr * inner

         sigma_t = self.dt - self.b1 * self.dt1

@@ -159,17 +159,17 @@ class YellowFin(Momentum):
     # knowyourmeme: http://cs.stanford.edu/~zjian/project/YellowFin/
     # author's implementation: https://github.com/JianGoForIt/YellowFin/blob/master/tuner_utils/yellowfin.py

-    def __init__(self, alpha=0.1, mu=0.0, beta=0.999, curv_win_width=20):
-        self.alpha_default = _f(alpha)
+    def __init__(self, lr=0.1, mu=0.0, beta=0.999, curv_win_width=20):
+        self.lr_default = _f(lr)
         self.mu_default = _f(mu)
         self.beta = _f(beta)
         self.curv_win_width = int(curv_win_width)

-        super().__init__(alpha=alpha, mu=mu, nesterov=False)
+        super().__init__(lr=lr, mu=mu, nesterov=False)

     def reset(self):
         super().reset()
-        self.alpha = self.alpha_default
+        self.lr = self.lr_default
         self.mu = self.mu_default

         self.step = 0
@@ -1077,7 +1077,7 @@ def run(program, args=None):
         # use plain SGD in warmup to prevent (or possibly cause?) numeric issues
         temp_optim = learner.optim
         temp_loss = ritual.loss
-        learner.optim = Optimizer(alpha=0.001)
+        learner.optim = Optimizer(lr=0.001)
         ritual.loss = Absolute()  # less likely to blow up; more general

         # NOTE: experiment: trying const batches and batch_size
diff --git a/onn_core.py b/onn_core.py
index 6ad5116..3522d60 100644
--- a/onn_core.py
+++ b/onn_core.py
@@ -239,15 +239,15 @@ class L1L2(Regularizer):
 # Optimizers {{{1

 class Optimizer:
-    def __init__(self, alpha=0.1):
-        self.alpha = _f(alpha)  # learning rate
+    def __init__(self, lr=0.1):
+        self.lr = _f(lr)  # learning rate
         self.reset()

     def reset(self):
         pass

     def compute(self, dW, W):
-        return -self.alpha * dW
+        return -self.lr * dW

     def update(self, dW, W):
         W += self.compute(dW, W)
@@ -256,11 +256,11 @@ class Optimizer:
 # https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/optimizers/optimizer.h

 class Momentum(Optimizer):
-    def __init__(self, alpha=0.01, mu=0.9, nesterov=False):
+    def __init__(self, lr=0.01, mu=0.9, nesterov=False):
         self.mu = _f(mu)  # momentum
         self.nesterov = bool(nesterov)

-        super().__init__(alpha)
+        super().__init__(lr)

     def reset(self):
         self.Vprev = None
@@ -269,10 +269,10 @@ class Momentum(Optimizer):
         if self.Vprev is None:
             self.Vprev = np.copy(dW)

-        V = self.mu * self.Vprev - self.alpha * dW
+        V = self.mu * self.Vprev - self.lr * dW
         self.Vprev[:] = V
         if self.nesterov:
-            return self.mu * V - self.alpha * dW
+            return self.mu * V - self.lr * dW

         return V

@@ -283,7 +283,7 @@ class RMSprop(Optimizer):
     # * RMSprop == Adagrad when
     #   RMSprop.mu == 1

-    def __init__(self, alpha=0.0001, mu=0.99, eps=1e-8):
+    def __init__(self, lr=0.0001, mu=0.99, eps=1e-8):
         self.mu = _f(mu)  # decay term
         self.eps = _f(eps)

@@ -294,7 +294,7 @@ class RMSprop(Optimizer):
         # an input decays to 1/e its original amplitude over 99.5 epochs.
         # (this is from DSP, so how relevant it is in SGD is debatable)

-        super().__init__(alpha)
+        super().__init__(lr)

     def reset(self):
         self.g = None
@@ -309,7 +309,7 @@ class RMSprop(Optimizer):
         #self.g += (dW * dW - self.g) * (1 - self.mu)

         # finally sqrt it to complete the running root-mean-square approximation
-        return -self.alpha * dW / (np.sqrt(self.g) + self.eps)
+        return -self.lr * dW / (np.sqrt(self.g) + self.eps)

 class Adam(Optimizer):
     # paper: https://arxiv.org/abs/1412.6980
@@ -321,14 +321,14 @@ class Adam(Optimizer):
     #   Adam.b1 == 0
     #   Adam.b2 == RMSprop.mu

-    def __init__(self, alpha=0.002, b1=0.9, b2=0.999, eps=1e-8):
+    def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8):
         self.b1 = _f(b1)  # decay term
         self.b2 = _f(b2)  # decay term
         self.b1_t_default = _f(b1)  # decay term power t
         self.b2_t_default = _f(b2)  # decay term power t
         self.eps = _f(eps)

-        super().__init__(alpha)
+        super().__init__(lr)

     def reset(self):
         self.mt = None
@@ -350,8 +350,8 @@ class Adam(Optimizer):
         self.mt[:] = self.b1 * self.mt + (1 - self.b1) * dW
         self.vt[:] = self.b2 * self.vt + (1 - self.b2) * dW * dW

-        return -self.alpha * (self.mt / (1 - self.b1_t)) \
-            / (np.sqrt(self.vt / (1 - self.b2_t)) + self.eps)
+        return -self.lr * (self.mt / (1 - self.b1_t)) \
+            / (np.sqrt(self.vt / (1 - self.b2_t)) + self.eps)

 class Nadam(Optimizer):
     # paper: https://arxiv.org/abs/1412.6980
@@ -360,12 +360,12 @@ class Nadam(Optimizer):
     # lifted from https://github.com/fchollet/keras/blob/5d38b04/keras/optimizers.py#L530
     # lifted from https://github.com/jpilaul/IFT6266_project/blob/master/Models/Algo_Momentum.py

-    def __init__(self, alpha=0.002, b1=0.9, b2=0.999, eps=1e-8):
+    def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8):
         self.b1 = _f(b1)  # decay term
         self.b2 = _f(b2)  # decay term
         self.eps = _f(eps)

-        super().__init__(alpha)
+        super().__init__(lr)

     def reset(self):
         self.mt = None
@@ -398,7 +398,7 @@ class Nadam(Optimizer):

         mt_bar = (1 - ut0) * gp + ut1 * mtp

-        return -self.alpha * mt_bar / (np.sqrt(vtp) + self.eps)
+        return -self.lr * mt_bar / (np.sqrt(vtp) + self.eps)

 # Abstract Layers {{{1

@@ -1071,7 +1071,7 @@ class Learner:
     def __init__(self, optim, epochs=100, rate=None):
         assert isinstance(optim, Optimizer)
         self.optim = optim
-        self.start_rate = rate  # None is okay; it'll use optim.alpha instead.
+        self.start_rate = rate  # None is okay; it'll use optim.lr instead.
         self.epochs = int(epochs)
         self.reset()

@@ -1093,15 +1093,15 @@ class Learner:

     @property
     def rate(self):
-        return self.optim.alpha
+        return self.optim.lr

     @rate.setter
     def rate(self, new_rate):
-        self.optim.alpha = new_rate
+        self.optim.lr = new_rate

     def rate_at(self, epoch):
         if self.start_rate is None:
-            return self.optim.alpha
+            return self.optim.lr
         return self.start_rate

     def next(self):
diff --git a/onn_mnist.py b/onn_mnist.py
index 9e3f761..2c3b246 100755
--- a/onn_mnist.py
+++ b/onn_mnist.py
@@ -217,7 +217,7 @@ while learner.next():
     quiet = learner.epoch != learner.epochs
     measure_error(quiet=quiet)

-    logs.learning_rate.append(optim.alpha)
+    logs.learning_rate.append(optim.lr)
     if getattr(optim, 'mu', None):
         logs.momentum.append(optim.mu)
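
For reference, a minimal usage sketch of the renamed API. The toy weight shape and the constant stand-in gradient are illustrative assumptions; the `lr` keyword, its defaults, and the `update()` call match the diff above:

    import numpy as np
    from onn_core import Adam

    # every optimizer constructor now takes `lr` (learning rate) instead of `alpha`
    optim = Adam(lr=0.002, b1=0.9, b2=0.999, eps=1e-8)

    W = np.zeros(10, dtype=np.float32)   # parameters, updated in-place
    dW = np.ones(10, dtype=np.float32)   # stand-in gradient from backprop
    optim.update(dW, W)                  # W += compute(dW, W): the -lr-scaled, bias-corrected step

    # schedules (e.g. Learner.rate) read and write the same renamed attribute:
    optim.lr *= 0.5

One convenient side effect of renaming the keyword rather than aliasing it: any caller still passing `alpha=` raises a TypeError immediately, so stragglers are easy to find.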