rename alpha to lr where applicable

Connor Olding 2017-07-02 05:39:51 +00:00
parent 9706aaabbb
commit 6a3f047ddc
3 changed files with 31 additions and 31 deletions
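
For callers, the rename is a breaking change: every optimizer constructor's keyword argument and the public attribute change name together. A minimal migration sketch, assuming the Adam class from the diff below is in scope (the module layout is not confirmed by this page):

    # before this commit:
    #   optim = Adam(alpha=0.002)
    #   optim.alpha *= 0.5
    # after this commit:
    optim = Adam(lr=0.002)
    optim.lr *= 0.5  # the attribute is renamed along with the keyword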

onn.py (18 changed lines)

@@ -108,13 +108,13 @@ class FTML(Optimizer):
     # paper: http://www.cse.ust.hk/~szhengac/papers/icml17.pdf
     # author's implementation: https://github.com/szhengac/optim/commit/923555e
-    def __init__(self, alpha=0.0025, b1=0.6, b2=0.999, eps=1e-8):
+    def __init__(self, lr=0.0025, b1=0.6, b2=0.999, eps=1e-8):
         self.iterations = _0
         self.b1 = _f(b1) # decay term
         self.b2 = _f(b2) # decay term
         self.eps = _f(eps)
-        super().__init__(alpha)
+        super().__init__(lr)
 
     def reset(self):
         self.dt1 = None
@@ -137,14 +137,14 @@ class FTML(Optimizer):
         self.b2_t *= self.b2
 
         # hardly an elegant solution.
-        alpha = max(self.alpha, self.eps)
+        lr = max(self.lr, self.eps)
 
         # same as Adam's vt.
         self.vt[:] = self.b2 * self.vt + (1 - self.b2) * dW * dW
 
         # you can factor "inner" out of Adam as well.
         inner = np.sqrt(self.vt / (1 - self.b2_t)) + self.eps
-        self.dt[:] = (1 - self.b1_t) / alpha * inner
+        self.dt[:] = (1 - self.b1_t) / lr * inner
 
         sigma_t = self.dt - self.b1 * self.dt1
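
A side note on the clamp above: unlike the other optimizers in this commit, which multiply by the learning rate, FTML divides by it when forming self.dt, so an lr of exactly zero would raise a division error rather than simply take no step. The max() against eps keeps the quotient finite:

    # illustration only; mirrors the guard in the hunk above.
    eps = 1e-8
    lr = max(0.0, eps)          # a zero learning rate is clamped to eps
    dt = (1 - 0.6) / lr * 1.0   # finite, where (1 - b1_t) / 0.0 would fail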
@@ -159,17 +159,17 @@ class YellowFin(Momentum):
     # knowyourmeme: http://cs.stanford.edu/~zjian/project/YellowFin/
     # author's implementation: https://github.com/JianGoForIt/YellowFin/blob/master/tuner_utils/yellowfin.py
-    def __init__(self, alpha=0.1, mu=0.0, beta=0.999, curv_win_width=20):
-        self.alpha_default = _f(alpha)
+    def __init__(self, lr=0.1, mu=0.0, beta=0.999, curv_win_width=20):
+        self.lr_default = _f(lr)
         self.mu_default = _f(mu)
         self.beta = _f(beta)
         self.curv_win_width = int(curv_win_width)
-        super().__init__(alpha=alpha, mu=mu, nesterov=False)
+        super().__init__(lr=lr, mu=mu, nesterov=False)
 
     def reset(self):
         super().reset()
-        self.alpha = self.alpha_default
+        self.lr = self.lr_default
         self.mu = self.mu_default
         self.step = 0
@@ -1077,7 +1077,7 @@ def run(program, args=None):
         # use plain SGD in warmup to prevent (or possibly cause?) numeric issues
         temp_optim = learner.optim
         temp_loss = ritual.loss
-        learner.optim = Optimizer(alpha=0.001)
+        learner.optim = Optimizer(lr=0.001)
         ritual.loss = Absolute() # less likely to blow up; more general
 
         # NOTE: experiment: trying const batches and batch_size
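
The warmup block above stashes the current optimizer and loss, swaps in a plain SGD step (the base Optimizer) and the more forgiving Absolute loss, and presumably restores the originals once warmup ends; the restore is outside this hunk. A sketch of the whole pattern, where warmup_steps and train_batch are hypothetical names standing in for code not shown on this page:

    temp_optim = learner.optim
    temp_loss = ritual.loss
    learner.optim = Optimizer(lr=0.001)  # plain SGD: compute() is just -lr * dW
    ritual.loss = Absolute()

    for _ in range(warmup_steps):        # hypothetical
        train_batch()                    # hypothetical

    learner.optim = temp_optim           # restore the real optimizer and loss
    ritual.loss = temp_loss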


@@ -239,15 +239,15 @@ class L1L2(Regularizer):
 # Optimizers {{{1
 
 class Optimizer:
-    def __init__(self, alpha=0.1):
-        self.alpha = _f(alpha) # learning rate
+    def __init__(self, lr=0.1):
+        self.lr = _f(lr) # learning rate
         self.reset()
 
     def reset(self):
         pass
 
     def compute(self, dW, W):
-        return -self.alpha * dW
+        return -self.lr * dW
 
     def update(self, dW, W):
         W += self.compute(dW, W)
@@ -256,11 +256,11 @@ class Optimizer:
 # https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/optimizers/optimizer.h
 class Momentum(Optimizer):
-    def __init__(self, alpha=0.01, mu=0.9, nesterov=False):
+    def __init__(self, lr=0.01, mu=0.9, nesterov=False):
         self.mu = _f(mu) # momentum
         self.nesterov = bool(nesterov)
-        super().__init__(alpha)
+        super().__init__(lr)
 
     def reset(self):
         self.Vprev = None
@@ -269,10 +269,10 @@ class Momentum(Optimizer):
         if self.Vprev is None:
             self.Vprev = np.copy(dW)
-        V = self.mu * self.Vprev - self.alpha * dW
+        V = self.mu * self.Vprev - self.lr * dW
         self.Vprev[:] = V
 
         if self.nesterov:
-            return self.mu * V - self.alpha * dW
+            return self.mu * V - self.lr * dW
 
         return V
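
For reference, the nesterov branch above returns the lookahead step mu * V - lr * dW rather than the raw velocity V; this is the usual reformulation of Nesterov momentum that lets the gradient be evaluated at the current weights. A toy trace under a constant gradient, using the same recurrence as the hunk above:

    import numpy as np

    mu, lr = 0.9, 0.01
    dW = np.array([1.0])     # pretend the gradient never changes
    Vprev = np.copy(dW)      # matches the first-call initialization above

    for _ in range(3):
        V = mu * Vprev - lr * dW    # classical momentum velocity
        Vprev[:] = V
        print(V, mu * V - lr * dW)  # plain step vs. Nesterov lookahead step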
@@ -283,7 +283,7 @@ class RMSprop(Optimizer):
     # * RMSprop == Adagrad when
     #   RMSprop.mu == 1
-    def __init__(self, alpha=0.0001, mu=0.99, eps=1e-8):
+    def __init__(self, lr=0.0001, mu=0.99, eps=1e-8):
         self.mu = _f(mu) # decay term
         self.eps = _f(eps)
@@ -294,7 +294,7 @@ class RMSprop(Optimizer):
         # an input decays to 1/e its original amplitude over 99.5 epochs.
         # (this is from DSP, so how relevant it is in SGD is debatable)
 
-        super().__init__(alpha)
+        super().__init__(lr)
 
     def reset(self):
         self.g = None
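
The 99.5 figure in the comment above checks out: under the running average g = mu * g + (1 - mu) * x, each step scales an old input's contribution by mu, so it falls to 1/e of its original weight after n = -1 / ln(mu) steps, which is about 99.5 when mu = 0.99:

    import math
    print(-1.0 / math.log(0.99))  # ~99.499, the "99.5 epochs" from the comment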
@@ -309,7 +309,7 @@ class RMSprop(Optimizer):
         #self.g += (dW * dW - self.g) * (1 - self.mu)
 
         # finally sqrt it to complete the running root-mean-square approximation
-        return -self.alpha * dW / (np.sqrt(self.g) + self.eps)
+        return -self.lr * dW / (np.sqrt(self.g) + self.eps)
 
 class Adam(Optimizer):
     # paper: https://arxiv.org/abs/1412.6980
@@ -321,14 +321,14 @@ class Adam(Optimizer):
     #   Adam.b1 == 0
     #   Adam.b2 == RMSprop.mu
-    def __init__(self, alpha=0.002, b1=0.9, b2=0.999, eps=1e-8):
+    def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8):
         self.b1 = _f(b1) # decay term
         self.b2 = _f(b2) # decay term
         self.b1_t_default = _f(b1) # decay term power t
         self.b2_t_default = _f(b2) # decay term power t
         self.eps = _f(eps)
-        super().__init__(alpha)
+        super().__init__(lr)
 
     def reset(self):
         self.mt = None
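
The reduction in the comments above holds up to Adam's bias correction: with b1 == 0 the first-moment term reduces to dW, but the denominator still divides vt by (1 - b2_t), which only matches RMSprop once b2_t has decayed toward zero. A quick numeric check of that claim:

    import numpy as np

    lr, mu, eps = 0.001, 0.99, 1e-8
    dW = np.array([0.5, -0.25])          # constant gradient for simplicity
    g = np.zeros_like(dW)                # RMSprop's running mean of squares
    vt = np.zeros_like(dW)               # Adam's vt: the same recurrence
    b2_t = 1.0

    for t in range(2000):
        g = mu * g + (1 - mu) * dW * dW
        vt = mu * vt + (1 - mu) * dW * dW
        b2_t *= mu                       # Adam's running power of b2

    rms = -lr * dW / (np.sqrt(g) + eps)
    adam = -lr * dW / (np.sqrt(vt / (1 - b2_t)) + eps)  # the b1 == 0 case
    print(rms, adam)  # nearly identical once (1 - b2_t) is nearly 1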
@@ -350,8 +350,8 @@ class Adam(Optimizer):
         self.mt[:] = self.b1 * self.mt + (1 - self.b1) * dW
         self.vt[:] = self.b2 * self.vt + (1 - self.b2) * dW * dW
 
-        return -self.alpha * (self.mt / (1 - self.b1_t)) \
-            / (np.sqrt(self.vt / (1 - self.b2_t)) + self.eps)
+        return -self.lr * (self.mt / (1 - self.b1_t)) \
+            / (np.sqrt(self.vt / (1 - self.b2_t)) + self.eps)
 
 class Nadam(Optimizer):
     # paper: https://arxiv.org/abs/1412.6980
@@ -360,12 +360,12 @@ class Nadam(Optimizer):
     # lifted from https://github.com/fchollet/keras/blob/5d38b04/keras/optimizers.py#L530
     # lifted from https://github.com/jpilaul/IFT6266_project/blob/master/Models/Algo_Momentum.py
-    def __init__(self, alpha=0.002, b1=0.9, b2=0.999, eps=1e-8):
+    def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8):
         self.b1 = _f(b1) # decay term
         self.b2 = _f(b2) # decay term
         self.eps = _f(eps)
-        super().__init__(alpha)
+        super().__init__(lr)
 
     def reset(self):
         self.mt = None
@@ -398,7 +398,7 @@ class Nadam(Optimizer):
         mt_bar = (1 - ut0) * gp + ut1 * mtp
-        return -self.alpha * mt_bar / (np.sqrt(vtp) + self.eps)
+        return -self.lr * mt_bar / (np.sqrt(vtp) + self.eps)
 
 # Abstract Layers {{{1
@@ -1071,7 +1071,7 @@ class Learner:
     def __init__(self, optim, epochs=100, rate=None):
         assert isinstance(optim, Optimizer)
         self.optim = optim
-        self.start_rate = rate # None is okay; it'll use optim.alpha instead.
+        self.start_rate = rate # None is okay; it'll use optim.lr instead.
         self.epochs = int(epochs)
         self.reset()
@@ -1093,15 +1093,15 @@ class Learner:
     @property
     def rate(self):
-        return self.optim.alpha
+        return self.optim.lr
 
     @rate.setter
     def rate(self, new_rate):
-        self.optim.alpha = new_rate
+        self.optim.lr = new_rate
 
     def rate_at(self, epoch):
         if self.start_rate is None:
-            return self.optim.alpha
+            return self.optim.lr
         return self.start_rate
 
     def next(self):
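
Since the rate property above just forwards to optim.lr, schedules written against Learner survive the rename unchanged. A hypothetical subclass sketch (AnnealingLearner is not part of this commit, and it assumes next() applies rate_at each epoch, which this page shows the plumbing for but does not itself confirm):

    class AnnealingLearner(Learner):     # hypothetical
        def __init__(self, optim, epochs=100, rate=None, halve_every=10):
            self.halve_every = int(halve_every)
            super().__init__(optim, epochs, rate)

        def rate_at(self, epoch):
            start = super().rate_at(epoch)  # falls back to optim.lr when rate is None
            return start * 0.5 ** (epoch // self.halve_every)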


@@ -217,7 +217,7 @@ while learner.next():
     quiet = learner.epoch != learner.epochs
     measure_error(quiet=quiet)
 
-    logs.learning_rate.append(optim.alpha)
+    logs.learning_rate.append(optim.lr)
 
     if getattr(optim, 'mu', None):
         logs.momentum.append(optim.mu)
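
One quirk of the logging guard above: getattr(optim, 'mu', None) is tested for truthiness, so a momentum of exactly 0.0 is skipped just like an optimizer with no mu attribute at all. If that distinction ever matters, a stricter variant would be:

    # hypothetical; the truthy check in the diff may well be intentional.
    mu = getattr(optim, 'mu', None)
    if mu is not None:
        logs.momentum.append(mu)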