rename alpha to lr where applicable

Connor Olding 2017-07-02 05:39:51 +00:00
parent 9706aaabbb
commit 6a3f047ddc
3 changed files with 31 additions and 31 deletions

onn.py (18 changes)

@@ -108,13 +108,13 @@ class FTML(Optimizer):
     # paper: http://www.cse.ust.hk/~szhengac/papers/icml17.pdf
     # author's implementation: https://github.com/szhengac/optim/commit/923555e

-    def __init__(self, alpha=0.0025, b1=0.6, b2=0.999, eps=1e-8):
+    def __init__(self, lr=0.0025, b1=0.6, b2=0.999, eps=1e-8):
         self.iterations = _0
         self.b1 = _f(b1) # decay term
         self.b2 = _f(b2) # decay term
         self.eps = _f(eps)

-        super().__init__(alpha)
+        super().__init__(lr)

     def reset(self):
         self.dt1 = None

@@ -137,14 +137,14 @@ class FTML(Optimizer):
         self.b2_t *= self.b2

         # hardly an elegant solution.
-        alpha = max(self.alpha, self.eps)
+        lr = max(self.lr, self.eps)

         # same as Adam's vt.
         self.vt[:] = self.b2 * self.vt + (1 - self.b2) * dW * dW

         # you can factor out "inner" out of Adam as well.
         inner = np.sqrt(self.vt / (1 - self.b2_t)) + self.eps
-        self.dt[:] = (1 - self.b1_t) / alpha * inner
+        self.dt[:] = (1 - self.b1_t) / lr * inner

         sigma_t = self.dt - self.b1 * self.dt1

@@ -159,17 +159,17 @@ class YellowFin(Momentum):
     # knowyourmeme: http://cs.stanford.edu/~zjian/project/YellowFin/
     # author's implementation: https://github.com/JianGoForIt/YellowFin/blob/master/tuner_utils/yellowfin.py

-    def __init__(self, alpha=0.1, mu=0.0, beta=0.999, curv_win_width=20):
-        self.alpha_default = _f(alpha)
+    def __init__(self, lr=0.1, mu=0.0, beta=0.999, curv_win_width=20):
+        self.lr_default = _f(lr)
         self.mu_default = _f(mu)
         self.beta = _f(beta)
         self.curv_win_width = int(curv_win_width)

-        super().__init__(alpha=alpha, mu=mu, nesterov=False)
+        super().__init__(lr=lr, mu=mu, nesterov=False)

     def reset(self):
         super().reset()
-        self.alpha = self.alpha_default
+        self.lr = self.lr_default
         self.mu = self.mu_default
         self.step = 0

@@ -1077,7 +1077,7 @@ def run(program, args=None):
         # use plain SGD in warmup to prevent (or possibly cause?) numeric issues
         temp_optim = learner.optim
         temp_loss = ritual.loss
-        learner.optim = Optimizer(alpha=0.001)
+        learner.optim = Optimizer(lr=0.001)
         ritual.loss = Absolute() # less likely to blow up; more general

         # NOTE: experiment: trying const batches and batch_size
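
For context, the hunk above is the warmup path: it stashes the configured optimizer and loss, runs the warmup with a plain-SGD Optimizer and an absolute-error loss, then restores the originals. A minimal standalone sketch of that swap pattern, with stand-in classes that only mimic the attributes used in the hunk (they are not the repository's real Learner, Ritual, or Absolute):

    class Optimizer:
        def __init__(self, lr=0.1):
            self.lr = lr

    class Absolute:  # stand-in for the absolute-error loss
        pass

    class Learner:
        def __init__(self, optim):
            self.optim = optim

    class Ritual:
        def __init__(self, loss):
            self.loss = loss

    learner = Learner(Optimizer(lr=0.01))
    ritual = Ritual(loss=object())      # whatever loss the run was configured with

    # stash the configured objects and swap in the safer ones for warmup...
    temp_optim, temp_loss = learner.optim, ritual.loss
    learner.optim = Optimizer(lr=0.001)
    ritual.loss = Absolute()

    # ...(warmup batches would run here)...

    # ...then restore the originals for the real training run.
    learner.optim, ritual.loss = temp_optim, temp_loss
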

(second changed file; filename not shown)

@@ -239,15 +239,15 @@ class L1L2(Regularizer):

 # Optimizers {{{1

 class Optimizer:
-    def __init__(self, alpha=0.1):
-        self.alpha = _f(alpha) # learning rate
+    def __init__(self, lr=0.1):
+        self.lr = _f(lr) # learning rate
         self.reset()

     def reset(self):
         pass

     def compute(self, dW, W):
-        return -self.alpha * dW
+        return -self.lr * dW

     def update(self, dW, W):
         W += self.compute(dW, W)
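
As the base class above shows, Optimizer by itself is plain SGD: compute() scales the negative gradient by lr, and update() adds that step to the weights in place. A tiny self-contained sketch of the same arithmetic (NumPy only; toy values):

    import numpy as np

    lr = 0.1
    W = np.zeros(4, dtype=np.float32)                  # parameters
    dW = np.array([1.0, -2.0, 0.5, 0.0], np.float32)   # gradient of the loss w.r.t. W

    delta = -lr * dW   # what compute(dW, W) returns
    W += delta         # what update(dW, W) does in place
    # W is now [-0.1, 0.2, -0.05, 0.0]
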
@@ -256,11 +256,11 @@ class Optimizer:

 # https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/optimizers/optimizer.h
 class Momentum(Optimizer):
-    def __init__(self, alpha=0.01, mu=0.9, nesterov=False):
+    def __init__(self, lr=0.01, mu=0.9, nesterov=False):
         self.mu = _f(mu) # momentum
         self.nesterov = bool(nesterov)

-        super().__init__(alpha)
+        super().__init__(lr)

     def reset(self):
         self.Vprev = None

@@ -269,10 +269,10 @@ class Momentum(Optimizer):
         if self.Vprev is None:
             self.Vprev = np.copy(dW)

-        V = self.mu * self.Vprev - self.alpha * dW
+        V = self.mu * self.Vprev - self.lr * dW
         self.Vprev[:] = V

         if self.nesterov:
-            return self.mu * V - self.alpha * dW
+            return self.mu * V - self.lr * dW

         return V
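
The compute() above keeps a single velocity buffer: classical momentum returns V = mu * Vprev - lr * dW, and the Nesterov flag returns the look-ahead step mu * V - lr * dW instead. A small sketch of one step mirroring that arithmetic (NumPy only; toy numbers):

    import numpy as np

    mu, lr = 0.9, 0.01
    Vprev = np.array([0.5, -0.5])      # velocity carried over from the last step
    dW = np.array([1.0, 2.0])          # current gradient

    V = mu * Vprev - lr * dW           # classical momentum step: [0.44, -0.47]
    step_nesterov = mu * V - lr * dW   # Nesterov look-ahead:     [0.386, -0.443]
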
@@ -283,7 +283,7 @@ class RMSprop(Optimizer):
     # * RMSprop == Adagrad when
     #   RMSprop.mu == 1

-    def __init__(self, alpha=0.0001, mu=0.99, eps=1e-8):
+    def __init__(self, lr=0.0001, mu=0.99, eps=1e-8):
         self.mu = _f(mu) # decay term
         self.eps = _f(eps)

@@ -294,7 +294,7 @@ class RMSprop(Optimizer):
         # an input decays to 1/e its original amplitude over 99.5 epochs.
         # (this is from DSP, so how relevant it is in SGD is debatable)

-        super().__init__(alpha)
+        super().__init__(lr)

     def reset(self):
         self.g = None
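
The 99.5-epochs figure in the comment above is just the time constant of an exponential moving average with mu = 0.99: each update multiplies the old contents of g by mu, and mu**n reaches 1/e at n = -1/ln(mu), roughly 99.5 updates. A quick check:

    import math

    mu = 0.99
    n = -1.0 / math.log(mu)   # updates until an input decays to 1/e of its amplitude
    print(n)                  # ~99.499
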
@@ -309,7 +309,7 @@ class RMSprop(Optimizer):
         #self.g += (dW * dW - self.g) * (1 - self.mu)
         # finally sqrt it to complete the running root-mean-square approximation
-        return -self.alpha * dW / (np.sqrt(self.g) + self.eps)
+        return -self.lr * dW / (np.sqrt(self.g) + self.eps)

 class Adam(Optimizer):
     # paper: https://arxiv.org/abs/1412.6980

@@ -321,14 +321,14 @@ class Adam(Optimizer):
     # Adam.b1 == 0
     # Adam.b2 == RMSprop.mu

-    def __init__(self, alpha=0.002, b1=0.9, b2=0.999, eps=1e-8):
+    def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8):
         self.b1 = _f(b1) # decay term
         self.b2 = _f(b2) # decay term
         self.b1_t_default = _f(b1) # decay term power t
         self.b2_t_default = _f(b2) # decay term power t
         self.eps = _f(eps)

-        super().__init__(alpha)
+        super().__init__(lr)

     def reset(self):
         self.mt = None

@@ -350,7 +350,7 @@ class Adam(Optimizer):
         self.mt[:] = self.b1 * self.mt + (1 - self.b1) * dW
         self.vt[:] = self.b2 * self.vt + (1 - self.b2) * dW * dW

-        return -self.alpha * (self.mt / (1 - self.b1_t)) \
+        return -self.lr * (self.mt / (1 - self.b1_t)) \
             / (np.sqrt(self.vt / (1 - self.b2_t)) + self.eps)

 class Nadam(Optimizer):
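
For reference, the return above is the standard bias-corrected Adam step: the first-moment estimate mt and second-moment estimate vt are each divided by 1 - b^t before forming the update. A self-contained sketch of the same formulas with this class's defaults (NumPy only; the real class keeps mt, vt, and the b1_t/b2_t powers as state on the optimizer object):

    import numpy as np

    lr, b1, b2, eps = 0.002, 0.9, 0.999, 1e-8
    dW = np.array([0.1, -0.3])    # pretend gradient, repeated every step

    mt = np.zeros_like(dW)        # first-moment (mean) estimate
    vt = np.zeros_like(dW)        # second-moment (uncentered variance) estimate
    b1_t, b2_t = 1.0, 1.0         # running powers of b1 and b2

    for t in range(1, 4):
        b1_t *= b1
        b2_t *= b2
        mt = b1 * mt + (1 - b1) * dW
        vt = b2 * vt + (1 - b2) * dW * dW
        step = -lr * (mt / (1 - b1_t)) / (np.sqrt(vt / (1 - b2_t)) + eps)
        # for a constant gradient this is roughly -lr * sign(dW) each step
        print(t, step)
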
@@ -360,12 +360,12 @@ class Nadam(Optimizer):
     # lifted from https://github.com/fchollet/keras/blob/5d38b04/keras/optimizers.py#L530
     # lifted from https://github.com/jpilaul/IFT6266_project/blob/master/Models/Algo_Momentum.py

-    def __init__(self, alpha=0.002, b1=0.9, b2=0.999, eps=1e-8):
+    def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8):
         self.b1 = _f(b1) # decay term
         self.b2 = _f(b2) # decay term
         self.eps = _f(eps)

-        super().__init__(alpha)
+        super().__init__(lr)

     def reset(self):
         self.mt = None

@@ -398,7 +398,7 @@ class Nadam(Optimizer):
         mt_bar = (1 - ut0) * gp + ut1 * mtp

-        return -self.alpha * mt_bar / (np.sqrt(vtp) + self.eps)
+        return -self.lr * mt_bar / (np.sqrt(vtp) + self.eps)

 # Abstract Layers {{{1

@@ -1071,7 +1071,7 @@ class Learner:
     def __init__(self, optim, epochs=100, rate=None):
         assert isinstance(optim, Optimizer)
         self.optim = optim
-        self.start_rate = rate # None is okay; it'll use optim.alpha instead.
+        self.start_rate = rate # None is okay; it'll use optim.lr instead.
         self.epochs = int(epochs)
         self.reset()

@@ -1093,15 +1093,15 @@ class Learner:
     @property
     def rate(self):
-        return self.optim.alpha
+        return self.optim.lr

     @rate.setter
     def rate(self, new_rate):
-        self.optim.alpha = new_rate
+        self.optim.lr = new_rate

     def rate_at(self, epoch):
         if self.start_rate is None:
-            return self.optim.alpha
+            return self.optim.lr
         return self.start_rate

     def next(self):
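
The property above is why the rename touches so little calling code: Learner.rate simply proxies the optimizer's attribute, so schedules keep assigning to learner.rate and only the name underneath changes from alpha to lr. A minimal sketch of that delegation (stand-in classes, not the repository's):

    class Optimizer:
        def __init__(self, lr=0.1):
            self.lr = lr

    class Learner:
        def __init__(self, optim):
            self.optim = optim

        @property
        def rate(self):
            return self.optim.lr

        @rate.setter
        def rate(self, new_rate):
            self.optim.lr = new_rate

    learner = Learner(Optimizer(lr=0.1))
    learner.rate = 0.01          # a schedule only ever touches .rate...
    print(learner.optim.lr)      # ...and the optimizer sees 0.01 underneath.
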

(third changed file; filename not shown)

@@ -217,7 +217,7 @@ while learner.next():
     quiet = learner.epoch != learner.epochs
     measure_error(quiet=quiet)

-    logs.learning_rate.append(optim.alpha)
+    logs.learning_rate.append(optim.lr)

     if getattr(optim, 'mu', None):
         logs.momentum.append(optim.mu)
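
The hunk above reads the current hyperparameters off the optimizer once per epoch; getattr with a None default keeps the momentum log working for optimizers that have no mu attribute. A minimal sketch of that pattern with stand-in objects:

    from types import SimpleNamespace

    optim = SimpleNamespace(lr=0.01)          # e.g. plain SGD: no .mu attribute
    logs = SimpleNamespace(learning_rate=[], momentum=[])

    logs.learning_rate.append(optim.lr)
    if getattr(optim, 'mu', None):            # None (falsy) when there is no momentum term
        logs.momentum.append(optim.mu)

    print(logs.learning_rate, logs.momentum)  # [0.01] []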