rename alpha to lr where applicable

parent 9706aaabbb
commit 6a3f047ddc

3 changed files with 31 additions and 31 deletions
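The rename is mechanical: every optimizer's alpha keyword argument and attribute becomes lr, so only call sites that pass the learning rate by name need touching. A minimal before/after sketch, assuming Adam is importable from onn_core as the diff below suggests (the import path is an assumption):

    # hypothetical caller, not part of this commit
    from onn_core import Adam   # assumption: Adam is defined in onn_core.py

    # before this commit:
    #   optim = Adam(alpha=0.002, b1=0.9, b2=0.999, eps=1e-8)

    # after this commit, the learning rate is passed (and stored) as lr:
    optim = Adam(lr=0.002, b1=0.9, b2=0.999, eps=1e-8)
    print(optim.lr)             # optim.alpha no longer exists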
onn.py (18 changes)

@@ -108,13 +108,13 @@ class FTML(Optimizer):
     # paper: http://www.cse.ust.hk/~szhengac/papers/icml17.pdf
     # author's implementation: https://github.com/szhengac/optim/commit/923555e

-    def __init__(self, alpha=0.0025, b1=0.6, b2=0.999, eps=1e-8):
+    def __init__(self, lr=0.0025, b1=0.6, b2=0.999, eps=1e-8):
         self.iterations = _0
         self.b1 = _f(b1) # decay term
         self.b2 = _f(b2) # decay term
         self.eps = _f(eps)

-        super().__init__(alpha)
+        super().__init__(lr)

     def reset(self):
         self.dt1 = None
@@ -137,14 +137,14 @@ class FTML(Optimizer):
         self.b2_t *= self.b2

         # hardly an elegant solution.
-        alpha = max(self.alpha, self.eps)
+        lr = max(self.lr, self.eps)

         # same as Adam's vt.
         self.vt[:] = self.b2 * self.vt + (1 - self.b2) * dW * dW

         # you can factor out "inner" out of Adam as well.
         inner = np.sqrt(self.vt / (1 - self.b2_t)) + self.eps
-        self.dt[:] = (1 - self.b1_t) / alpha * inner
+        self.dt[:] = (1 - self.b1_t) / lr * inner

         sigma_t = self.dt - self.b1 * self.dt1
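The clamp above exists because lr ends up in a denominator: dt scales with (1 - b1_t) / lr, so a learning rate of exactly zero would divide by zero. A standalone sketch of that reasoning with NumPy, where b1_t, b2_t, vt and dt1 are made-up stand-ins for the FTML state, not the real attributes:

    import numpy as np

    eps = 1e-8
    b1, b1_t, b2_t = 0.6, 0.6, 0.999
    vt = np.full(3, 0.04)              # stand-in for self.vt
    dt1 = np.zeros(3)                  # stand-in for self.dt1

    for requested_lr in (0.0025, 0.0):
        lr = max(requested_lr, eps)            # the "hardly elegant" clamp
        inner = np.sqrt(vt / (1 - b2_t)) + eps
        dt = (1 - b1_t) / lr * inner           # finite even when requested_lr == 0
        sigma_t = dt - b1 * dt1
        print(requested_lr, dt[0], sigma_t[0])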
@@ -159,17 +159,17 @@ class YellowFin(Momentum):
     # knowyourmeme: http://cs.stanford.edu/~zjian/project/YellowFin/
     # author's implementation: https://github.com/JianGoForIt/YellowFin/blob/master/tuner_utils/yellowfin.py

-    def __init__(self, alpha=0.1, mu=0.0, beta=0.999, curv_win_width=20):
-        self.alpha_default = _f(alpha)
+    def __init__(self, lr=0.1, mu=0.0, beta=0.999, curv_win_width=20):
+        self.lr_default = _f(lr)
         self.mu_default = _f(mu)
         self.beta = _f(beta)
         self.curv_win_width = int(curv_win_width)

-        super().__init__(alpha=alpha, mu=mu, nesterov=False)
+        super().__init__(lr=lr, mu=mu, nesterov=False)

     def reset(self):
         super().reset()
-        self.alpha = self.alpha_default
+        self.lr = self.lr_default
         self.mu = self.mu_default

         self.step = 0
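YellowFin keeps the constructor values around as lr_default / mu_default because it retunes lr and mu during training, and reset() restores the starting point. A hedged usage sketch (assuming YellowFin is importable from onn, matching the file this hunk lives in):

    from onn import YellowFin    # assumption: module name matches the file name

    optim = YellowFin(lr=0.1, mu=0.0, beta=0.999, curv_win_width=20)
    # ... training would adapt optim.lr and optim.mu on the fly here ...
    optim.reset()                # back to the constructor defaults
    assert optim.lr == optim.lr_default
    assert optim.mu == optim.mu_default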
@@ -1077,7 +1077,7 @@ def run(program, args=None):
         # use plain SGD in warmup to prevent (or possibly cause?) numeric issues
         temp_optim = learner.optim
         temp_loss = ritual.loss
-        learner.optim = Optimizer(alpha=0.001)
+        learner.optim = Optimizer(lr=0.001)
         ritual.loss = Absolute() # less likely to blow up; more general

         # NOTE: experiment: trying const batches and batch_size
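The warmup block in run() is a stash-swap-restore pattern: keep the real optimizer and loss aside, train briefly with a bare Optimizer (plain SGD) at a small lr and an absolute-error loss, then put the originals back. A toy, self-contained sketch of the same pattern; ToyOptimizer, Box and the loss strings here are stand-ins, not the script's real classes:

    class ToyOptimizer:                 # stand-in for Optimizer; only carries lr
        def __init__(self, lr=0.1):
            self.lr = lr

    class Box:                          # stand-in holding .optim / .loss slots
        pass

    learner, ritual = Box(), Box()
    learner.optim, ritual.loss = ToyOptimizer(lr=0.01), "real-loss"

    temp_optim, temp_loss = learner.optim, ritual.loss   # stash the real objects
    learner.optim = ToyOptimizer(lr=0.001)               # plain SGD for warmup
    ritual.loss = "absolute-loss"                        # stand-in for Absolute()

    # ... warmup epochs would run here ...

    learner.optim, ritual.loss = temp_optim, temp_loss   # restore afterwards
    print(learner.optim.lr)                              # 0.01 again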
onn_core.py (42 changes)

@@ -239,15 +239,15 @@ class L1L2(Regularizer):
 # Optimizers {{{1

 class Optimizer:
-    def __init__(self, alpha=0.1):
-        self.alpha = _f(alpha) # learning rate
+    def __init__(self, lr=0.1):
+        self.lr = _f(lr) # learning rate
         self.reset()

     def reset(self):
         pass

     def compute(self, dW, W):
-        return -self.alpha * dW
+        return -self.lr * dW

     def update(self, dW, W):
         W += self.compute(dW, W)
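After the rename the base class is still plain SGD: compute() returns -lr * dW and update() adds it to W in place. A quick self-contained check, re-stating the class from the hunk above with _f assumed to be a float32 cast:

    import numpy as np

    _f = np.float32    # assumption: _f is the library's float-conversion helper

    class Optimizer:
        def __init__(self, lr=0.1):
            self.lr = _f(lr)  # learning rate
            self.reset()

        def reset(self):
            pass

        def compute(self, dW, W):
            return -self.lr * dW

        def update(self, dW, W):
            W += self.compute(dW, W)

    W = np.ones(4, dtype=np.float32)
    dW = np.full(4, 0.5, dtype=np.float32)
    Optimizer(lr=0.1).update(dW, W)
    print(W)   # [0.95 0.95 0.95 0.95], i.e. W -= lr * dW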
@@ -256,11 +256,11 @@ class Optimizer:
 # https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/optimizers/optimizer.h

 class Momentum(Optimizer):
-    def __init__(self, alpha=0.01, mu=0.9, nesterov=False):
+    def __init__(self, lr=0.01, mu=0.9, nesterov=False):
         self.mu = _f(mu) # momentum
         self.nesterov = bool(nesterov)

-        super().__init__(alpha)
+        super().__init__(lr)

     def reset(self):
         self.Vprev = None
@@ -269,10 +269,10 @@ class Momentum(Optimizer):
         if self.Vprev is None:
             self.Vprev = np.copy(dW)

-        V = self.mu * self.Vprev - self.alpha * dW
+        V = self.mu * self.Vprev - self.lr * dW
         self.Vprev[:] = V
         if self.nesterov:
-            return self.mu * V - self.alpha * dW
+            return self.mu * V - self.lr * dW

         return V
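For reference, the step above keeps a velocity V = mu * Vprev - lr * dW and, when nesterov=True, returns the look-ahead mu * V - lr * dW instead of V itself. A small numeric sketch of the two branches, standalone rather than via the class:

    import numpy as np

    lr, mu = 0.01, 0.9
    dW = np.array([1.0, -2.0])
    Vprev = np.zeros(2)

    V = mu * Vprev - lr * dW           # classical momentum step
    nesterov_step = mu * V - lr * dW   # returned when nesterov=True
    print(V, nesterov_step)            # [-0.01  0.02] [-0.019  0.038]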
@@ -283,7 +283,7 @@ class RMSprop(Optimizer):
     # * RMSprop == Adagrad when
     #   RMSprop.mu == 1

-    def __init__(self, alpha=0.0001, mu=0.99, eps=1e-8):
+    def __init__(self, lr=0.0001, mu=0.99, eps=1e-8):
         self.mu = _f(mu) # decay term
         self.eps = _f(eps)
@@ -294,7 +294,7 @@ class RMSprop(Optimizer):
         # an input decays to 1/e its original amplitude over 99.5 epochs.
         # (this is from DSP, so how relevant it is in SGD is debatable)

-        super().__init__(alpha)
+        super().__init__(lr)

     def reset(self):
         self.g = None
@@ -309,7 +309,7 @@ class RMSprop(Optimizer):
         #self.g += (dW * dW - self.g) * (1 - self.mu)

         # finally sqrt it to complete the running root-mean-square approximation
-        return -self.alpha * dW / (np.sqrt(self.g) + self.eps)
+        return -self.lr * dW / (np.sqrt(self.g) + self.eps)

 class Adam(Optimizer):
     # paper: https://arxiv.org/abs/1412.6980
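Written out, the RMSprop step above is W <- W - lr * dW / (sqrt(g) + eps), with g a running mean of squared gradients. The active update of g is not visible in this hunk, so the sketch below infers it from the commented-out line (the two forms are algebraically identical):

    import numpy as np

    lr, mu, eps = 0.0001, 0.99, 1e-8
    dW = np.array([0.5, -0.5])
    g = np.zeros(2)

    g = mu * g + (1 - mu) * dW * dW        # same as g += (dW * dW - g) * (1 - mu)
    step = -lr * dW / (np.sqrt(g) + eps)   # the return value in the hunk above
    print(step)                            # first step: about -lr * sign(dW) / sqrt(1 - mu)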
@@ -321,14 +321,14 @@ class Adam(Optimizer):
     #   Adam.b1 == 0
     #   Adam.b2 == RMSprop.mu

-    def __init__(self, alpha=0.002, b1=0.9, b2=0.999, eps=1e-8):
+    def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8):
         self.b1 = _f(b1) # decay term
         self.b2 = _f(b2) # decay term
         self.b1_t_default = _f(b1) # decay term power t
         self.b2_t_default = _f(b2) # decay term power t
         self.eps = _f(eps)

-        super().__init__(alpha)
+        super().__init__(lr)

     def reset(self):
         self.mt = None
@@ -350,8 +350,8 @@ class Adam(Optimizer):
         self.mt[:] = self.b1 * self.mt + (1 - self.b1) * dW
         self.vt[:] = self.b2 * self.vt + (1 - self.b2) * dW * dW

-        return -self.alpha * (self.mt / (1 - self.b1_t)) \
+        return -self.lr * (self.mt / (1 - self.b1_t)) \
                / (np.sqrt(self.vt / (1 - self.b2_t)) + self.eps)

 class Nadam(Optimizer):
     # paper: https://arxiv.org/abs/1412.6980
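The return value above is the usual bias-corrected Adam step: -lr * (mt / (1 - b1_t)) / (sqrt(vt / (1 - b2_t)) + eps). One property worth seeing numerically: on the very first update (mt = vt = 0 beforehand, b1_t = b1, b2_t = b2) the step is roughly -lr * sign(dW), independent of the gradient's magnitude. A standalone check:

    import numpy as np

    lr, b1, b2, eps = 0.002, 0.9, 0.999, 1e-8
    dW = np.array([3.0, -0.01])
    mt, vt = np.zeros(2), np.zeros(2)
    b1_t, b2_t = b1, b2                      # decay-term powers after one step

    mt = b1 * mt + (1 - b1) * dW
    vt = b2 * vt + (1 - b2) * dW * dW
    step = -lr * (mt / (1 - b1_t)) / (np.sqrt(vt / (1 - b2_t)) + eps)
    print(step)                              # ~[-0.002  0.002]: about -lr * sign(dW)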
@@ -360,12 +360,12 @@ class Nadam(Optimizer):
     # lifted from https://github.com/fchollet/keras/blob/5d38b04/keras/optimizers.py#L530
     # lifted from https://github.com/jpilaul/IFT6266_project/blob/master/Models/Algo_Momentum.py

-    def __init__(self, alpha=0.002, b1=0.9, b2=0.999, eps=1e-8):
+    def __init__(self, lr=0.002, b1=0.9, b2=0.999, eps=1e-8):
         self.b1 = _f(b1) # decay term
         self.b2 = _f(b2) # decay term
         self.eps = _f(eps)

-        super().__init__(alpha)
+        super().__init__(lr)

     def reset(self):
         self.mt = None
@@ -398,7 +398,7 @@ class Nadam(Optimizer):

         mt_bar = (1 - ut0) * gp + ut1 * mtp

-        return -self.alpha * mt_bar / (np.sqrt(vtp) + self.eps)
+        return -self.lr * mt_bar / (np.sqrt(vtp) + self.eps)

 # Abstract Layers {{{1
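The Nadam step keeps Adam's denominator but blends the numerator Nesterov-style: mt_bar mixes the bias-corrected gradient (gp) with the bias-corrected first moment (mtp) using momentum-schedule weights ut0 / ut1, following the referenced Keras implementation. None of those intermediates are visible in this hunk, so the values below are purely illustrative stand-ins for the shapes involved:

    import numpy as np

    lr, eps = 0.002, 1e-8
    ut0, ut1 = 0.9, 0.9           # stand-ins for the momentum-schedule terms
    gp = np.array([1.0, -1.0])    # stand-in: bias-corrected gradient
    mtp = np.array([0.8, -0.9])   # stand-in: bias-corrected first moment
    vtp = np.array([1.0, 1.0])    # stand-in: bias-corrected second moment

    mt_bar = (1 - ut0) * gp + ut1 * mtp
    step = -lr * mt_bar / (np.sqrt(vtp) + eps)
    print(mt_bar, step)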
@@ -1071,7 +1071,7 @@ class Learner:
     def __init__(self, optim, epochs=100, rate=None):
         assert isinstance(optim, Optimizer)
         self.optim = optim
-        self.start_rate = rate # None is okay; it'll use optim.alpha instead.
+        self.start_rate = rate # None is okay; it'll use optim.lr instead.
         self.epochs = int(epochs)
         self.reset()
@@ -1093,15 +1093,15 @@ class Learner:

     @property
     def rate(self):
-        return self.optim.alpha
+        return self.optim.lr

     @rate.setter
     def rate(self, new_rate):
-        self.optim.alpha = new_rate
+        self.optim.lr = new_rate

     def rate_at(self, epoch):
         if self.start_rate is None:
-            return self.optim.alpha
+            return self.optim.lr
         return self.start_rate

     def next(self):
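After the rename, Learner.rate is a thin proxy for optim.lr, and rate_at() falls back to it when no explicit start_rate was given. A minimal sketch of the property pattern with toy classes, independent of the real ones:

    class ToyOptimizer:               # stand-in; only carries the renamed attribute
        def __init__(self, lr=0.1):
            self.lr = lr

    class ToyLearner:                 # mirrors the property pair from the hunk above
        def __init__(self, optim, rate=None):
            self.optim = optim
            self.start_rate = rate

        @property
        def rate(self):
            return self.optim.lr

        @rate.setter
        def rate(self, new_rate):
            self.optim.lr = new_rate

        def rate_at(self, epoch):
            if self.start_rate is None:
                return self.optim.lr
            return self.start_rate

    learner = ToyLearner(ToyOptimizer(lr=0.1))
    learner.rate = 0.05
    print(learner.optim.lr, learner.rate_at(0))   # 0.05 0.05 — writes go straight to optim.lr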
(unnamed third file, 2 changes)

@@ -217,7 +217,7 @@ while learner.next():
     quiet = learner.epoch != learner.epochs
     measure_error(quiet=quiet)

-    logs.learning_rate.append(optim.alpha)
+    logs.learning_rate.append(optim.lr)
     if getattr(optim, 'mu', None):
         logs.momentum.append(optim.mu)