This commit is contained in:
parent 8c79667904
commit 9cba495ce4

2 changed files with 61 additions and 48 deletions

optim_nn.py (28)
@@ -7,6 +7,7 @@
 # numpy scipy h5py sklearn dotmap
 
 from optim_nn_core import *
+from optim_nn_core import _check, _f
 
 import sys
 lament = lambda *args, **kwargs: print(*args, file=sys.stderr, **kwargs)
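For context on the rename that runs through the rest of this diff: the old nf/nfa helpers were float32 shorthands, and they are being replaced by a single _f exported from optim_nn_core. A minimal sketch of the equivalence, assuming only NumPy (variable names here are illustrative, not from the repo):

    import numpy as np

    _f = np.float32                       # the new shorthand: float32 type, usable as a caster
    x = _f(0.5)                           # what nf(0.5) used to produce
    a = np.array([1, 2, 3], dtype=_f)     # what nfa([1, 2, 3]) used to produce
    assert x.dtype == np.float32 and a.dtype == np.float32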
@@ -31,9 +32,9 @@ class SomethingElse(Loss):
     # plot: https://www.desmos.com/calculator/fagjg9vuz7
     def __init__(self, a=4/3):
         assert 1 <= a <= 2, "parameter out of range"
-        self.a = nf(a / 2)
-        self.b = nf(2 / a)
-        self.c = nf(2 / a - 1)
+        self.a = _f(a / 2)
+        self.b = _f(2 / a)
+        self.c = _f(2 / a - 1)
 
     def f(self, r):
         return self.a * np.abs(r)**self.b
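The f method above evaluates to (a/2)·|r|^(2/a) per residual, so a=2 gives plain absolute error and a=1 gives half of squared error, with the default a=4/3 in between (see the desmos plot linked in the hunk). A standalone restatement as a sketch, with a hypothetical function name:

    import numpy as np

    def something_else_loss(r, a=4/3):
        # f(r) = (a/2) * |r|**(2/a); a=2 -> |r|, a=1 -> r**2 / 2
        assert 1 <= a <= 2, "parameter out of range"
        return (a / 2) * np.abs(r) ** (2 / a)

    print(something_else_loss(np.array([-1.0, 0.0, 1.0])))   # [2/3, 0, 2/3] for the default a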
@@ -49,7 +50,7 @@ class LayerNorm(Layer):
 
     def __init__(self, eps=1e-3, axis=-1):
         super().__init__()
-        self.eps = nf(eps)
+        self.eps = _f(eps)
         self.axis = int(axis)
 
     def F(self, X):
@@ -96,7 +97,7 @@ class StochMRitual(Ritual):
 
     def __init__(self, learner=None, loss=None, mloss=None, gamma=0.5):
         super().__init__(learner, loss, mloss)
-        self.gamma = nf(gamma)
+        self.gamma = _f(gamma)
 
     def prepare(self, model):
         self.W = np.copy(model.W)
@@ -127,9 +128,9 @@ class StochMRitual(Ritual):
 class NoisyRitual(Ritual):
     def __init__(self, learner=None, loss=None, mloss=None,
                  input_noise=0, output_noise=0, gradient_noise=0):
-        self.input_noise = nf(input_noise) # TODO: implement
-        self.output_noise = nf(output_noise) # TODO: implement
-        self.gradient_noise = nf(gradient_noise)
+        self.input_noise = _f(input_noise)
+        self.output_noise = _f(output_noise)
+        self.gradient_noise = _f(gradient_noise)
         super().__init__(learner, loss, mloss)
 
     def learn(self, inputs, outputs):
@@ -261,6 +262,7 @@ def toy_data(train_samples, valid_samples, problem=2):
     if problem == 1:
         from sklearn.datasets import make_friedman1
         inputs, outputs = make_friedman1(total_samples)
+        inputs, outputs = _f(inputs), _f(outputs)
        outputs = np.expand_dims(outputs, -1)
 
         normalize_data(inputs,
@@ -274,6 +276,7 @@ def toy_data(train_samples, valid_samples, problem=2):
     elif problem == 2:
         from sklearn.datasets import make_friedman2
         inputs, outputs = make_friedman2(total_samples)
+        inputs, outputs = _f(inputs), _f(outputs)
         outputs = np.expand_dims(outputs, -1)
 
         normalize_data(inputs,
@@ -287,6 +290,7 @@ def toy_data(train_samples, valid_samples, problem=2):
     elif problem == 3:
         from sklearn.datasets import make_friedman3
         inputs, outputs = make_friedman3(total_samples)
+        inputs, outputs = _f(inputs), _f(outputs)
         outputs = np.expand_dims(outputs, -1)
 
         normalize_data(inputs,
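Each of the three toy_data branches above now casts the sklearn arrays to float32 right after generation; np.float32 called on an ndarray returns a float32 copy of the whole array. A small sketch, assuming scikit-learn is available (sample count chosen arbitrarily):

    import numpy as np
    from sklearn.datasets import make_friedman1

    _f = np.float32
    inputs, outputs = make_friedman1(100)       # sklearn returns float64 arrays
    inputs, outputs = _f(inputs), _f(outputs)   # cast both to float32
    outputs = np.expand_dims(outputs, -1)       # (100,) -> (100, 1)
    print(inputs.dtype, outputs.dtype, outputs.shape)   # float32 float32 (100, 1)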
@@ -463,7 +467,7 @@ def run(program, args=[]):
 
         # misc
         init = 'he_normal',
-        loss = 'mse',
+        loss = 'msee',
         mloss = 'mse',
         ritual = 'default',
         restart_optim = False, # restarts also reset internal state of optimizer
@@ -568,9 +572,9 @@ def run(program, args=[]):
     if config.log_fn is not None:
         log('saving losses', config.log_fn)
         np.savez_compressed(config.log_fn,
-                            batch_losses=nfa(batch_losses),
-                            train_losses=nfa(train_losses),
-                            valid_losses=nfa(valid_losses))
+                            batch_losses=np.array(batch_losses, dtype=_f),
+                            train_losses=np.array(train_losses, dtype=_f),
+                            valid_losses=np.array(valid_losses, dtype=_f))
 
     # Evaluation {{{2
     # TODO: write this portion again
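The saved arrays can be read back with np.load; "losses.npz" below is a hypothetical stand-in for config.log_fn:

    import numpy as np

    log = np.load("losses.npz")
    batch_losses = log["batch_losses"]     # float32 arrays, matching the keys above
    train_losses = log["train_losses"]
    valid_losses = log["valid_losses"]
    print(batch_losses.dtype, len(batch_losses))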
optim_nn_core.py
@@ -1,9 +1,5 @@
 import numpy as np
-# ugly shorthand:
-nf = np.float32
-nfa = lambda x: np.array(x, dtype=nf)
-ni = np.int
-nia = lambda x: np.array(x, dtype=ni)
+_f = np.float32
 
 # just for speed, not strictly essential:
 from scipy.special import expit as sigmoid
@@ -12,6 +8,19 @@ from scipy.special import expit as sigmoid
 from collections import defaultdict
 _layer_counters = defaultdict(lambda: 0)
 
+def _check(a):
+    assert isinstance(a, np.ndarray) or type(a) == _f, type(a)
+    assert a.dtype == _f, a.dtype
+    return a
+
+_0 = _f(0)
+_1 = _f(1)
+_2 = _f(2)
+_inv2 = _f(1/2)
+_sqrt2 = _f(np.sqrt(2))
+_invsqrt2 = _f(1/np.sqrt(2))
+_pi = _f(np.pi)
+
 # Initializations {{{1
 
 # note: these are currently only implemented for 2D shapes.
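A quick sketch of what the new _check helper accepts and rejects (standalone copy for illustration only):

    import numpy as np

    _f = np.float32

    def _check(a):
        assert isinstance(a, np.ndarray) or type(a) == _f, type(a)
        assert a.dtype == _f, a.dtype
        return a

    _check(np.zeros(3, dtype=_f))   # ok: float32 array, returned unchanged
    _check(_f(1.0))                 # ok: a float32 scalar also carries a dtype
    # _check(np.zeros(3))           # would raise: default dtype is float64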
@@ -54,7 +63,7 @@ class Absolute(Loss):
 
 class Optimizer:
     def __init__(self, alpha=0.1):
-        self.alpha = nf(alpha)
+        self.alpha = _f(alpha)
         self.reset()
 
     def reset(self):
@@ -71,9 +80,9 @@ class Optimizer:
 
 class Momentum(Optimizer):
     def __init__(self, alpha=0.01, lamb=0, mu=0.9, nesterov=False):
-        self.alpha = np.asfarray(alpha) # learning rate
-        self.lamb = np.asfarray(lamb) # weight decay
-        self.mu = np.asfarray(mu) # momentum
+        self.alpha = _f(alpha) # learning rate
+        self.lamb = _f(lamb) # weight decay
+        self.mu = _f(mu) # momentum
         self.nesterov = bool(nesterov)
 
         self.reset()
@@ -100,9 +109,9 @@ class RMSprop(Optimizer):
     # RMSprop.mu == 1
 
     def __init__(self, alpha=0.0001, mu=0.99, eps=1e-8):
-        self.alpha = nf(alpha) # learning rate
-        self.mu = nf(mu) # decay term
-        self.eps = nf(eps)
+        self.alpha = _f(alpha) # learning rate
+        self.mu = _f(mu) # decay term
+        self.eps = _f(eps)
 
         # one might consider the following equation when specifying mu:
         # mu = e**(-1/t)
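A numeric illustration of the mu = e**(-1/t) rule of thumb quoted in the hunk above, with an arbitrarily chosen timescale t:

    import numpy as np

    t = 100.0               # averaging timescale in updates (arbitrary choice)
    mu = np.exp(-1 / t)     # mu = e**(-1/t)
    print(mu)               # ~0.99005, close to the default mu=0.99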
@@ -141,12 +150,12 @@ class Adam(Optimizer):
     # Adam.b2_t == 0
 
     def __init__(self, alpha=0.001, b1=0.9, b2=0.999, b1_t=0.9, b2_t=0.999, eps=1e-8):
-        self.alpha = nf(alpha) # learning rate
-        self.b1 = nf(b1) # decay term
-        self.b2 = nf(b2) # decay term
-        self.b1_t_default = nf(b1_t) # decay term power t
-        self.b2_t_default = nf(b2_t) # decay term power t
-        self.eps = nf(eps)
+        self.alpha = _f(alpha) # learning rate
+        self.b1 = _f(b1) # decay term
+        self.b2 = _f(b2) # decay term
+        self.b1_t_default = _f(b1_t) # decay term power t
+        self.b2_t_default = _f(b2_t) # decay term power t
+        self.eps = _f(eps)
 
         self.reset()
 
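Adam's compute step is not shown in this hunk; as a rough reference only, the textbook Adam update these coefficients conventionally parameterize looks like the sketch below (function and argument names are hypothetical, not the repo's):

    import numpy as np

    def adam_step(dW, mt, vt, b1_t, b2_t, alpha=0.001, b1=0.9, b2=0.999, eps=1e-8):
        # moving averages of the gradient and its square,
        # bias-corrected by the running powers b1_t = b1**t, b2_t = b2**t
        mt = b1 * mt + (1 - b1) * dW
        vt = b2 * vt + (1 - b2) * dW * dW
        m_hat = mt / (1 - b1_t)
        v_hat = vt / (1 - b2_t)
        delta = -alpha * m_hat / (np.sqrt(v_hat) + eps)
        return delta, mt, vt, b1_t * b1, b2_t * b2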
@@ -317,8 +326,8 @@ class Input(Layer):
 class Affine(Layer):
     def __init__(self, a=1, b=0):
         super().__init__()
-        self.a = nf(a)
-        self.b = nf(b)
+        self.a = _f(a)
+        self.b = _f(b)
 
     def F(self, X):
         return self.a * X + self.b
@@ -355,7 +364,7 @@ class Elu(Layer):
 
     def __init__(self, alpha=1):
         super().__init__()
-        self.alpha = nf(alpha)
+        self.alpha = _f(alpha)
 
     def F(self, X):
         self.cond = X >= 0
@@ -382,7 +391,7 @@ class GeluApprox(Layer):
 class Dense(Layer):
     def __init__(self, dim, init=init_he_uniform):
         super().__init__()
-        self.dim = ni(dim)
+        self.dim = int(dim)
         self.output_shape = (dim,)
         self.weight_init = init
         self.size = None
@@ -459,8 +468,8 @@ class Model:
         for node in self.ordered_nodes:
             if node.size is not None:
                 self.param_count += node.size
-        self.W = np.zeros(self.param_count, dtype=nf)
-        self.dW = np.zeros(self.param_count, dtype=nf)
+        self.W = np.zeros(self.param_count, dtype=_f)
+        self.dW = np.zeros(self.param_count, dtype=_f)
 
         offset = 0
         for node in self.ordered_nodes:
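W and dW are flat float32 buffers; the offset loop that follows (not fully shown in this hunk) hands each node a slice of them. The key property is that slices are views, so writes through a node's slice land in the shared vector. A small sketch with made-up sizes:

    import numpy as np

    _f = np.float32
    W = np.zeros(10, dtype=_f)              # flat parameter vector
    offset, node_size = 0, 6                # hypothetical first node
    node_W = W[offset:offset + node_size]   # a view, not a copy
    node_W[:] = 1.0
    print(W)                                # first six entries are now 1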
@@ -510,7 +519,7 @@ class Model:
         weights = {}
         def visitor(name, obj):
             if isinstance(obj, h5py.Dataset):
-                weights[name.split('/')[-1]] = nfa(obj[:])
+                weights[name.split('/')[-1]] = np.array(obj[:], dtype=_f)
         f.visititems(visitor)
         f.close()
 
@@ -532,9 +541,9 @@ class Model:
             b_name = "dense_{}".format(b)
             # TODO: write a Dense method instead of assigning directly
             grp = f.create_group(b_name)
-            data = grp.create_dataset(b_name+'_W', denses[a].coeffs.shape, dtype=nf)
+            data = grp.create_dataset(b_name+'_W', denses[a].coeffs.shape, dtype=_f)
             data[:] = denses[a].coeffs
-            data = grp.create_dataset(b_name+'_b', denses[a].biases.shape, dtype=nf)
+            data = grp.create_dataset(b_name+'_b', denses[a].biases.shape, dtype=_f)
             data[:] = denses[a].biases
 
         f.close()
@@ -572,7 +581,7 @@ class Ritual: # i'm just making up names at this point
 
     def train_batched(self, inputs, outputs, batch_size, return_losses=False):
         self.en += 1
-        cumsum_loss = 0
+        cumsum_loss = _0
         batch_count = inputs.shape[0] // batch_size
         losses = []
         for b in range(batch_count):
@@ -593,7 +602,7 @@ class Ritual: # i'm just making up names at this point
             cumsum_loss += batch_loss
             if return_losses:
                 losses.append(batch_loss)
-        avg_loss = cumsum_loss / batch_count
+        avg_loss = cumsum_loss / _f(batch_count)
         if return_losses:
             return avg_loss, losses
         else:
@@ -607,7 +616,7 @@ class Learner:
     def __init__(self, optim, epochs=100, rate=None):
         assert isinstance(optim, Optimizer)
         self.optim = optim
-        self.start_rate = optim.alpha if rate is None else float(rate)
+        self.start_rate = optim.alpha if rate is None else _f(rate)
         self.epochs = int(epochs)
         self.reset()
 
@@ -661,8 +670,8 @@ class Learner:
 
 class AnnealingLearner(Learner):
     def __init__(self, optim, epochs=100, rate=None, halve_every=10):
-        self.halve_every = float(halve_every)
-        self.anneal = 0.5**(1/self.halve_every)
+        self.halve_every = _f(halve_every)
+        self.anneal = _f(0.5**(1/self.halve_every))
         super().__init__(optim, epochs, rate)
 
     def rate_at(self, epoch):
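The anneal factor 0.5**(1/halve_every) is the per-epoch multiplier that halves the rate every halve_every epochs; a quick numeric check with the default value:

    halve_every = 10
    anneal = 0.5 ** (1 / halve_every)   # per-epoch decay factor
    rate0 = 0.01                        # hypothetical starting rate
    print(rate0 * anneal ** 10)         # ~0.005: halved after 10 epochs
    print(rate0 * anneal ** 20)         # ~0.0025: halved again by epoch 20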
@@ -670,7 +679,7 @@ class AnnealingLearner(Learner):
 
 def cosmod(x):
     # plot: https://www.desmos.com/calculator/hlgqmyswy2
-    return (1 + np.cos((x % 1) * np.pi)) / 2
+    return (_1 + np.cos((x % _1) * _pi)) * _inv2
 
 class SGDR(Learner):
     # Stochastic Gradient Descent with Restarts
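cosmod ramps smoothly from 1 at the start of each unit interval down to 0 at its end; the new version only wraps the constants in float32. The same function in plain float64, for a quick sanity check:

    import numpy as np

    def cosmod(x):
        # smooth ramp from 1 down to 0 over each unit interval of x
        return (1 + np.cos((x % 1) * np.pi)) / 2

    print(cosmod(0.0), cosmod(0.5), cosmod(0.999))   # 1.0, 0.5, ~0.0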
@@ -683,7 +692,7 @@ class SGDR(Learner):
                  restarts=0, restart_decay=0.5, callback=None,
                  expando=None):
         self.restart_epochs = int(epochs)
-        self.decay = float(restart_decay)
+        self.decay = _f(restart_decay)
         self.restarts = int(restarts)
         self.restart_callback = callback
         # TODO: rename expando to something not insane
@@ -708,8 +717,8 @@ class SGDR(Learner):
 
     def rate_at(self, epoch):
         restart, sub_epoch, next_restart = self.split_num(epoch)
-        x = sub_epoch / next_restart
-        return self.start_rate * self.decay**restart * cosmod(x)
+        x = _f(sub_epoch) / _f(next_restart)
+        return self.start_rate * self.decay**_f(restart) * cosmod(x)
 
     def next(self):
         if not super().next():
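Putting rate_at together: within a cycle the rate follows cosmod, and each restart scales the whole cycle by decay. split_num is not shown in this diff, so the sketch below just restates the return expression with plain floats and a made-up 20-epoch cycle:

    import numpy as np

    def cosmod(x):
        return (1 + np.cos((x % 1) * np.pi)) / 2

    def sgdr_rate(start_rate, decay, restart, sub_epoch, next_restart):
        x = sub_epoch / next_restart
        return start_rate * decay ** restart * cosmod(x)

    # hypothetical schedule: 20-epoch cycles, rate scaled by 0.5 at each restart
    for restart in range(3):
        print([round(sgdr_rate(0.01, 0.5, restart, e, 20), 5) for e in (0, 10, 19)])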