This commit is contained in:
parent d232d81f5a
commit 389bde3cdb

2 changed files with 73 additions and 34 deletions

optim_nn.py
@@ -20,14 +20,14 @@ class Dummy:
 
 # Loss functions {{{1
 
-class SquaredHalved(Loss):
+class SquaredHalved(ResidualLoss):
     def f(self, r):
         return np.square(r) / 2
 
     def df(self, r):
         return r
 
-class SomethingElse(Loss):
+class SomethingElse(ResidualLoss):
     # generalizes Absolute and SquaredHalved
     # plot: https://www.desmos.com/calculator/fagjg9vuz7
     def __init__(self, a=4/3):
@@ -42,6 +42,8 @@ class SomethingElse(Loss):
     def df(self, r):
         return np.sign(r) * np.abs(r)**self.c
 
+# Nonparametric Layers {{{1
+
 # Parametric Layers {{{1
 
 class LayerNorm(Layer):
@@ -238,7 +240,7 @@ def multiresnet(x, width, depth, block=2, multi=1,
 
     return y
 
-# etc. {{{1
+# Toy Data {{{1
 
 inits = dict(he_normal=init_he_normal, he_uniform=init_he_uniform)
 activations = dict(sigmoid=Sigmoid, tanh=Tanh, relu=Relu, elu=Elu, gelu=GeluApprox)
@@ -317,6 +319,8 @@ def toy_data(train_samples, valid_samples, problem=2):
 
     return (inputs, outputs), (valid_inputs, valid_outputs)
 
+# Model Creation {{{1
+
 def model_from_config(config, input_features, output_features, callbacks):
     # Our Test Model
 
@@ -337,6 +341,7 @@ def model_from_config(config, input_features, output_features, callbacks):
 
     #
 
+    # FIXME: unused variable
     training = config.epochs > 0 and config.restarts >= 0
 
     if config.fn_load is not None:
@@ -427,7 +432,7 @@ def model_from_config(config, input_features, output_features, callbacks):
 
     return model, learner, ritual, (loss, mloss)
 
-# main {{{1
+# main program {{{1
 
 def run(program, args=[]):
 
@@ -527,8 +532,7 @@ def run(program, args=[]):
     def measure_error():
         def print_error(name, inputs, outputs, comparison=None):
             predicted = model.forward(inputs)
-            residual = predicted - outputs
-            err = ritual.measure(residual)
+            err = ritual.measure(predicted, outputs)
             log(name + " loss", "{:12.6e}".format(err))
             # TODO: print logarithmic difference as it might be more meaningful
             #       (fewer results stuck around -99%)
@@ -549,8 +553,6 @@ def run(program, args=[]):
 
     measure_error()
 
-    assert inputs.shape[0] % config.batch_size == 0, \
-        "inputs is not evenly divisible by batch_size" # TODO: lift this restriction
     ritual.prepare(model)
     while learner.next():
         indices = np.arange(inputs.shape[0])
@@ -587,7 +589,7 @@ def run(program, args=[]):
 
     return 0
 
-# do main {{{1
+# run main program {{{1
 
 if __name__ == '__main__':
     import sys
@@ -36,23 +36,40 @@ def init_he_uniform(size, ins, outs):
 # Loss functions {{{1
 
 class Loss:
-    per_batch = False
-
-    def mean(self, r):
-        return np.average(self.f(r))
-
-    def dmean(self, r):
-        d = self.df(r)
-        return d / len(d)
-
-class Squared(Loss):
+    pass
+
+class CategoricalCrossentropy(Loss):
+    # lifted from theano
+
+    def __init__(self, eps=1e-8):
+        self.eps = _f(eps)
+
+    def F(self, p, y):
+        # TODO: assert dimensionality and p > 0 (if not self.unsafe?)
+        p = np.clip(p, self.eps, 1 - self.eps)
+        f = np.sum(-y * np.log(p) - (1 - y) * np.log(1 - p), axis=-1)
+        return np.mean(f, axis=-1)
+
+    def dF(self, p, y):
+        p = np.clip(p, self.eps, 1 - self.eps)
+        df = (p - y) / (p * (1 - p))
+        return df / y.shape[-1]
+
+class ResidualLoss(Loss):
+    def F(self, p, y): # mean
+        return np.mean(self.f(p - y))
+
+    def dF(self, p, y): # dmean
+        return self.df(p - y) / y.shape[-1]
+
+class Squared(ResidualLoss):
     def f(self, r):
         return np.square(r)
 
     def df(self, r):
         return 2 * r
 
-class Absolute(Loss):
+class Absolute(ResidualLoss):
     def f(self, r):
         return np.abs(r)
 
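Note: the residual-style losses keep their `f`/`df` bodies; `ResidualLoss` adapts them to the new two-argument `F(p, y)`/`dF(p, y)` interface, which also lets `CategoricalCrossentropy` share the same API even though it is not a function of `p - y`. A minimal standalone sanity check, assuming only numpy (the class bodies mirror the hunk above, with `_f(eps)` swapped for a plain float so it runs on its own):

    import numpy as np

    class ResidualLoss:
        def F(self, p, y):   # mean loss over the residual
            return np.mean(self.f(p - y))

        def dF(self, p, y):  # backward signal, scaled by feature count
            return self.df(p - y) / y.shape[-1]

    class Squared(ResidualLoss):
        def f(self, r):  return np.square(r)
        def df(self, r): return 2 * r

    class CategoricalCrossentropy:
        def __init__(self, eps=1e-8):
            self.eps = float(eps)  # the module uses _f(eps); plain float here

        def F(self, p, y):
            p = np.clip(p, self.eps, 1 - self.eps)
            f = np.sum(-y * np.log(p) - (1 - y) * np.log(1 - p), axis=-1)
            return np.mean(f, axis=-1)

    p = np.array([[0.8, 0.1, 0.1],
                  [0.2, 0.7, 0.1]])
    y = np.array([[1.0, 0.0, 0.0],
                  [0.0, 1.0, 0.0]])

    print(np.isclose(Squared().F(p, y), np.mean(np.square(p - y))))  # True
    print(Squared().dF(p, y).shape)                                  # (2, 3), same shape as p
    print(CategoricalCrossentropy().F(p, y))                         # scalar, small when p tracks y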
@@ -301,14 +318,6 @@ class Layer:
 
 # Nonparametric Layers {{{1
 
-class Sum(Layer):
-    def multi(self, B):
-        return np.sum(B, axis=0)
-
-    def dmulti(self, dB):
-        #assert len(dB) == 1, "unimplemented"
-        return dB[0] # TODO: does this always work?
-
 class Input(Layer):
     def __init__(self, shape):
         assert shape is not None
@@ -336,6 +345,14 @@ class Affine(Layer):
     def dF(self, dY):
         return dY * self.a
 
+class Sum(Layer):
+    def multi(self, B):
+        return np.sum(B, axis=0)
+
+    def dmulti(self, dB):
+        #assert len(dB) == 1, "unimplemented"
+        return dB[0] # TODO: does this always work?
+
 class Sigmoid(Layer): # aka Logistic
     def F(self, X):
         self.sig = sigmoid(X)
@@ -387,6 +404,25 @@ class GeluApprox(Layer):
     def dF(self, dY):
         return dY * self.sig * (1 + self.a * (1 - self.sig))
 
+class Softmax(Layer):
+    # lifted from theano
+
+    def __init__(self, axis=-1):
+        super().__init__()
+        self.axis = int(axis)
+
+    def F(self, X):
+        alpha = np.max(X, axis=-1, keepdims=True)
+        num = np.exp(X - alpha)
+        den = np.sum(num, axis=-1, keepdims=True)
+        self.sm = num / den
+        return self.sm
+
+    def dF(self, dY):
+        dYsm = dY * self.sm
+        dX = dYsm - np.sum(dYsm, axis=-1, keepdims=True) * self.sm
+        return dX
+
 # Parametric Layers {{{1
 
 class Dense(Layer):
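Note: `Softmax.dF` above is the usual softmax vector-Jacobian product, dX = sm * (dY - sum(dY * sm, axis=-1)). A standalone finite-difference check, assuming only numpy (free functions stand in for the layer methods):

    import numpy as np

    def softmax(X):
        alpha = np.max(X, axis=-1, keepdims=True)   # subtract max for stability
        num = np.exp(X - alpha)
        return num / np.sum(num, axis=-1, keepdims=True)

    def softmax_dF(sm, dY):
        dYsm = dY * sm
        return dYsm - np.sum(dYsm, axis=-1, keepdims=True) * sm

    rng = np.random.RandomState(42)
    X = rng.randn(3, 5)
    w = rng.randn(3, 5)              # arbitrary upstream gradient dY
    sm = softmax(X)
    analytic = softmax_dF(sm, w)

    # numerical gradient of g(X) = sum(w * softmax(X))
    h = 1e-6
    numeric = np.zeros_like(X)
    for i in np.ndindex(X.shape):
        Xp, Xm = X.copy(), X.copy()
        Xp[i] += h
        Xm[i] -= h
        numeric[i] = (np.sum(w * softmax(Xp)) - np.sum(w * softmax(Xm))) / (2 * h)

    print(np.allclose(sm.sum(axis=-1), 1.0))        # rows sum to one
    print(np.allclose(analytic, numeric, atol=1e-5))  # backward matches finite differences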
@@ -560,17 +596,16 @@ class Ritual: # i'm just making up names at this point
     def reset(self):
         self.learner.reset(optim=True)
 
-    def measure(self, residual):
-        return self.mloss.mean(residual)
+    def measure(self, p, y):
+        return self.mloss.F(p, y)
 
-    def derive(self, residual):
-        return self.loss.dmean(residual)
+    def derive(self, p, y):
+        return self.loss.dF(p, y)
 
     def learn(self, inputs, outputs):
         predicted = self.model.forward(inputs)
-        residual = predicted - outputs
-        self.model.backward(self.derive(residual))
-        return residual
+        self.model.backward(self.derive(predicted, outputs))
+        return predicted
 
     def update(self):
         self.learner.optim.update(self.model.dW, self.model.W)
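Note: with losses taking (predictions, targets), `learn` now returns predictions instead of residuals, and `measure`/`derive` forward straight to `Loss.F`/`Loss.dF`. A rough sketch of the new call flow; `StubModel` and `StubLoss` are illustrative stand-ins, not part of the codebase:

    import numpy as np

    class StubLoss:  # same F/dF shape as ResidualLoss above
        def F(self, p, y):  return np.mean(np.square(p - y))
        def dF(self, p, y): return 2 * (p - y) / y.shape[-1]

    class StubModel:
        def forward(self, x):   return 0.5 * x
        def backward(self, dY): self.dW = dY  # stand-in for real backprop

    loss, model = StubLoss(), StubModel()
    x, y = np.ones((4, 2)), np.zeros((4, 2))

    # old flow: residual = forward(x) - y; backward(loss.df(residual) / len(residual))
    # new flow: everything is phrased in terms of (predicted, targets)
    predicted = model.forward(x)
    model.backward(loss.dF(predicted, y))
    batch_loss = loss.F(predicted, y)
    print(batch_loss)  # 0.25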
@@ -585,6 +620,8 @@ class Ritual: # i'm just making up names at this point
         cumsum_loss = _0
         batch_count = inputs.shape[0] // batch_size
         losses = []
+        assert inputs.shape[0] % batch_size == 0, \
+            "inputs is not evenly divisible by batch_size" # TODO: lift this restriction
         for b in range(batch_count):
             self.bn += 1
             bi = b * batch_size
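Note: this commit moves the divisibility check from run() into Ritual's batched training loop, so until the TODO is lifted, callers must supply a sample count that is a multiple of batch_size. A hypothetical helper a caller could use to satisfy it (`trim_to_batches` is not part of this commit):

    import numpy as np

    def trim_to_batches(inputs, outputs, batch_size):
        # drop the trailing remainder so the divisibility assert holds
        n = (inputs.shape[0] // batch_size) * batch_size
        return inputs[:n], outputs[:n]

    x = np.zeros((103, 8))
    y = np.zeros((103, 1))
    x, y = trim_to_batches(x, y, batch_size=20)
    print(x.shape)               # (100, 8)
    print(x.shape[0] % 20 == 0)  # True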
@@ -594,10 +631,10 @@ class Ritual: # i'm just making up names at this point
             if self.learner.per_batch:
                 self.learner.batch(b / batch_count)
 
-            residual = self.learn(batch_inputs, batch_outputs)
+            predicted = self.learn(batch_inputs, batch_outputs)
             self.update()
 
-            batch_loss = self.measure(residual)
+            batch_loss = self.measure(predicted, batch_outputs)
             if np.isnan(batch_loss):
                 raise Exception("nan")
             cumsum_loss += batch_loss