diff --git a/optim_nn.py b/optim_nn.py
index f25a964..c42b581 100644
--- a/optim_nn.py
+++ b/optim_nn.py
@@ -20,14 +20,14 @@ class Dummy:
 
 # Loss functions {{{1
 
-class SquaredHalved(Loss):
+class SquaredHalved(ResidualLoss):
     def f(self, r):
         return np.square(r) / 2
 
     def df(self, r):
         return r
 
-class SomethingElse(Loss):
+class SomethingElse(ResidualLoss):
     # generalizes Absolute and SquaredHalved
     # plot: https://www.desmos.com/calculator/fagjg9vuz7
     def __init__(self, a=4/3):
@@ -42,6 +42,8 @@ class SomethingElse(Loss):
     def df(self, r):
         return np.sign(r) * np.abs(r)**self.c
 
+# Nonparametric Layers {{{1
+
 # Parametric Layers {{{1
 
 class LayerNorm(Layer):
@@ -238,7 +240,7 @@ def multiresnet(x, width, depth, block=2, multi=1,
 
     return y
 
-# etc. {{{1
+# Toy Data {{{1
 
 inits = dict(he_normal=init_he_normal, he_uniform=init_he_uniform)
 activations = dict(sigmoid=Sigmoid, tanh=Tanh, relu=Relu, elu=Elu, gelu=GeluApprox)
@@ -317,6 +319,8 @@ def toy_data(train_samples, valid_samples, problem=2):
 
     return (inputs, outputs), (valid_inputs, valid_outputs)
 
+# Model Creation {{{1
+
 def model_from_config(config, input_features, output_features, callbacks):
     # Our Test Model
 
@@ -337,6 +341,7 @@ def model_from_config(config, input_features, output_features, callbacks):
 
     #
 
+    # FIXME: unused variable
     training = config.epochs > 0 and config.restarts >= 0
 
     if config.fn_load is not None:
@@ -427,7 +432,7 @@ def model_from_config(config, input_features, output_features, callbacks):
 
     return model, learner, ritual, (loss, mloss)
 
-# main {{{1
+# main program {{{1
 
 def run(program, args=[]):
 
@@ -527,8 +532,7 @@ def run(program, args=[]):
     def measure_error():
         def print_error(name, inputs, outputs, comparison=None):
             predicted = model.forward(inputs)
-            residual = predicted - outputs
-            err = ritual.measure(residual)
+            err = ritual.measure(predicted, outputs)
             log(name + " loss", "{:12.6e}".format(err))
             # TODO: print logarithmic difference as it might be more meaningful
             #       (fewer results stuck around -99%)
@@ -549,8 +553,6 @@ def run(program, args=[]):
 
     measure_error()
 
-    assert inputs.shape[0] % config.batch_size == 0, \
-        "inputs is not evenly divisible by batch_size" # TODO: lift this restriction
     ritual.prepare(model)
     while learner.next():
         indices = np.arange(inputs.shape[0])
@@ -587,7 +589,7 @@ def run(program, args=[]):
 
     return 0
 
-# do main {{{1
+# run main program {{{1
 
 if __name__ == '__main__':
     import sys
diff --git a/optim_nn_core.py b/optim_nn_core.py
index 74faa0d..266241b 100644
--- a/optim_nn_core.py
+++ b/optim_nn_core.py
@@ -36,23 +36,40 @@ def init_he_uniform(size, ins, outs):
 # Loss functions {{{1
 
 class Loss:
-    per_batch = False
+    pass
 
-    def mean(self, r):
-        return np.average(self.f(r))
+class CategoricalCrossentropy(Loss):
+    # lifted from theano
 
-    def dmean(self, r):
-        d = self.df(r)
-        return d / len(d)
+    def __init__(self, eps=1e-8):
+        self.eps = _f(eps)
 
-class Squared(Loss):
+    def F(self, p, y):
+        # TODO: assert dimensionality and p > 0 (if not self.unsafe?)
+        p = np.clip(p, self.eps, 1 - self.eps)
+        f = np.sum(-y * np.log(p) - (1 - y) * np.log(1 - p), axis=-1)
+        return np.mean(f, axis=-1)
+
+    def dF(self, p, y):
+        p = np.clip(p, self.eps, 1 - self.eps)
+        df = (p - y) / (p * (1 - p))
+        return df / y.shape[-1]
+
+class ResidualLoss(Loss):
+    def F(self, p, y): # mean
+        return np.mean(self.f(p - y))
+
+    def dF(self, p, y): # dmean
+        return self.df(p - y) / y.shape[-1]
+
+class Squared(ResidualLoss):
     def f(self, r):
         return np.square(r)
 
     def df(self, r):
         return 2 * r
 
-class Absolute(Loss):
+class Absolute(ResidualLoss):
     def f(self, r):
         return np.abs(r)
 
@@ -301,14 +318,6 @@ class Layer:
 
 # Nonparametric Layers {{{1
 
-class Sum(Layer):
-    def multi(self, B):
-        return np.sum(B, axis=0)
-
-    def dmulti(self, dB):
-        #assert len(dB) == 1, "unimplemented"
-        return dB[0] # TODO: does this always work?
-
 class Input(Layer):
     def __init__(self, shape):
         assert shape is not None
@@ -336,6 +345,14 @@ class Affine(Layer):
     def dF(self, dY):
         return dY * self.a
 
+class Sum(Layer):
+    def multi(self, B):
+        return np.sum(B, axis=0)
+
+    def dmulti(self, dB):
+        #assert len(dB) == 1, "unimplemented"
+        return dB[0] # TODO: does this always work?
+
 class Sigmoid(Layer): # aka Logistic
     def F(self, X):
         self.sig = sigmoid(X)
@@ -387,6 +404,25 @@ class GeluApprox(Layer):
     def dF(self, dY):
         return dY * self.sig * (1 + self.a * (1 - self.sig))
 
+class Softmax(Layer):
+    # lifted from theano
+
+    def __init__(self, axis=-1):
+        super().__init__()
+        self.axis = int(axis)
+
+    def F(self, X):
+        alpha = np.max(X, axis=-1, keepdims=True)
+        num = np.exp(X - alpha)
+        den = np.sum(num, axis=-1, keepdims=True)
+        self.sm = num / den
+        return self.sm
+
+    def dF(self, dY):
+        dYsm = dY * self.sm
+        dX = dYsm - np.sum(dYsm, axis=-1, keepdims=True) * self.sm
+        return dX
+
 # Parametric Layers {{{1
 
 class Dense(Layer):
@@ -560,17 +596,16 @@ class Ritual: # i'm just making up names at this point
     def reset(self):
         self.learner.reset(optim=True)
 
-    def measure(self, residual):
-        return self.mloss.mean(residual)
+    def measure(self, p, y):
+        return self.mloss.F(p, y)
 
-    def derive(self, residual):
-        return self.loss.dmean(residual)
+    def derive(self, p, y):
+        return self.loss.dF(p, y)
 
     def learn(self, inputs, outputs):
         predicted = self.model.forward(inputs)
-        residual = predicted - outputs
-        self.model.backward(self.derive(residual))
-        return residual
+        self.model.backward(self.derive(predicted, outputs))
+        return predicted
 
     def update(self):
         self.learner.optim.update(self.model.dW, self.model.W)
@@ -585,6 +620,8 @@ class Ritual: # i'm just making up names at this point
         cumsum_loss = _0
         batch_count = inputs.shape[0] // batch_size
         losses = []
+        assert inputs.shape[0] % batch_size == 0, \
+            "inputs is not evenly divisible by batch_size" # TODO: lift this restriction
         for b in range(batch_count):
             self.bn += 1
             bi = b * batch_size
@@ -594,10 +631,10 @@ class Ritual: # i'm just making up names at this point
             if self.learner.per_batch:
                 self.learner.batch(b / batch_count)
 
-            residual = self.learn(batch_inputs, batch_outputs)
+            predicted = self.learn(batch_inputs, batch_outputs)
             self.update()
 
-            batch_loss = self.measure(residual)
+            batch_loss = self.measure(predicted, batch_outputs)
             if np.isnan(batch_loss):
                 raise Exception("nan")
             cumsum_loss += batch_loss
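
Below is a minimal smoke-test sketch of the refactored interface. It is not part of the patch: it assumes the patched optim_nn_core.py is importable, and it only exercises names the hunks above introduce (Softmax, CategoricalCrossentropy and the new F(p, y) / dF(p, y) loss signature) with plain NumPy arrays.

# smoke_test.py -- illustrative sketch only, not part of the diff above.
# Assumes the patched optim_nn_core.py is on the import path; the class
# names and the (p, y) loss signature come straight from the hunks above.
import numpy as np
from optim_nn_core import Softmax, CategoricalCrossentropy

np.random.seed(42)
batch, classes = 8, 5
X = np.random.randn(batch, classes).astype(np.float32)
# one-hot targets
y = np.eye(classes, dtype=np.float32)[np.random.randint(classes, size=batch)]

sm = Softmax()
loss = CategoricalCrossentropy()

p = sm.F(X)             # forward pass: each row of p sums to one
assert np.allclose(p.sum(axis=-1), 1, atol=1e-5)

err = loss.F(p, y)      # scalar loss, replaces mloss.mean(residual)
dp = loss.dF(p, y)      # d(loss)/d(p), replaces loss.dmean(residual)
dX = sm.dF(dp)          # backprop through the softmax
print("loss: {:.6f}  dX shape: {}".format(err, dX.shape))

# softmax outputs always sum to one, so a constant upstream gradient
# is annihilated by Softmax.dF
assert np.allclose(sm.dF(np.ones_like(p)), 0, atol=1e-5)

The final assert follows directly from the Jacobian-vector product computed in Softmax.dF, dX = dY*sm - sum(dY*sm)*sm, which vanishes when dY is constant along the softmax axis.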