From 4ac04baa1ff60274bf475ae82d92d62bb0792f86 Mon Sep 17 00:00:00 2001
From: Connor Olding
Date: Thu, 16 Feb 2017 14:10:33 -0800
Subject: [PATCH] .

---
 optim_nn.py      | 187 +++++++++++++++++++++++++----------------------
 optim_nn_core.py |  46 ++++--------
 2 files changed, 114 insertions(+), 119 deletions(-)

diff --git a/optim_nn.py b/optim_nn.py
index c42b581..3a4d330 100644
--- a/optim_nn.py
+++ b/optim_nn.py
@@ -10,7 +10,9 @@ from optim_nn_core import *
 from optim_nn_core import _check, _f
 
 import sys
-lament = lambda *args, **kwargs: print(*args, file=sys.stderr, **kwargs)
+
+def lament(*args, **kwargs):
+    print(*args, file=sys.stderr, **kwargs)
 
 def log(left, right):
     lament("{:>20}: {}".format(left, right))
@@ -160,7 +162,8 @@ class NoisyRitual(Ritual):
 
 class DumbLearner(AnnealingLearner):
     # this is my own awful contraption. it's not really "SGD with restarts".
-    def __init__(self, optim, epochs=100, rate=None, halve_every=10, restarts=0, restart_advance=20, callback=None):
+    def __init__(self, optim, epochs=100, rate=None, halve_every=10,
+                 restarts=0, restart_advance=20, callback=None):
         self.restart_epochs = int(epochs)
         self.restarts = int(restarts)
         self.restart_advance = float(restart_advance)
@@ -183,58 +186,67 @@ class DumbLearner(AnnealingLearner):
             self.restart_callback(restart)
         return True
 
+def _mr_batchless(y, width, depth, block, multi, activation, style, FC, d):
+    skip = y
+    merger = Sum()
+    skip.feed(merger)
+    z_start = skip.feed(activation())
+    for _ in range(multi):
+        z = z_start
+        for j in range(block):
+            if j > 0:
+                z = z.feed(activation())
+            z = z.feed(FC())
+        z.feed(merger)
+    y = merger
+    return y
+
+def _mr_onelesssum(y, width, depth, block, multi, activation, style, FC, d):
+    # this is my own awful contraption.
+    is_last = d + 1 == depth
+    needs_sum = not is_last or multi > 1
+    skip = y
+    if needs_sum:
+        merger = Sum()
+    if not is_last:
+        skip.feed(merger)
+    z_start = skip.feed(activation())
+    for _ in range(multi):
+        z = z_start
+        for j in range(block):
+            if j > 0:
+                z = z.feed(activation())
+            z = z.feed(FC())
+        if needs_sum:
+            z.feed(merger)
+    if needs_sum:
+        y = merger
+    else:
+        y = z
+    return y
+
+_mr_styles = dict(
+    batchless=_mr_batchless,
+    onelesssum=_mr_onelesssum,
+)
+
 def multiresnet(x, width, depth, block=2, multi=1,
                 activation=Relu, style='batchless',
                 init=init_he_normal):
+    if style not in _mr_styles:
+        raise Exception('unknown resnet style', style)
+
     y = x
     last_size = x.output_shape[0]
 
-    FC = lambda size: Dense(size, init)
-    #FC = lambda size: DenseOneLess(size, init)
-
     for d in range(depth):
         size = width
+        FC = lambda: Dense(size, init)
 
         if last_size != size:
-            y = y.feed(Dense(size, init))
+            y = y.feed(FC())
 
-        if style == 'batchless':
-            skip = y
-            merger = Sum()
-            skip.feed(merger)
-            z_start = skip.feed(activation())
-            for i in range(multi):
-                z = z_start
-                for i in range(block):
-                    if i > 0:
-                        z = z.feed(activation())
-                    z = z.feed(FC(size))
-                z.feed(merger)
-            y = merger
-        elif style == 'onelesssum':
-            # this is my own awful contraption.
-            is_last = d + 1 == depth
-            needs_sum = not is_last or multi > 1
-            skip = y
-            if needs_sum:
-                merger = Sum()
-            if not is_last:
-                skip.feed(merger)
-            z_start = skip.feed(activation())
-            for i in range(multi):
-                z = z_start
-                for i in range(block):
-                    if i > 0:
-                        z = z.feed(activation())
-                    z = z.feed(FC(size))
-                if needs_sum:
-                    z.feed(merger)
-            if needs_sum:
-                y = merger
-            else:
-                y = z
-        else:
-            raise Exception('unknown resnet style', style)
+        y = _mr_styles[style](y, width, depth, block, multi, activation, style, FC, d)
 
         last_size = size
 
@@ -260,17 +272,17 @@ def normalize_data(data, mean=None, std=None):
 def toy_data(train_samples, valid_samples, problem=2):
     total_samples = train_samples + valid_samples
 
+    nod = normalize_data # shorthand to keep a sane indentation
+
     if problem == 0:
-        from ml.cie_mlp_data import rgbcompare, input_samples, output_samples, \
-             inputs, outputs, valid_inputs, valid_outputs, \
-             x_scale, y_scale
+        from ml.cie_mlp_data import inputs, outputs, valid_inputs, valid_outputs
 
         inputs, outputs = _f(inputs), _f(outputs)
         valid_inputs, valid_outputs = _f(valid_inputs), _f(valid_outputs)
 
-        normalize_data(inputs, 127.5, 73.9)
-        normalize_data(outputs, 44.8, 21.7)
-        normalize_data(valid_inputs, 127.5, 73.9)
-        normalize_data(valid_outputs, 44.8, 21.7)
+        nod(inputs, 127.5, 73.9)
+        nod(outputs, 44.8, 21.7)
+        nod(valid_inputs, 127.5, 73.9)
+        nod(valid_outputs, 44.8, 21.7)
 
     elif problem == 1:
         from sklearn.datasets import make_friedman1
@@ -278,8 +290,8 @@ def toy_data(train_samples, valid_samples, problem=2):
         inputs, outputs = _f(inputs), _f(outputs)
         outputs = np.expand_dims(outputs, -1)
 
-        normalize_data(inputs, 0.5, 1/np.sqrt(12))
-        normalize_data(outputs, 14.4, 4.9)
+        nod(inputs, 0.5, 1/np.sqrt(12))
+        nod(outputs, 14.4, 4.9)
 
     elif problem == 2:
         from sklearn.datasets import make_friedman2
@@ -287,11 +299,11 @@ def toy_data(train_samples, valid_samples, problem=2):
         inputs, outputs = _f(inputs), _f(outputs)
         outputs = np.expand_dims(outputs, -1)
 
-        normalize_data(inputs,
-          [5.00e+01, 9.45e+02, 5.01e-01, 5.98e+00],
-          [2.89e+01, 4.72e+02, 2.89e-01, 2.87e+00])
+        nod(inputs,
+            [5.00e+01, 9.45e+02, 5.01e-01, 5.98e+00],
+            [2.89e+01, 4.72e+02, 2.89e-01, 2.87e+00])
 
-        normalize_data(outputs, [482], [380])
+        nod(outputs, [482], [380])
 
     elif problem == 3:
         from sklearn.datasets import make_friedman3
@@ -299,11 +311,11 @@ def toy_data(train_samples, valid_samples, problem=2):
         inputs, outputs = _f(inputs), _f(outputs)
         outputs = np.expand_dims(outputs, -1)
 
-        normalize_data(inputs,
-          [4.98e+01, 9.45e+02, 4.99e-01, 6.02e+00],
-          [2.88e+01, 4.73e+02, 2.90e-01, 2.87e+00])
+        nod(inputs,
+            [4.98e+01, 9.45e+02, 4.99e-01, 6.02e+00],
+            [2.88e+01, 4.73e+02, 2.90e-01, 2.87e+00])
 
-        normalize_data(outputs, [1.32327931], [0.31776295])
+        nod(outputs, [1.32327931], [0.31776295])
 
     else:
         raise Exception("unknown toy data set", problem)
@@ -341,9 +353,6 @@ def model_from_config(config, input_features, output_features, callbacks):
 
     #
 
-    # FIXME: unused variable
-    training = config.epochs > 0 and config.restarts >= 0
-
     if config.fn_load is not None:
         log('loading weights', config.fn_load)
         model.load_weights(config.fn_load)
@@ -390,7 +399,8 @@ def model_from_config(config, input_features, output_features, callbacks):
     elif config.learner == 'dumb':
         learner = DumbLearner(optim, epochs=config.epochs, rate=config.learn,
                               halve_every=config.learn_halve_every,
-                              restarts=config.restarts, restart_advance=config.learn_restart_advance,
+                              restarts=config.restarts,
+                              restart_advance=config.learn_restart_advance,
                               callback=rscb)
         log("final learning rate", "{:10.8f}".format(learner.final_rate))
     elif config.learner == 'sgd':
@@ -430,11 +440,12 @@ def model_from_config(config, input_features, output_features, callbacks):
 
     #
 
-    return model, learner, ritual, (loss, mloss)
+    return model, learner, ritual
 
 # main program {{{1
 
-def run(program, args=[]):
+def run(program, args=None):
+    args = args if args else []
 
     np.random.seed(42069)
 
@@ -469,7 +480,7 @@ def run(program, args=[]):
         epochs = 24,
         restarts = 2,
         restart_decay = 0.25, # only used with SGDR
-        expando = lambda i: i + 1,
+        expando = lambda i: 24 * i,
 
         # misc
         init = 'he_normal',
@@ -484,15 +495,17 @@ def run(program, args=[]):
            # training/validation pairs for each problem (starting from problem 0):
            #(5.08e-05, 6.78e-05),
            (7.577717e-04, 1.255284e-03),
-           (3.032806e-06, 3.963775e-06),
-           (3.676451e-07, 4.495362e-07),
-           (1.854613e-05, 1.623881e-05)
+           # 1080 epochs on these...
+           (1.790511e-07, 2.785208e-07),
+           (2.233277e-08, 3.580281e-08),
+           (5.266719e-07, 5.832677e-06), # overfitting? bad valid set?
         ),
 
         unsafe = True, # aka gotta go fast mode
     )
 
-    for k in ['parallel_style', 'activation', 'optim', 'learner', 'init', 'loss', 'mloss', 'ritual']:
+    for k in ['parallel_style', 'activation', 'optim', 'learner',
+              'init', 'loss', 'mloss', 'ritual']:
         config[k] = config[k].lower()
     config.pprint()
@@ -507,20 +520,16 @@ def run(program, args=[]):
 
     callbacks = Dummy()
 
-    model, learner, ritual, (loss, mloss) = \
+    model, learner, ritual = \
         model_from_config(config, input_features, output_features, callbacks)
 
     # Model Information
 
-    if 0:
-        node_names = ' '.join([str(node) for node in model.ordered_nodes])
-        log('{} nodes'.format(len(model.ordered_nodes)), node_names)
-    else:
-        for node in model.ordered_nodes:
-            children = [str(n) for n in node.children]
-            if len(children) > 0:
-                sep = '->'
-                print(str(node)+sep+('\n'+str(node)+sep).join(children))
+    for node in model.ordered_nodes:
+        children = [str(n) for n in node.children]
+        if children:
+            sep = '->'
+            print(str(node)+sep+('\n'+str(node)+sep).join(children))
 
     log('parameters', model.param_count)
 
     # Training {{{2
@@ -534,10 +543,8 @@ def run(program, args=[]):
         predicted = model.forward(inputs)
         err = ritual.measure(predicted, outputs)
         log(name + " loss", "{:12.6e}".format(err))
-        # TODO: print logarithmic difference as it might be more meaningful
-        # (fewer results stuck around -99%)
         if comparison:
-            log("improvement", "{:+7.2f}%".format((comparison / err - 1) * 100))
+            log("improvement", "10**({:+7.4f}) times".format(np.log10(comparison / err)))
         return err
 
     train_err = print_error("train",
@@ -551,10 +558,13 @@ def run(program, args=[]):
 
     callbacks.restart = measure_error
 
-    measure_error()
+    training = config.epochs > 0 and config.restarts >= 0
+
+    if training:
+        measure_error()
 
     ritual.prepare(model)
 
-    while learner.next():
+    while training and learner.next():
         indices = np.arange(inputs.shape[0])
         np.random.shuffle(indices)
         shuffled_inputs = inputs[indices]
@@ -573,11 +583,11 @@ def run(program, args=[]):
 
     measure_error()
 
-    if config.fn_save is not None:
+    if training and config.fn_save is not None:
         log('saving weights', config.fn_save)
         model.save_weights(config.fn_save, overwrite=True)
 
-    if config.log_fn is not None:
+    if training and config.log_fn is not None:
         log('saving losses', config.log_fn)
         np.savez_compressed(config.log_fn,
                             batch_losses=np.array(batch_losses, dtype=_f),
@@ -592,5 +602,4 @@ def run(program, args=[]):
 # run main program {{{1
 
 if __name__ == '__main__':
-    import sys
     sys.exit(run(sys.argv[0], sys.argv[1:]))
diff --git a/optim_nn_core.py b/optim_nn_core.py
index 266241b..3beacc6 100644
--- a/optim_nn_core.py
+++ b/optim_nn_core.py
@@ -48,19 +48,20 @@ class CategoricalCrossentropy(Loss):
         # TODO: assert dimensionality and p > 0 (if not self.unsafe?)
         p = np.clip(p, self.eps, 1 - self.eps)
         f = np.sum(-y * np.log(p) - (1 - y) * np.log(1 - p), axis=-1)
-        return np.mean(f, axis=-1)
+        return np.mean(f)
 
     def dF(self, p, y):
         p = np.clip(p, self.eps, 1 - self.eps)
         df = (p - y) / (p * (1 - p))
-        return df / y.shape[-1]
+        return df / len(y)
 
 class ResidualLoss(Loss):
     def F(self, p, y): # mean
         return np.mean(self.f(p - y))
 
     def dF(self, p, y): # dmean
-        return self.df(p - y) / y.shape[-1]
+        ret = self.df(p - y) / len(y)
+        return ret
 
 class Squared(ResidualLoss):
     def f(self, r):
@@ -80,7 +81,7 @@ class Absolute(ResidualLoss):
 
 class Optimizer:
     def __init__(self, alpha=0.1):
-        self.alpha = _f(alpha)
+        self.alpha = _f(alpha) # learning rate
         self.reset()
 
     def reset(self):
@@ -97,12 +98,11 @@ class Optimizer:
 
 class Momentum(Optimizer):
     def __init__(self, alpha=0.01, lamb=0, mu=0.9, nesterov=False):
-        self.alpha = _f(alpha) # learning rate
         self.lamb = _f(lamb) # weight decay
         self.mu = _f(mu) # momentum
         self.nesterov = bool(nesterov)
 
-        self.reset()
+        super().__init__(alpha)
 
     def reset(self):
         self.dWprev = None
@@ -116,8 +116,7 @@ class Momentum(Optimizer):
             self.dWprev[:] = V
         if self.nesterov: # TODO: is this correct? looks weird
            return self.mu * V - self.alpha * (dW + W * self.lamb)
-        else:
-            return V
+        return V
 
 class RMSprop(Optimizer):
     # RMSprop generalizes* Adagrad, etc.
@@ -127,7 +126,6 @@ class RMSprop(Optimizer):
     # RMSprop.mu == 1
 
     def __init__(self, alpha=0.0001, mu=0.99, eps=1e-8):
-        self.alpha = _f(alpha) # learning rate
         self.mu = _f(mu) # decay term
         self.eps = _f(eps)
 
@@ -138,7 +136,7 @@ class RMSprop(Optimizer):
         # an input decays to 1/e its original amplitude over 99.5 epochs.
         # (this is from DSP, so how relevant it is in SGD is debatable)
 
-        self.reset()
+        super().__init__(alpha)
 
     def reset(self):
         self.g = None
@@ -168,14 +166,13 @@ class Adam(Optimizer):
     # Adam.b2_t == 0
 
     def __init__(self, alpha=0.001, b1=0.9, b2=0.999, b1_t=0.9, b2_t=0.999, eps=1e-8):
-        self.alpha = _f(alpha) # learning rate
         self.b1 = _f(b1) # decay term
         self.b2 = _f(b2) # decay term
         self.b1_t_default = _f(b1_t) # decay term power t
         self.b2_t_default = _f(b2_t) # decay term power t
         self.eps = _f(eps)
 
-        self.reset()
+        super().__init__(alpha)
 
     def reset(self):
         self.mt = None
@@ -249,14 +246,7 @@ class Layer:
     def dmulti(self, dB):
         if len(dB) == 1:
             return self.dF(dB[0])
-        else:
-            dX = None
-            for dY in dB:
-                if dX is None:
-                    dX = self.dF(dY)
-                else:
-                    dX += self.dF(dY)
-            return dX
+        return sum((self.dF(dY) for dY in dB))
 
     # general utility methods:
 
@@ -267,10 +257,7 @@ class Layer:
         if shape is None:
             return False
         self.input_shape = shape
-        if np.all(self.input_shape == parent.output_shape):
-            return True
-        else:
-            return False
+        return np.all(self.input_shape == parent.output_shape)
 
     def feed(self, child):
         if not child.compatible(self):
@@ -288,7 +275,7 @@ class Layer:
 
     def forward(self, lut):
         if not self.unsafe:
-            assert len(self.parents) > 0, self
+            assert self.parents, self
         B = []
         for parent in self.parents:
             # TODO: skip over irrelevant nodes (if any)
@@ -303,7 +290,7 @@ class Layer:
 
     def backward(self, lut):
         if not self.unsafe:
-            assert len(self.children) > 0, self
+            assert self.children, self
         dB = []
         for child in self.children:
             # TODO: skip over irrelevant nodes (if any)
@@ -643,8 +630,7 @@ class Ritual: # i'm just making up names at this point
         avg_loss = cumsum_loss / _f(batch_count)
         if return_losses:
             return avg_loss, losses
-        else:
-            return avg_loss
+        return avg_loss
 
 # Learners {{{1
 
@@ -734,12 +720,12 @@ class SGDR(Learner):
         self.restarts = int(restarts)
         self.restart_callback = callback
         # TODO: rename expando to something not insane
-        self.expando = expando if expando is not None else lambda i: 1
+        self.expando = expando if expando is not None else lambda i: i
 
         self.splits = []
         epochs = 0
         for i in range(0, self.restarts + 1):
-            split = epochs + int(self.restart_epochs * self.expando(i))
+            split = epochs + self.restart_epochs + int(self.expando(i))
             self.splits.append(split)
             epochs = split
         super().__init__(optim, epochs, rate)