From 166644023edd6a70c2982ea4ea1c7f21fda4dda3 Mon Sep 17 00:00:00 2001
From: Connor Olding
Date: Sun, 26 Feb 2017 07:41:38 +0000
Subject: [PATCH] .

---
 optim_nn.py      | 65 +++++++++++++++++++++++++++++++++++++-----------
 optim_nn_core.py | 48 ++++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 97 insertions(+), 16 deletions(-)

diff --git a/optim_nn.py b/optim_nn.py
index 3cd1b3a..9a6d6ca 100644
--- a/optim_nn.py
+++ b/optim_nn.py
@@ -132,6 +132,38 @@ class DenseOneLess(Dense):
         np.fill_diagonal(self.dcoeffs, 0)
         return dX
 
+class CosineDense(Dense):
+    # paper: https://arxiv.org/abs/1702.05870
+    # another implementation: https://github.com/farizrahman4u/keras-contrib/pull/36
+    # the paper doesn't mention bias,
+    # so we treat bias as an additional weight with a constant input of 1.
+    # this is correct in Dense layers, so i hope it's correct here too.
+
+    eps = 1e-4
+
+    def F(self, X):
+        self.X = X
+        self.X_norm = np.sqrt(np.square(X).sum(-1, keepdims=True) \
+                              + 1 + self.eps)
+        self.W_norm = np.sqrt(np.square(self.coeffs).sum(0, keepdims=True) \
+                              + np.square(self.biases) + self.eps)
+        self.dot = X.dot(self.coeffs) + self.biases
+        Y = self.dot / (self.X_norm * self.W_norm)
+        return Y
+
+    def dF(self, dY):
+        ddot = dY / self.X_norm / self.W_norm
+        dX_norm = -(dY * self.dot / self.W_norm).sum(-1, keepdims=True) / self.X_norm**2
+        dW_norm = -(dY * self.dot / self.X_norm).sum( 0, keepdims=True) / self.W_norm**2
+
+        self.dcoeffs[:] = self.X.T.dot(ddot) \
+                        + dW_norm / self.W_norm * self.coeffs
+        self.dbiases[:] = ddot.sum(0, keepdims=True) \
+                        + dW_norm / self.W_norm * self.biases
+        dX = ddot.dot(self.coeffs.T) + dX_norm / self.X_norm * self.X
+
+        return dX
+
 # Rituals {{{1
 
 def stochastic_multiply(W, gamma=0.5, allow_negation=True):
@@ -300,7 +332,6 @@ def _mr_onelesssum(y, width, depth, block, multi, activation, style, FC, d):
             y = merger
         else:
             y = z
-        #y = y.feed(LayerNorm())
 
     return y
 
 _mr_styles = dict(
@@ -312,6 +343,11 @@ def multiresnet(x, width, depth, block=2, multi=1,
                 activation=Relu, style='batchless', init=init_he_normal):
+    if style == 'cossim':
+        style = 'batchless'
+        DenseClass = CosineDense
+    else:
+        DenseClass = Dense
     if style not in _mr_styles:
         raise Exception('unknown resnet style', style)
 
@@ -320,7 +356,7 @@
 
     for d in range(depth):
         size = width
-        FC = lambda: Dense(size, init)
+        FC = lambda: DenseClass(size, init)
 
         if last_size != size:
             y = y.feed(FC())
@@ -433,8 +469,10 @@ def optim_from_config(config):
         mu = np.exp(-1/d2)
         optim = RMSprop(mu=mu)
     elif config.optim == 'sgd':
-        if config.momentum != 0:
-            optim = Momentum(mu=config.momentum, nesterov=config.nesterov)
+        d1 = config.optim_decay1 if 'optim_decay1' in config else 0
+        if d1 > 0:
+            b1 = np.exp(-1/d1)
+            optim = Momentum(mu=b1, nesterov=config.nesterov)
         else:
             optim = Optimizer()
     else:
@@ -453,6 +491,7 @@ def learner_from_config(config, optim, rscb):
     elif config.learner == 'anneal':
         learner = AnnealingLearner(optim, epochs=config.epochs, rate=config.learn,
                                    halve_every=config.learn_halve_every)
+        log("final learning rate", "{:10.8f}".format(learner.final_rate))
     elif config.learner == 'dumb':
         learner = DumbLearner(optim, epochs=config.epochs, rate=config.learn,
                               halve_every=config.learn_halve_every,
@@ -462,7 +501,6 @@
         log("final learning rate", "{:10.8f}".format(learner.final_rate))
     elif config.learner == 'sgd':
         learner = Learner(optim, epochs=config.epochs, rate=config.learn)
-        log("final learning rate", "{:10.8f}".format(learner.final_rate))
     else:
         raise Exception('unknown learner', config.learner)
 
@@ -559,11 +597,10 @@ def run(program, args=None):
         parallel_style = 'onelesssum',
         activation = 'gelu',
 
-        optim = 'adam',
-        optim_decay1 = 2,   # given in epochs (optional)
-        optim_decay2 = 100, # given in epochs (optional)
-        momentum = 0.90,    # only used with SGD
-        nesterov = True,    # only used with SGD or Adam
+        optim = 'adam',     # note: most features only implemented for Adam
+        optim_decay1 = 2,   # first momentum given in epochs (optional)
+        optim_decay2 = 100, # second momentum given in epochs (optional)
+        nesterov = True,
         batch_size = 64,
 
         # learning parameters
@@ -571,7 +608,7 @@ def run(program, args=None):
         learn = 1e-2,
         epochs = 24,
         learn_halve_every = 16, # only used with anneal/dumb
-        restarts = 2,
+        restarts = 8,
         restart_decay = 0.25,   # only used with SGDR
         expando = lambda i: 24 * i,
 
@@ -585,9 +622,9 @@ def run(program, args=None):
 
         # logging/output
         log10_loss = True, # personally, i'm sick of looking linear loss values!
-        #fancy_logs = True, # unimplemented
+        #fancy_logs = True, # unimplemented (can't turn it off yet)
 
-        problem = 3,
+        problem = 2,
         compare = (
             # best results for ~10,000 parameters
             # training/validation pairs for each problem (starting from problem 0):
@@ -595,7 +632,7 @@ def run(program, args=None):
             (7.577717e-04, 1.255284e-03),
             # 1080 epochs on these...
             (1.790511e-07, 2.785208e-07),
-            (2.233277e-08, 3.580281e-08),
+            ( 10**-7.774, 10**-7.626),
            (5.266719e-07, 5.832677e-06), # overfitting? bad valid set?
         ),
 
diff --git a/optim_nn_core.py b/optim_nn_core.py
index 0f3e576..0afb073 100644
--- a/optim_nn_core.py
+++ b/optim_nn_core.py
@@ -33,6 +33,14 @@ def init_he_uniform(size, ins, outs):
     s = np.sqrt(6 / ins)
     return np.random.uniform(-s, s, size=size)
 
+def init_glorot_normal(size, ins, outs):
+    s = np.sqrt(2 / (ins + outs))
+    return np.random.normal(0, s, size=size)
+
+def init_glorot_uniform(size, ins, outs):
+    s = np.sqrt(6 / (ins + outs))
+    return np.random.uniform(-s, s, size=size)
+
 # Loss functions {{{1
 
 class Loss:
@@ -162,8 +170,6 @@ class Adam(Optimizer):
     # * Adam == RMSprop when
     #   Adam.b1 == 0
     #   Adam.b2 == RMSprop.mu
-    #   Adam.b1_t == 0
-    #   Adam.b2_t == 0
 
     def __init__(self, alpha=0.002, b1=0.9, b2=0.999, eps=1e-8):
         self.b1 = _f(b1) # decay term
@@ -373,6 +379,20 @@ class Input(Layer):
         #self.dY = dY
         return np.zeros_like(dY)
 
+class Reshape(Layer):
+    def __init__(self, new_shape):
+        super().__init__()
+        self.shape = tuple(new_shape)
+        self.output_shape = self.shape
+
+    def F(self, X):
+        self.batch_size = X.shape[0]
+        return X.reshape(self.batch_size, *self.output_shape)
+
+    def dF(self, dY):
+        assert dY.shape[0] == self.batch_size
+        return dY.reshape(self.batch_size, *self.input_shape)
+
 class Affine(Layer):
     def __init__(self, a=1, b=0):
         super().__init__()
@@ -666,6 +686,30 @@ class Ritual: # i'm just making up names at this point
             return avg_loss, losses
         return avg_loss
 
+    def test_batched(self, inputs, outputs, batch_size, return_losses=False):
+        cumsum_loss = _0
+        batch_count = inputs.shape[0] // batch_size
+        losses = []
+        assert inputs.shape[0] % batch_size == 0, \
+          "inputs is not evenly divisible by batch_size" # TODO: lift this restriction
+        for b in range(batch_count):
+            bi = b * batch_size
+            batch_inputs  = inputs[ bi:bi+batch_size]
+            batch_outputs = outputs[bi:bi+batch_size]
+
+            predicted = self.model.forward(batch_inputs)
+
+            batch_loss = self.measure(predicted, batch_outputs)
+            if np.isnan(batch_loss):
+                raise Exception("nan")
+            cumsum_loss += batch_loss
+            if return_losses:
+                losses.append(batch_loss)
+        avg_loss = cumsum_loss / _f(batch_count)
+        if return_losses:
+            return avg_loss, losses
+        return avg_loss
+
 # Learners {{{1
 
 class Learner:
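
A minimal standalone sketch (not part of the patch) of what CosineDense.F computes: the cosine
similarity between the bias-augmented input [X, 1] and each bias-augmented weight column [W; b].
The shapes below and the eps default are assumptions based on the Dense layer conventions above.

import numpy as np

def cosine_dense_forward(X, W, b, eps=1e-4):
    # mirrors CosineDense.F: normalize the bias-augmented input and weights,
    # then take their dot product
    X_norm = np.sqrt(np.square(X).sum(-1, keepdims=True) + 1 + eps)
    W_norm = np.sqrt(np.square(W).sum(0, keepdims=True) + np.square(b) + eps)
    return (X.dot(W) + b) / (X_norm * W_norm)

X = np.random.randn(8, 5)
W = np.random.randn(5, 3)
b = np.random.randn(1, 3)

# reference: explicit cosine similarity on the augmented vectors
Xa = np.hstack([X, np.ones((8, 1))])  # constant input of 1 for the bias
Wa = np.vstack([W, b])                # bias treated as an extra weight row
ref = (Xa.dot(Wa)
       / np.linalg.norm(Xa, axis=1, keepdims=True)
       / np.linalg.norm(Wa, axis=0, keepdims=True))

print(np.allclose(cosine_dense_forward(X, W, b, eps=0), ref))  # True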
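
A small sketch of the exp(-1/d) mapping that optim_from_config uses for the optim_decay1 /
optim_decay2 settings above: a decay horizon d becomes a coefficient b = exp(-1/d), so whatever
the coefficient decays loses a factor of 1/e of its weight after d applications (the config
comments measure d in epochs). The printed values are illustrative, not from the patch.

import numpy as np

for d in (2, 100):          # the optim_decay1 / optim_decay2 values above
    b = np.exp(-1 / d)      # e.g. the mu handed to Momentum in optim_from_config
    print(d, b, b ** d)     # b**d == exp(-1) ~= 0.3679 regardless of d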