From 5b2a293b8ce40ab18828a6caea74025b69ee174d Mon Sep 17 00:00:00 2001
From: Connor Olding
Date: Tue, 10 Jan 2017 13:21:26 -0800
Subject: [PATCH] .

---
 optim_nn.py | 67 +++++++++++++++++++++++++++++------------------------
 1 file changed, 37 insertions(+), 30 deletions(-)

diff --git a/optim_nn.py b/optim_nn.py
index 86c50df..c23e448 100644
--- a/optim_nn.py
+++ b/optim_nn.py
@@ -18,6 +18,13 @@ class Loss:
         d = self.df(r)
         return d / len(d)
 
+class Squared(Loss):
+    def f(self, r):
+        return np.square(r)
+
+    def df(self, r):
+        return 2 * r
+
 class SquaredHalved(Loss):
     def f(self, r):
         return np.square(r) / 2
@@ -144,18 +151,18 @@ class Layer:
 
     def multi(self, B):
         assert len(B) == 1, self
-        return self.F(B[0].T).T
+        return self.F(B[0])
 
     def dmulti(self, dB):
         if len(dB) == 1:
-            return self.dF(dB[0].T).T
+            return self.dF(dB[0])
         else:
             dX = None
             for dY in dB:
                 if dX is None:
-                    dX = self.dF(dY.T).T
+                    dX = self.dF(dY)
                 else:
-                    dX += self.dF(dY.T).T
+                    dX += self.dF(dY)
             return dX
 
     # general utility methods:
@@ -300,10 +307,10 @@ class Dense(Layer):
         self.W = W
         self.dW = dW
 
-        self.coeffs = self.W[:self.nW].reshape(outs, ins)
-        self.biases = self.W[self.nW:].reshape(outs, 1)
-        self.dcoeffs = self.dW[:self.nW].reshape(outs, ins)
-        self.dbiases = self.dW[self.nW:].reshape(outs)
+        self.coeffs = self.W[:self.nW].reshape(ins, outs)
+        self.biases = self.W[self.nW:].reshape(1, outs)
+        self.dcoeffs = self.dW[:self.nW].reshape(ins, outs)
+        self.dbiases = self.dW[self.nW:].reshape(1, outs)
 
         # he_normal initialization
         s = np.sqrt(2 / ins)
@@ -321,14 +328,14 @@ class Dense(Layer):
 
     def F(self, X):
         self.X = X
-        Y = self.coeffs.dot(X) \
+        Y = X.dot(self.coeffs) \
           + self.biases
         return Y
 
     def dF(self, dY):
-        dX = self.coeffs.T.dot(dY)
-        self.dcoeffs[:] = dY.dot(self.X.T)
-        self.dbiases[:] = np.sum(dY, axis=1)
+        dX = dY.dot(self.coeffs.T)
+        self.dcoeffs[:] = self.X.T.dot(dY)
+        self.dbiases[:] = np.sum(dY, axis=0, keepdims=True)
         return dX
 
 # Model
@@ -411,8 +418,8 @@ class Model:
         for i in range(len(denses)):
             a, b = i, i + 1
             b_name = "dense_{}".format(b)
-            denses[a].coeffs = weights[b_name+'_W'].T
-            denses[a].biases = np.expand_dims(weights[b_name+'_b'], -1)
+            denses[a].coeffs = weights[b_name+'_W']
+            denses[a].biases = np.expand_dims(weights[b_name+'_b'], 0)
 
     def save_weights(self, fn, overwrite=False):
         raise NotImplementedError("unimplemented", self)
@@ -433,7 +440,7 @@ if __name__ == '__main__':
         # style of resnet
         # only one is implemented so far
         parallel_style = 'batchless',
-        activation = 'gelu',
+        activation = 'relu',
 
         optim = 'adam',
         nesterov = False, # only used with SGD or Adam
@@ -465,8 +472,7 @@ if __name__ == '__main__':
         return inputs, outputs
 
     inputs, outputs = read_data("ml/cie_mlp_data.npz")
-    valid_data = read_data("ml/cie_mlp_vdata.npz")
-    valid_inputs, valid_outputs = valid_data
+    valid_inputs, valid_outputs = read_data("ml/cie_mlp_vdata.npz")
 
     # Our Test Model
 
@@ -520,8 +526,12 @@ if __name__ == '__main__':
     else:
         raise Exception('unknown optimizer', config.optim)
 
-    assert config.loss == 'mse', 'unknown loss function'
-    loss = SquaredHalved()
+    if config.loss == 'mse':
+        loss = Squared()
+    elif config.loss == 'mshe': # mushy
+        loss = SquaredHalved()
+    else:
+        raise Exception('unknown objective', config.loss)
 
     LR = config.LR
     LRprod = 0.5**(1/config.LR_halve_every)
@@ -532,12 +542,14 @@ if __name__ == '__main__':
         predicted = model.forward(inputs / x_scale)
         residual = predicted - outputs / y_scale
         err = loss.mean(residual)
-        print("train loss: {:10.6f}".format(err))
+        print("train loss: {:11.7f}".format(err))
+        print("improvement: {:+7.2f}%".format((0.0007031 / err - 1) * 100))
 
         predicted = model.forward(valid_inputs / x_scale)
         residual = predicted - valid_outputs / y_scale
         err = loss.mean(residual)
-        print("valid loss: {:10.6f}".format(err))
+        print("valid loss: {:11.7f}".format(err))
+        print("improvement: {:+7.2f}%".format((0.0007159 / err - 1) * 100))
 
     for i in range(config.restarts + 1):
         measure_loss()
@@ -554,6 +566,8 @@ if __name__ == '__main__':
             shuffled_inputs = inputs[indices] / x_scale
             shuffled_outputs = outputs[indices] / y_scale
 
+            optim.alpha = LR * LRprod**e
+
             cumsum_loss = 0
             for b in range(batch_count):
                 bi = b * config.batch_size
@@ -561,16 +575,9 @@ if __name__ == '__main__':
                 batch_outputs = shuffled_outputs[bi:bi+config.batch_size]
 
                 predicted = model.forward(batch_inputs)
-                dW = model.backward(np.ones_like(predicted))
                 residual = predicted - batch_outputs
-
-                # TODO: try something like this instead?
-                #err_dW = np.dot(loss.dmean(residual), np.expand_dims(dW, 0))
-                err_dW = loss.df(residual) * dW / len(residual)
-                err_dW = np.sum(err_dW, axis=0)
-
-                optim.alpha = LR * LRprod**e
-                optim.update(err_dW, model.W)
+                dW = model.backward(loss.dmean(residual))
+                optim.update(dW, model.W)
 
                 # note: we don't actually need this for training, only monitoring.
                 cumsum_loss += loss.mean(residual)