From 5a4b3d29343a61eb135d3d6d923d3ee5e88a0fc2 Mon Sep 17 00:00:00 2001
From: Connor Olding
Date: Wed, 21 Jun 2017 12:20:27 +0000
Subject: [PATCH] add Follow The Moving Leader optimizer

---
 optim_nn.py | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 64 insertions(+), 2 deletions(-)

diff --git a/optim_nn.py b/optim_nn.py
index 59fc68b..900d24b 100755
--- a/optim_nn.py
+++ b/optim_nn.py
@@ -105,6 +105,58 @@ class SaturateRelu(Regularizer):
     def backward(self, X):
         return self.lamb * np.where(X >= 0, 1, 0)
 
+# Optimizers {{{1
+
+class FTML(Optimizer):
+    # paper: http://www.cse.ust.hk/~szhengac/papers/icml17.pdf
+    # author's implementation: https://github.com/szhengac/optim/commit/923555e
+
+    def __init__(self, alpha=0.0025, b1=0.6, b2=0.999, eps=1e-8):
+        self.iterations = _0
+        self.b1 = _f(b1) # decay term
+        self.b2 = _f(b2) # decay term
+        self.eps = _f(eps)
+
+        super().__init__(alpha)
+
+    def reset(self):
+        self.dt1 = None
+        self.dt = None
+        self.vt = None
+        self.zt = None
+        self.b1_t = _1
+        self.b2_t = _1
+
+    def compute(self, dW, W):
+        if self.dt1 is None: self.dt1 = np.zeros_like(dW)
+        if self.dt is None: self.dt = np.zeros_like(dW)
+        if self.vt is None: self.vt = np.zeros_like(dW)
+        if self.zt is None: self.zt = np.zeros_like(dW)
+
+        # NOTE: we could probably rewrite these equations to avoid this copy.
+        self.dt1[:] = self.dt[:]
+
+        self.b1_t *= self.b1
+        self.b2_t *= self.b2
+
+        # hardly an elegant solution.
+        alpha = max(self.alpha, self.eps)
+
+        # same as Adam's vt.
+        self.vt[:] = self.b2 * self.vt + (1 - self.b2) * dW * dW
+
+        # you can factor "inner" out of Adam as well.
+        inner = np.sqrt(self.vt / (1 - self.b2_t)) + self.eps
+        self.dt[:] = (1 - self.b1_t) / alpha * inner
+
+        sigma_t = self.dt - self.b1 * self.dt1
+
+        # Adam's mt minus the sigma term.
+        self.zt[:] = self.b1 * self.zt + (1 - self.b1) * dW - sigma_t * W
+
+        # subtract the weights to avoid having to override self.update.
+        return -self.zt / self.dt - W
+
 # Nonparametric Layers {{{1
 
 class AlphaDropout(Layer):
@@ -635,6 +687,12 @@ def optim_from_config(config):
         b2 = np.exp(-1/d2)
         o = Nadam if config.nesterov else Adam
         optim = o(b1=b1, b2=b2)
+    elif config.optim == 'ftml':
+        d1 = config.optim_decay1 if 'optim_decay1' in config else 9.5
+        d2 = config.optim_decay2 if 'optim_decay2' in config else 999.5
+        b1 = np.exp(-1/d1)
+        b2 = np.exp(-1/d2)
+        optim = FTML(b1=b1, b2=b2)
     elif config.optim in ('rms', 'rmsprop'):
         d2 = config.optim_decay2 if 'optim_decay2' in config else 99.5
         mu = np.exp(-1/d2)
@@ -764,8 +822,12 @@ def run(program, args=None):
         # style of resnet (order of layers, which layers, etc.)
         parallel_style = 'onelesssum',
 
-        activation = 'selu',
+        activation = 'gelu',
 
+        #optim = 'ftml',
+        #optim_decay1 = 2,
+        #optim_decay2 = 100,
+        #nesterov = False,
         optim = 'adam', # note: most features only implemented for Adam
         optim_decay1 = 24, # first momentum given in epochs (optional)
         optim_decay2 = 100, # second momentum given in epochs (optional)
@@ -782,7 +844,7 @@ def run(program, args=None):
         expando = lambda i: 24 * i,
 
         # misc
-        init = 'gaussian_unit',
+        init = 'he_normal',
         loss = 'mse',
         mloss = 'mse',
         ritual = 'default',
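
Reviewer note: to make the math in the diff easier to check, here is a minimal,
self-contained NumPy sketch of the same FTML update, transcribed step for step
from the patch. The function name ftml_reference_step and its state-tuple
layout are invented for this note only; they are not part of optim_nn.py.

    import numpy as np

    def ftml_reference_step(W, dW, state, alpha=0.0025, b1=0.6, b2=0.999,
                            eps=1e-8):
        # One FTML step. state = (vt, zt, dt, b1_t, b2_t), initialized to
        # (zeros, zeros, zeros, 1.0, 1.0), matching FTML.reset() in the patch.
        vt, zt, dt, b1_t, b2_t = state
        b1_t *= b1  # running b1**t
        b2_t *= b2  # running b2**t

        dt1 = dt                                    # d_{t-1}
        vt = b2 * vt + (1 - b2) * dW * dW           # second moment, as in Adam
        dt = (1 - b1_t) / alpha * (np.sqrt(vt / (1 - b2_t)) + eps)
        sigma = dt - b1 * dt1
        zt = b1 * zt + (1 - b1) * dW - sigma * W    # Adam's mt minus the sigma term
        W_new = -zt / dt                            # follow the moving leader

        return W_new, (vt, zt, dt, b1_t, b2_t)

    # usage sketch:
    W = np.zeros(3)
    state = (np.zeros(3), np.zeros(3), np.zeros(3), 1.0, 1.0)
    W, state = ftml_reference_step(W, np.array([0.1, -0.2, 0.3]), state)

Where this sketch assigns -zt / dt directly, the class in the patch returns
-self.zt / self.dt - W; per its final comment, that makes the base Optimizer's
(presumably additive) update land on the same value without overriding
self.update.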