add Softplus and LeCunTanh; use LeCunTanh

parent 0332c2662b
commit d3c23912c1

2 changed files with 34 additions and 8 deletions
16  optim_nn.py
16  optim_nn.py
@@ -468,8 +468,10 @@ def multiresnet(x, width, depth, block=2, multi=1,
 
 # Toy Data {{{1
 
-inits = dict(he_normal=init_he_normal, he_uniform=init_he_uniform)
-activations = dict(sigmoid=Sigmoid, tanh=Tanh, relu=Relu, elu=Elu, gelu=GeluApprox)
+inits = dict(he_normal=init_he_normal, he_uniform=init_he_uniform,
+             glorot_normal=init_glorot_normal, glorot_uniform=init_glorot_uniform)
+activations = dict(sigmoid=Sigmoid, tanh=Tanh, lecun=LeCunTanh,
+                   relu=Relu, elu=Elu, gelu=GeluApprox, softplus=Softplus)
 
 def prettyize(data):
     if isinstance(data, np.ndarray):
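For context on this hunk: inits and activations are plain name-to-class registries, so a config string such as 'lecun' or 'glorot_uniform' can be resolved to the corresponding class at run time. A minimal sketch of that lookup pattern, assuming the dicts above are in scope (resolve_activation is a hypothetical helper, not part of optim_nn.py):

def resolve_activation(name):
    # look up an activation Layer subclass by its config string and instantiate it.
    try:
        return activations[name]()
    except KeyError:
        raise ValueError('unknown activation: {}'.format(name))

act = resolve_activation('lecun')  # -> a LeCunTanh instance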
@@ -692,10 +694,10 @@ def run(program, args=None):
 
         # style of resnet (order of layers, which layers, etc.)
         parallel_style = 'onelesssum',
-        activation = 'gelu',
+        activation = 'lecun',
 
         optim = 'adam', # note: most features only implemented for Adam
-        optim_decay1 = 2, # first momentum given in epochs (optional)
+        optim_decay1 = 24, # first momentum given in epochs (optional)
         optim_decay2 = 100, # second momentum given in epochs (optional)
         nesterov = True,
         batch_size = 64,
@@ -705,13 +707,13 @@ def run(program, args=None):
         learn = 1e-2,
         epochs = 24,
         learn_halve_every = 16, # only used with anneal/dumb
-        restarts = 8,
+        restarts = 5,
         restart_decay = 0.25, # only used with SGDR
         expando = lambda i: 24 * i,
 
         # misc
-        init = 'he_normal',
-        loss = 'msee',
+        init = 'glorot_uniform',
+        loss = 'mse',
         mloss = 'mse',
         ritual = 'default',
         restart_optim = False, # restarts also reset internal state of optimizer
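A note on the optim_decay settings changed above: per the comments, the momenta are given in epochs rather than as per-step coefficients like Adam's usual beta1 = 0.9, so they must be converted using the number of batches per epoch. One plausible conversion, shown only as an illustration and not necessarily what optim_nn.py actually does, interprets the value as a half-life in epochs:

def beta_from_epochs(decay_epochs, steps_per_epoch):
    # hypothetical helper: pick beta so the running average's weight on
    # old gradients halves once every `decay_epochs` epochs.
    return 0.5 ** (1.0 / (decay_epochs * steps_per_epoch))

beta1 = beta_from_epochs(24, 100)  # optim_decay1 = 24 at 100 batches/epoch -> ~0.99971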
@@ -570,7 +570,7 @@ class Dropout(Layer):
 
 # Activation Layers {{{2
 
-class Sigmoid(Layer): # aka Logistic
+class Sigmoid(Layer): # aka Logistic, Expit (inverse of Logit)
     def forward(self, X):
         self.sig = sigmoid(X)
         return self.sig
@@ -578,6 +578,16 @@ class Sigmoid(Layer): # aka Logistic
     def backward(self, dY):
         return dY * self.sig * (1 - self.sig)
 
+class Softplus(Layer):
+    # integral of Sigmoid.
+
+    def forward(self, X):
+        self.X = X
+        return np.log(1 + np.exp(X))
+
+    def backward(self, dY):
+        return dY * sigmoid(self.X)
+
 class Tanh(Layer):
     def forward(self, X):
         self.sig = np.tanh(X)
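The comment on Softplus above is the whole story: softplus(x) = log(1 + exp(x)) is an antiderivative of the logistic sigmoid, so the backward pass only scales dY by sigmoid(X). A small standalone check of that gradient, independent of the Layer class (np.logaddexp(0, x) is used here because the naive log(1 + exp(x)) overflows for large positive x):

import numpy as np

def softplus(x):
    # numerically stable log(1 + exp(x))
    return np.logaddexp(0.0, x)

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

x = np.linspace(-4.0, 4.0, 9)
eps = 1e-6
numerical = (softplus(x + eps) - softplus(x - eps)) / (2 * eps)  # central difference
assert np.allclose(numerical, sigmoid(x), atol=1e-5)             # matches backward()'s factor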
@@ -586,6 +596,20 @@ class Tanh(Layer):
     def backward(self, dY):
         return dY * (1 - self.sig * self.sig)
 
+class LeCunTanh(Layer):
+    # paper: http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf
+    # paper: http://yann.lecun.com/exdb/publis/pdf/lecun-89.pdf
+    # scaled such that f([-1, 1]) = [-1, 1].
+    # helps preserve an input variance of 1.
+    # second derivative peaks around an input of ±1.
+
+    def forward(self, X):
+        self.sig = np.tanh(2 / 3 * X)
+        return 1.7159 * self.sig
+
+    def backward(self, dY):
+        return dY * (2 / 3 * 1.7159) * (1 - self.sig * self.sig)
+
 class Relu(Layer):
     def forward(self, X):
         self.cond = X >= 0
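The constants in LeCunTanh above come from LeCun's recommended squashing function f(x) = 1.7159 * tanh(2/3 * x): 1.7159 is approximately 1 / tanh(2/3), which is exactly what pins f(1) = 1 and f(-1) = -1 as the comment says. A quick standalone check of that constant and of the gain used in backward():

import numpy as np

print(1.0 / np.tanh(2.0 / 3.0))            # ~1.71593, the magic constant

def lecun_tanh(x):
    return 1.7159 * np.tanh(2.0 / 3.0 * x)

print(lecun_tanh(np.array([-1.0, 1.0])))   # ~[-1.  1.], so f([-1, 1]) = [-1, 1]
print(2.0 / 3.0 * 1.7159)                  # ~1.144, the slope at the origin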