diff --git a/optim_nn.py b/optim_nn.py
index 0858d09..6ae5219 100755
--- a/optim_nn.py
+++ b/optim_nn.py
@@ -468,8 +468,10 @@ def multiresnet(x, width, depth, block=2, multi=1,
 
 # Toy Data {{{1
 
-inits = dict(he_normal=init_he_normal, he_uniform=init_he_uniform)
-activations = dict(sigmoid=Sigmoid, tanh=Tanh, relu=Relu, elu=Elu, gelu=GeluApprox)
+inits = dict(he_normal=init_he_normal, he_uniform=init_he_uniform,
+             glorot_normal=init_glorot_normal, glorot_uniform=init_glorot_uniform)
+activations = dict(sigmoid=Sigmoid, tanh=Tanh, lecun=LeCunTanh,
+                   relu=Relu, elu=Elu, gelu=GeluApprox, softplus=Softplus)
 
 def prettyize(data):
     if isinstance(data, np.ndarray):
@@ -692,10 +694,10 @@ def run(program, args=None):
 
         # style of resnet (order of layers, which layers, etc.)
        parallel_style = 'onelesssum',
-        activation = 'gelu',
+        activation = 'lecun',
         optim = 'adam',
         # note: most features only implemented for Adam
-        optim_decay1 = 2, # first momentum given in epochs (optional)
+        optim_decay1 = 24, # first momentum given in epochs (optional)
         optim_decay2 = 100, # second momentum given in epochs (optional)
         nesterov = True,
         batch_size = 64,
@@ -705,13 +707,13 @@
         learn = 1e-2,
         epochs = 24,
         learn_halve_every = 16, # only used with anneal/dumb
-        restarts = 8,
+        restarts = 5,
         restart_decay = 0.25, # only used with SGDR
         expando = lambda i: 24 * i,
 
         # misc
-        init = 'he_normal',
-        loss = 'msee',
+        init = 'glorot_uniform',
+        loss = 'mse',
         mloss = 'mse',
         ritual = 'default',
         restart_optim = False, # restarts also reset internal state of optimizer
diff --git a/optim_nn_core.py b/optim_nn_core.py
index 86c1f69..feba346 100644
--- a/optim_nn_core.py
+++ b/optim_nn_core.py
@@ -570,7 +570,7 @@ class Dropout(Layer):
 
 # Activation Layers {{{2
 
-class Sigmoid(Layer): # aka Logistic
+class Sigmoid(Layer): # aka Logistic, Expit (inverse of Logit)
     def forward(self, X):
         self.sig = sigmoid(X)
         return self.sig
@@ -578,6 +578,16 @@ class Sigmoid(Layer): # aka Logistic
     def backward(self, dY):
         return dY * self.sig * (1 - self.sig)
 
+class Softplus(Layer):
+    # integral of Sigmoid.
+
+    def forward(self, X):
+        self.X = X
+        return np.log(1 + np.exp(X))
+
+    def backward(self, dY):
+        return dY * sigmoid(self.X)
+
 class Tanh(Layer):
     def forward(self, X):
         self.sig = np.tanh(X)
@@ -586,6 +596,20 @@ class Tanh(Layer):
     def backward(self, dY):
         return dY * (1 - self.sig * self.sig)
 
+class LeCunTanh(Layer):
+    # paper: http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf
+    # paper: http://yann.lecun.com/exdb/publis/pdf/lecun-89.pdf
+    # scaled such that f([-1, 1]) = [-1, 1].
+    # helps preserve an input variance of 1.
+    # second derivative peaks around an input of ±1.
+
+    def forward(self, X):
+        self.sig = np.tanh(2 / 3 * X)
+        return 1.7159 * self.sig
+
+    def backward(self, dY):
+        return dY * (2 / 3 * 1.7159) * (1 - self.sig * self.sig)
+
 class Relu(Layer):
     def forward(self, X):
         self.cond = X >= 0
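
Not part of the patch, just a minimal standalone sketch of the math behind the two new activations (NumPy only, plain functions instead of the repo's Layer classes; all names here are illustrative), with a finite-difference check of the analytic gradients used in backward():

import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def softplus(x):
    # integral of the sigmoid: log(1 + exp(x))
    return np.log(1 + np.exp(x))

def softplus_grad(x):
    # derivative of softplus is the sigmoid itself
    return sigmoid(x)

def lecun_tanh(x):
    # LeCun's scaled tanh: 1.7159 * tanh(2/3 * x), so f(+-1) ~= +-1
    return 1.7159 * np.tanh(2 / 3 * x)

def lecun_tanh_grad(x):
    t = np.tanh(2 / 3 * x)
    return (2 / 3 * 1.7159) * (1 - t * t)

# sanity-check both gradients against central finite differences
x = np.linspace(-3, 3, 7)
eps = 1e-6
num_sp = (softplus(x + eps) - softplus(x - eps)) / (2 * eps)
num_lt = (lecun_tanh(x + eps) - lecun_tanh(x - eps)) / (2 * eps)
assert np.allclose(num_sp, softplus_grad(x), atol=1e-5)
assert np.allclose(num_lt, lecun_tanh_grad(x), atol=1e-5)
print(lecun_tanh(np.array([-1.0, 0.0, 1.0])))  # roughly [-1, 0, 1]

One stability note on the softplus form: np.log(1 + np.exp(X)) overflows for large positive X; np.logaddexp(0, X) computes the same value without overflow, should that ever matter here.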