diff --git a/optim_nn.py b/optim_nn.py
index edc42dd..16feca9 100644
--- a/optim_nn.py
+++ b/optim_nn.py
@@ -123,7 +123,7 @@ class Layer:
         _layer_counters[kind] += 1
         self.name = "{}_{}".format(kind, _layer_counters[kind])
         self.size = None # total weight count (if any)
-        self.unsafe = False # aka gotta go fast mode
+        self.unsafe = False # disables assertions for better performance
 
     def __str__(self):
         return self.name
@@ -288,6 +288,20 @@ class Relu(Layer):
     def dF(self, dY):
         return np.where(self.cond, dY, 0)
 
+class Elu(Layer):
+    # ELU: x if x >= 0 else alpha * (exp(x) - 1); paper: https://arxiv.org/abs/1511.07289
+    def __init__(self, alpha=1):
+        super().__init__()
+        self.alpha = nf(alpha) # scale of the negative branch
+
+    def F(self, X):
+        self.cond = X >= 0 # mask of the linear (identity) region, reused by dF
+        self.neg = self.alpha * (np.exp(X) - 1) # negative-branch output, reused by dF
+        return np.where(self.cond, X, self.neg)
+
+    def dF(self, dY):
+        return dY * np.where(self.cond, 1, self.neg + self.alpha) # alpha * exp(X) == neg + alpha
+
 class GeluApprox(Layer):
     # paper: https://arxiv.org/abs/1606.08415
     #  plot: https://www.desmos.com/calculator/ydzgtccsld
@@ -437,15 +451,15 @@ if __name__ == '__main__':
         fn = 'ml/cie_mlp_min.h5',
 
         # multi-residual network parameters
-        res_width = 12,
-        res_depth = 3,
-        res_block = 2, # normally 2 for plain resnet
-        res_multi = 4, # normally 1 for plain resnet
+        res_width = 49,
+        res_depth = 1,
+        res_block = 4, # normally 2 for plain resnet
+        res_multi = 1, # normally 1 for plain resnet
 
-        # style of resnet
+        # style of resnet (order of layers, which layers, etc.)
         # only one is implemented so far
         parallel_style = 'batchless',
-        activation = 'relu',
+        activation = 'gelu',
 
         optim = 'adam',
         nesterov = False, # only used with SGD or Adam
@@ -453,17 +467,21 @@ if __name__ == '__main__':
 
         # learning parameters: SGD with restarts (kinda)
         LR = 1e-2,
-        epochs = 6,
-        LR_halve_every = 2,
-        restarts = 3,
-        LR_restart_advance = 3,
+        epochs = 24,
+        LR_halve_every = 16,
+        restarts = 2,
+        LR_restart_advance = 16,
 
         # misc
         batch_size = 64,
         init = 'he_normal',
         loss = 'mse',
+        restart_optim = False, # restarts also reset internal state of optimizer
+        unsafe = False, # aka gotta go fast mode
     )
 
+    config.pprint()
+
     # toy CIE-2000 data
     from ml.cie_mlp_data import rgbcompare, input_samples, output_samples, x_scale, y_scale
 
@@ -485,7 +503,7 @@ if __name__ == '__main__':
     y = x
     last_size = input_samples
 
-    activations = dict(sigmoid=Sigmoid, tanh=Tanh, relu=Relu, gelu=GeluApprox)
+    activations = dict(sigmoid=Sigmoid, tanh=Tanh, relu=Relu, elu=Elu, gelu=GeluApprox)
     activation = activations[config.activation]
 
     for blah in range(config.res_depth):
@@ -513,7 +531,7 @@ if __name__ == '__main__':
     if last_size != output_samples:
         y = y.feed(Dense(output_samples))
 
-    model = Model(x, y, unsafe=False)
+    model = Model(x, y, unsafe=config.unsafe)
 
     node_names = ' '.join([str(node) for node in model.ordered_nodes])
     log('{} nodes'.format(len(model.ordered_nodes)), node_names)
@@ -522,6 +540,12 @@ if __name__ == '__main__':
     training = config.epochs > 0 and config.restarts >= 0
 
     if not training:
+        assert config.res_width == 12
+        assert config.res_depth == 3
+        assert config.res_block == 2
+        assert config.res_multi == 4
+        assert config.activation == 'relu'
+        assert config.parallel_style == 'batchless'
         model.load_weights(config.fn)
 
     if config.optim == 'adam':
@@ -569,7 +593,8 @@ if __name__ == '__main__':
         if i > 0:
             log("restarting", i)
             LR *= LRprod**config.LR_restart_advance
-            optim.reset()
+            if config.restart_optim:
+                optim.reset()
 
         assert inputs.shape[0] % config.batch_size == 0, \
                "inputs is not evenly divisible by batch_size" # TODO: lift this restriction