diff --git a/optim_nn.py b/optim_nn.py
index 46dd5b0..fcc695b 100755
--- a/optim_nn.py
+++ b/optim_nn.py
@@ -764,7 +764,7 @@ def run(program, args=None):
 
         # style of resnet (order of layers, which layers, etc.)
         parallel_style = 'onelesssum',
-        activation = 'lecun',
+        activation = 'selu',
 
         optim = 'adam', # note: most features only implemented for Adam
         optim_decay1 = 24, # first momentum given in epochs (optional)
@@ -774,20 +774,20 @@ def run(program, args=None):
 
         # learning parameters
         learner = 'sgdr',
-        learn = 1e-2,
+        learn = 0.00125,
         epochs = 24,
         learn_halve_every = 16, # only used with anneal/dumb
-        restarts = 5,
+        restarts = 4,
         restart_decay = 0.25, # only used with SGDR
         expando = lambda i: 24 * i,
 
         # misc
-        init = 'glorot_uniform',
+        init = 'gaussian_unit',
         loss = 'mse',
         mloss = 'mse',
         ritual = 'default',
         restart_optim = False, # restarts also reset internal state of optimizer
-        warmup = True, # train a couple epochs on gaussian noise and reset
+        warmup = False, # train a couple epochs on gaussian noise and reset
 
         # logging/output
         log10_loss = True, # personally, i'm sick of looking linear loss values!
@@ -811,6 +811,8 @@ def run(program, args=None):
               'init', 'loss', 'mloss', 'ritual']:
         config[k] = config[k].lower()
 
+    config.learn *= np.sqrt(config.batch_size)
+
     config.pprint()
 
     # Toy Data {{{2
diff --git a/optim_nn_mnist.py b/optim_nn_mnist.py
index 900ef15..8089810 100755
--- a/optim_nn_mnist.py
+++ b/optim_nn_mnist.py
@@ -5,27 +5,27 @@ from optim_nn_core import _f
 
 #np.random.seed(42069)
 
-use_emnist = False
+use_emnist = True
 measure_every_epoch = True
 
 if use_emnist:
 
-    lr = 0.01
+    lr = 0.0005
     epochs = 48
     starts = 2
-    bs = 200
+    bs = 400
 
     learner_class = SGDR
     restart_decay = 0.5
 
-    n_dense = 0
-    n_denses = 2
+    n_dense = 2
+    n_denses = 0
     new_dims = (28, 28)
     activation = GeluApprox
 
-    reg = None
-    final_reg = None
-    dropout = None
+    reg = L1L2(3.2e-5, 3.2e-4)
+    final_reg = L1L2(3.2e-5, 1e-3)
+    dropout = 0.05
     actreg_lamb = None
 
     load_fn = None
@@ -37,7 +37,7 @@ if use_emnist:
     mnist_classes = 47
 
 else:
-    lr = 0.01
+    lr = 0.0005
     epochs = 60
     starts = 3
     bs = 500
@@ -129,6 +129,8 @@ y = y.feed(Softmax())
 
 model = Model(x, y, unsafe=True)
 
+lr *= np.sqrt(bs)
+
 optim = Adam()
 if learner_class == SGDR:
     learner = learner_class(optim, epochs=epochs//starts, rate=lr,
@@ -176,7 +178,6 @@ def measure_error(quiet=False):
 
         return loss, mloss, confid
 
-    #if not quiet:
     loss, mloss, confid = print_error("train", inputs, outputs)
     train_losses.append(loss)
    train_mlosses.append(mloss)
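
A note on the learning-rate changes above: both files now store a smaller base rate and multiply it by the square root of the batch size at setup (`config.learn *= np.sqrt(config.batch_size)` and `lr *= np.sqrt(bs)`) instead of hard-coding a flat rate. A minimal sketch of that heuristic, assuming the usual square-root scaling rule; `scaled_lr` is an illustrative name, not a function in this repo:

import numpy as np

def scaled_lr(base_lr, batch_size):
    # Gradient noise shrinks roughly like 1/sqrt(batch_size), so scaling
    # the base rate by sqrt(batch_size) keeps the effective step size
    # comparable as the batch size changes.
    return base_lr * np.sqrt(batch_size)

# The new EMNIST settings land back on the old flat rate, now at bs=400:
print(scaled_lr(0.0005, 400))  # -> 0.01, matching the previous lr = 0.01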