diff --git a/optim_nn.py b/optim_nn.py
index 46dd5b0..fcc695b 100755
--- a/optim_nn.py
+++ b/optim_nn.py
@@ -764,7 +764,7 @@ def run(program, args=None):
 
         # style of resnet (order of layers, which layers, etc.)
         parallel_style = 'onelesssum',
-        activation = 'lecun',
+        activation = 'selu',
 
         optim = 'adam', # note: most features only implemented for Adam
         optim_decay1 = 24, # first momentum given in epochs (optional)
@@ -774,20 +774,20 @@ def run(program, args=None):
 
         # learning parameters
         learner = 'sgdr',
-        learn = 1e-2,
+        learn = 0.00125,
         epochs = 24,
         learn_halve_every = 16, # only used with anneal/dumb
-        restarts = 5,
+        restarts = 4,
         restart_decay = 0.25, # only used with SGDR
         expando = lambda i: 24 * i,
 
         # misc
-        init = 'glorot_uniform',
+        init = 'gaussian_unit',
         loss = 'mse',
         mloss = 'mse',
         ritual = 'default',
         restart_optim = False, # restarts also reset internal state of optimizer
-        warmup = True, # train a couple epochs on gaussian noise and reset
+        warmup = False, # train a couple epochs on gaussian noise and reset
 
         # logging/output
         log10_loss = True, # personally, i'm sick of looking linear loss values!
@@ -811,6 +811,8 @@ def run(program, args=None):
               'init', 'loss', 'mloss', 'ritual']:
         config[k] = config[k].lower()
 
+    config.learn *= np.sqrt(config.batch_size)
+
     config.pprint()
 
     # Toy Data {{{2
diff --git a/optim_nn_mnist.py b/optim_nn_mnist.py
index 900ef15..8089810 100755
--- a/optim_nn_mnist.py
+++ b/optim_nn_mnist.py
@@ -5,27 +5,27 @@ from optim_nn_core import _f
 
 #np.random.seed(42069)
 
-use_emnist = False
+use_emnist = True
 measure_every_epoch = True
 
 if use_emnist:
 
-    lr = 0.01
+    lr = 0.0005
     epochs = 48
     starts = 2
-    bs = 200
+    bs = 400
 
     learner_class = SGDR
     restart_decay = 0.5
 
-    n_dense = 0
-    n_denses = 2
+    n_dense = 2
+    n_denses = 0
     new_dims = (28, 28)
     activation = GeluApprox
 
-    reg = None
-    final_reg = None
-    dropout = None
+    reg = L1L2(3.2e-5, 3.2e-4)
+    final_reg = L1L2(3.2e-5, 1e-3)
+    dropout = 0.05
     actreg_lamb = None
 
     load_fn = None
@@ -37,7 +37,7 @@ if use_emnist:
     mnist_classes = 47
 
 else:
-    lr = 0.01
+    lr = 0.0005
     epochs = 60
     starts = 3
     bs = 500
@@ -129,6 +129,8 @@ y = y.feed(Softmax())
 
 model = Model(x, y, unsafe=True)
 
+lr *= np.sqrt(bs)
+
 optim = Adam()
 if learner_class == SGDR:
     learner = learner_class(optim, epochs=epochs//starts, rate=lr,
@@ -176,7 +178,6 @@ def measure_error(quiet=False):
 
         return loss, mloss, confid
 
-    #if not quiet:
     loss, mloss, confid = print_error("train", inputs, outputs)
     train_losses.append(loss)
    train_mlosses.append(mloss)
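
A note on the learning-rate changes above: both files now store a smaller base rate and multiply it by the square root of the batch size at setup (`config.learn *= np.sqrt(config.batch_size)` and `lr *= np.sqrt(bs)`) instead of hard-coding a flat rate. A minimal sketch of that heuristic, assuming the usual square-root scaling rule; `scaled_lr` is an illustrative name, not a function in this repo:

import numpy as np

def scaled_lr(base_lr, batch_size):
    # Gradient noise shrinks roughly like 1/sqrt(batch_size), so scaling
    # the base rate by sqrt(batch_size) keeps the effective step size
    # comparable as the batch size changes.
    return base_lr * np.sqrt(batch_size)

# The new EMNIST settings land back on the old flat rate, now at bs=400:
print(scaled_lr(0.0005, 400))  # -> 0.01, matching the previous lr = 0.01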