diff --git a/optim_nn.py b/optim_nn.py
index 35e0e87..08e4875 100644
--- a/optim_nn.py
+++ b/optim_nn.py
@@ -1,5 +1,8 @@
 #!/usr/bin/env python3
 
+# external packages required for full functionality:
+# numpy scipy h5py sklearn dotmap
+
 import numpy as np
 # ugly shorthand:
 nf = np.float32
@@ -50,6 +53,13 @@ class SquaredHalved(Loss):
     def df(self, r):
         return r
 
+class Absolute(Loss):
+    def f(self, r):
+        return np.abs(r)
+
+    def df(self, r):
+        return np.sign(r)
+
 class SomethingElse(Loss):
     # generalizes Absolute and SquaredHalved (|dx| = 1)
     # plot: https://www.desmos.com/calculator/fagjg9vuz7
@@ -650,6 +660,14 @@ class NoisyRitual(Ritual):
         self.gradient_noise = nf(gradient_noise)
         super().__init__(learner, loss, mloss)
 
+    def learn(self, inputs, outputs):
+        # this is pretty crude
+        s = self.input_noise
+        noisy_inputs = inputs + np.random.normal(0, s, size=inputs.shape)
+        s = self.output_noise
+        noisy_outputs = outputs + np.random.normal(0, s, size=outputs.shape)
+        return super().learn(noisy_inputs, noisy_outputs)
+
     def update(self):
         # gradient noise paper: https://arxiv.org/abs/1511.06807
         if self.gradient_noise > 0:
@@ -867,6 +885,74 @@ def multiresnet(x, width, depth, block=2, multi=1,
 inits = dict(he_normal=init_he_normal, he_uniform=init_he_uniform)
 activations = dict(sigmoid=Sigmoid, tanh=Tanh, relu=Relu, elu=Elu, gelu=GeluApprox)
 
+def normalize_data(data, mean=None, std=None):
+    # in-place
+    if mean is None or std is None:
+        mean = np.mean(data, axis=0)
+        std = np.std(data, axis=0)
+        # TODO: construct function call string for copy-paste convenience
+        print('mean:', mean)
+        print('std: ', std)
+        import sys
+        sys.exit(1)
+    data -= mean
+    data /= std
+
+def toy_data(train_samples, valid_samples, problem=2):
+    total_samples = train_samples + valid_samples
+
+    if problem == 1:
+        from sklearn.datasets import make_friedman1
+        inputs, outputs = make_friedman1(total_samples)
+        outputs = np.expand_dims(outputs, -1)
+
+        normalize_data(inputs,
+                       0.5,
+                       1/np.sqrt(12))
+
+        normalize_data(outputs,
+                       14.4,
+                       4.9)
+
+    elif problem == 2:
+        from sklearn.datasets import make_friedman2
+        inputs, outputs = make_friedman2(total_samples)
+        outputs = np.expand_dims(outputs, -1)
+
+        normalize_data(inputs,
+                       [5.00e+01, 9.45e+02, 5.01e-01, 5.98e+00],
+                       [2.89e+01, 4.72e+02, 2.89e-01, 2.87e+00])
+
+        normalize_data(outputs,
+                       [482],
+                       [380])
+
+    elif problem == 3:
+        from sklearn.datasets import make_friedman3
+        inputs, outputs = make_friedman3(total_samples)
+        outputs = np.expand_dims(outputs, -1)
+
+        normalize_data(inputs,
+                       [4.98e+01, 9.45e+02, 4.99e-01, 6.02e+00],
+                       [2.88e+01, 4.73e+02, 2.90e-01, 2.87e+00])
+
+        normalize_data(outputs,
+                       [1.32327931],
+                       [0.31776295])
+
+    else:
+        raise Exception("unknown toy data set", problem)
+
+    # split off a validation set
+    indices = np.arange(inputs.shape[0])
+    np.random.shuffle(indices)
+    valid_inputs = inputs[indices][-valid_samples:]
+    valid_outputs = outputs[indices][-valid_samples:]
+    inputs = inputs[indices][:-valid_samples]
+    outputs = outputs[indices][:-valid_samples]
+
+    return (inputs, outputs), (valid_inputs, valid_outputs)
+
 def run(program, args=[]):
     import sys
     lament = lambda *args, **kwargs: print(*args, file=sys.stderr, **kwargs)
@@ -894,54 +980,61 @@ def run(program, args=[]):
         optim = 'adam',
         nesterov = False, # only used with SGD or Adam
         momentum = 0.50, # only used with SGD
+        batch_size = 64,
 
         # learning parameters
         learner = 'sgdr',
         learn = 1e-2,
         learn_halve_every = 16, # unused with SGDR
         learn_restart_advance = 16, # unused with SGDR
-        epochs = 12,
+        epochs = 24,
         restarts = 2,
-        restart_decay = 1, # only used with SGDR
+        restart_decay = 0.25, # only used with SGDR
         expando = lambda i: i + 1,
 
         # misc
-        batch_size = 64,
         init = 'he_normal',
-        loss = 'msee',
+        loss = 'mse',
         mloss = 'mse',
-        restart_optim = False, # restarts also reset internal state of optimizer
-        unsafe = True, # aka gotta go fast mode
-        train_compare = 0.0000508,
-        valid_compare = 0.0000678,
-
         ritual = 'default',
+        restart_optim = False, # restarts also reset internal state of optimizer
+
+        problem = 3,
+        # best results for ~10,000 parameters
+        # (keep these paired; update both at the same time!)
+        train_compare = 1.854613e-05,
+        valid_compare = 1.623881e-05,
+
+        unsafe = True, # aka gotta go fast mode
     )
 
-    for k in ['parallel_style', 'optim', 'learner', 'ritual']:
+    for k in ['parallel_style', 'activation', 'optim', 'learner', 'init', 'loss', 'mloss', 'ritual']:
         config[k] = config[k].lower()
 
-    #config.pprint()
+    config.pprint()
 
-    # toy CIE-2000 data
-    from ml.cie_mlp_data import rgbcompare, input_samples, output_samples, \
-        inputs, outputs, valid_inputs, valid_outputs, \
-        x_scale, y_scale
+    # toy data
+    # (our model is probably complete overkill for this, so TODO: better data)
+
+    (inputs, outputs), (valid_inputs, valid_outputs) = \
+        toy_data(2**14, 2**11, problem=config.problem)
+    input_features = inputs.shape[-1]
+    output_features = outputs.shape[-1]
 
     # Our Test Model
 
     init = inits[config.init]
     activation = activations[config.activation]
 
-    x = Input(shape=(input_samples,))
+    x = Input(shape=(input_features,))
     y = x
     y = multiresnet(y,
                     config.res_width, config.res_depth,
                     config.res_block, config.res_multi,
                     activation=activation, init=init,
                     style=config.parallel_style)
-    if y.output_shape[0] != output_samples:
-        y = y.feed(Dense(output_samples, init))
+    if y.output_shape[0] != output_features:
+        y = y.feed(Dense(output_features, init))
 
     model = Model(x, y, unsafe=config.unsafe)
 
@@ -991,7 +1084,7 @@ def run(program, args=[]):
                        restart_decay=config.restart_decay, restarts=config.restarts,
                        callback=rscb, expando=expando)
         # final learning rate isn't of interest here; it's gonna be close to 0.
-        log('total epochs:', learner.epochs)
+        log('total epochs', learner.epochs)
     elif config.learner == 'anneal':
         learner = AnnealingLearner(optim, epochs=config.epochs, rate=config.learn,
                                    halve_every=config.learn_halve_every)
@@ -1016,6 +1109,8 @@ def run(program, args=[]):
             return Squared()
         elif maybe_name == 'mshe': # mushy
             return SquaredHalved()
+        elif maybe_name == 'mae':
+            return Absolute()
         elif maybe_name == 'msee':
             return SomethingElse()
         raise Exception('unknown objective', maybe_name)
@@ -1029,7 +1124,8 @@ def run(program, args=[]):
         ritual = StochMRitual(learner=learner, loss=loss, mloss=mloss)
     elif config.ritual == 'noisy':
         ritual = NoisyRitual(learner=learner, loss=loss, mloss=mloss,
-                             gradient_noise=0.01)
+                             input_noise=1e-1, output_noise=1e-2,
+                             gradient_noise=2e-7)
     else:
         raise Exception('unknown ritual', config.ritual)
 
@@ -1044,16 +1140,18 @@ def run(program, args=[]):
         predicted = model.forward(inputs)
         residual = predicted - outputs
         err = ritual.measure(residual)
-        log(name + " loss", "{:11.7f}".format(err))
+        log(name + " loss", "{:12.6e}".format(err))
+        # TODO: print logarithmic difference as it might be more meaningful
+        #       (fewer results stuck around -99%)
         if comparison:
             log("improvement", "{:+7.2f}%".format((comparison / err - 1) * 100))
         return err
 
     train_err = print_error("train",
-                            inputs / x_scale, outputs / y_scale,
+                            inputs, outputs,
                             config.train_compare)
     valid_err = print_error("valid",
-                            valid_inputs / x_scale, valid_outputs / y_scale,
+                            valid_inputs, valid_outputs,
                             config.valid_compare)
     train_losses.append(train_err)
    valid_losses.append(valid_err)
@@ -1066,8 +1164,8 @@ def run(program, args=[]):
     while learner.next():
         indices = np.arange(inputs.shape[0])
         np.random.shuffle(indices)
-        shuffled_inputs = inputs[indices] / x_scale
-        shuffled_outputs = outputs[indices] / y_scale
+        shuffled_inputs = inputs[indices]
+        shuffled_outputs = outputs[indices]
 
         avg_loss, losses = ritual.train_batched(
             shuffled_inputs, shuffled_outputs,
@@ -1077,7 +1175,7 @@ def run(program, args=[]):
             config.batch_size,
             return_losses=True)
 
         #log("learning rate", "{:10.8f}".format(learner.rate))
         #log("average loss", "{:11.7f}".format(avg_loss))
-        fmt = "epoch {:4.0f}, rate {:10.8f}, loss {:11.7f}"
+        fmt = "epoch {:4.0f}, rate {:10.8f}, loss {:12.6e}"
         log("info", fmt.format(learner.epoch + 1, learner.rate, avg_loss))
 
         measure_error()
@@ -1087,15 +1185,7 @@ def run(program, args=[]):
         model.save_weights(config.fn_save, overwrite=True)
 
     # Evaluation
-
-    # this is just an example/test of how to predict a single output;
-    # it doesn't measure the quality of the network or anything.
-    a = (192, 128, 64)
-    b = (64, 128, 192)
-    X = np.expand_dims(np.hstack((a, b)), 0) / x_scale
-    P = model.forward(X) * y_scale
-    log("truth", rgbcompare(a, b))
-    log("network", np.squeeze(P))
+    # TODO: write this portion again
 
     if config.log_fn is not None:
         np.savez_compressed(config.log_fn,
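
Reviewer note (not part of the patch): the hard-coded means and standard deviations in toy_data are just the per-column statistics of a large friedman sample; normalize_data prints exactly these values and exits when called without explicit mean/std. The standalone sketch below (plain numpy and scikit-learn, independent of optim_nn.py; the sample size 2**18 is arbitrary) reproduces the constants used for problem == 3.

import numpy as np
from sklearn.datasets import make_friedman3

# draw a large friedman3 sample and inspect its per-column statistics;
# they should land near the constants hard-coded for problem == 3 above
inputs, outputs = make_friedman3(2**18)
print('input mean: ', np.mean(inputs, axis=0))
print('input std:  ', np.std(inputs, axis=0))
print('output mean:', np.mean(outputs))
print('output std: ', np.std(outputs))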