From 4ac04baa1ff60274bf475ae82d92d62bb0792f86 Mon Sep 17 00:00:00 2001
From: Connor Olding
Date: Thu, 16 Feb 2017 14:10:33 -0800
Subject: [PATCH] .

---
 optim_nn.py      | 187 +++++++++++++++++++++++++----------------------
 optim_nn_core.py |  46 ++++--------
 2 files changed, 114 insertions(+), 119 deletions(-)

diff --git a/optim_nn.py b/optim_nn.py
index c42b581..3a4d330 100644
--- a/optim_nn.py
+++ b/optim_nn.py
@@ -10,7 +10,9 @@ from optim_nn_core import *
 from optim_nn_core import _check, _f
 
 import sys
-lament = lambda *args, **kwargs: print(*args, file=sys.stderr, **kwargs)
+
+def lament(*args, **kwargs):
+    print(*args, file=sys.stderr, **kwargs)
 
 def log(left, right):
     lament("{:>20}: {}".format(left, right))
@@ -160,7 +162,8 @@ class NoisyRitual(Ritual):
 
 class DumbLearner(AnnealingLearner):
     # this is my own awful contraption. it's not really "SGD with restarts".
-    def __init__(self, optim, epochs=100, rate=None, halve_every=10, restarts=0, restart_advance=20, callback=None):
+    def __init__(self, optim, epochs=100, rate=None, halve_every=10,
+                 restarts=0, restart_advance=20, callback=None):
         self.restart_epochs = int(epochs)
         self.restarts = int(restarts)
         self.restart_advance = float(restart_advance)
@@ -183,58 +186,67 @@ class DumbLearner(AnnealingLearner):
             self.restart_callback(restart)
         return True
 
+def _mr_batchless(y, width, depth, block, multi, activation, style, FC, d):
+    skip = y
+    merger = Sum()
+    skip.feed(merger)
+    z_start = skip.feed(activation())
+    for _ in range(multi):
+        z = z_start
+        for j in range(block):
+            if j > 0:
+                z = z.feed(activation())
+            z = z.feed(FC())
+        z.feed(merger)
+    y = merger
+    return y
+
+def _mr_onelesssum(y, width, depth, block, multi, activation, style, FC, d):
+    # this is my own awful contraption.
+    is_last = d + 1 == depth
+    needs_sum = not is_last or multi > 1
+    skip = y
+    if needs_sum:
+        merger = Sum()
+    if not is_last:
+        skip.feed(merger)
+    z_start = skip.feed(activation())
+    for _ in range(multi):
+        z = z_start
+        for j in range(block):
+            if j > 0:
+                z = z.feed(activation())
+            z = z.feed(FC())
+        if needs_sum:
+            z.feed(merger)
+    if needs_sum:
+        y = merger
+    else:
+        y = z
+    return y
+
+_mr_styles = dict(
+    batchless=_mr_batchless,
+    onelesssum=_mr_onelesssum,
+)
+
 def multiresnet(x, width, depth, block=2, multi=1,
                 activation=Relu, style='batchless',
                 init=init_he_normal):
+    if style not in _mr_styles:
+        raise Exception('unknown resnet style', style)
+
     y = x
     last_size = x.output_shape[0]
 
-    FC = lambda size: Dense(size, init)
-    #FC = lambda size: DenseOneLess(size, init)
-
     for d in range(depth):
         size = width
+        FC = lambda: Dense(size, init)
 
         if last_size != size:
-            y = y.feed(Dense(size, init))
+            y = y.feed(FC())
 
-        if style == 'batchless':
-            skip = y
-            merger = Sum()
-            skip.feed(merger)
-            z_start = skip.feed(activation())
-            for i in range(multi):
-                z = z_start
-                for i in range(block):
-                    if i > 0:
-                        z = z.feed(activation())
-                    z = z.feed(FC(size))
-                z.feed(merger)
-            y = merger
-        elif style == 'onelesssum':
-            # this is my own awful contraption.
-            is_last = d + 1 == depth
-            needs_sum = not is_last or multi > 1
-            skip = y
-            if needs_sum:
-                merger = Sum()
-            if not is_last:
-                skip.feed(merger)
-            z_start = skip.feed(activation())
-            for i in range(multi):
-                z = z_start
-                for i in range(block):
-                    if i > 0:
-                        z = z.feed(activation())
-                    z = z.feed(FC(size))
-                if needs_sum:
-                    z.feed(merger)
-            if needs_sum:
-                y = merger
-            else:
-                y = z
-        else:
-            raise Exception('unknown resnet style', style)
+        y = _mr_styles[style](y, width, depth, block, multi, activation, style, FC, d)
 
         last_size = size
 
@@ -260,17 +272,17 @@ def normalize_data(data, mean=None, std=None):
 def toy_data(train_samples, valid_samples, problem=2):
     total_samples = train_samples + valid_samples
 
+    nod = normalize_data # shorthand to keep a sane indentation
+
     if problem == 0:
-        from ml.cie_mlp_data import rgbcompare, input_samples, output_samples, \
-             inputs, outputs, valid_inputs, valid_outputs, \
-             x_scale, y_scale
+        from ml.cie_mlp_data import inputs, outputs, valid_inputs, valid_outputs
 
         inputs, outputs = _f(inputs), _f(outputs)
         valid_inputs, valid_outputs = _f(valid_inputs), _f(valid_outputs)
 
-        normalize_data(inputs, 127.5, 73.9)
-        normalize_data(outputs, 44.8, 21.7)
-        normalize_data(valid_inputs, 127.5, 73.9)
-        normalize_data(valid_outputs, 44.8, 21.7)
+        nod(inputs, 127.5, 73.9)
+        nod(outputs, 44.8, 21.7)
+        nod(valid_inputs, 127.5, 73.9)
+        nod(valid_outputs, 44.8, 21.7)
 
     elif problem == 1:
         from sklearn.datasets import make_friedman1
@@ -278,8 +290,8 @@ def toy_data(train_samples, valid_samples, problem=2):
         inputs, outputs = _f(inputs), _f(outputs)
         outputs = np.expand_dims(outputs, -1)
 
-        normalize_data(inputs, 0.5, 1/np.sqrt(12))
-        normalize_data(outputs, 14.4, 4.9)
+        nod(inputs, 0.5, 1/np.sqrt(12))
+        nod(outputs, 14.4, 4.9)
 
     elif problem == 2:
         from sklearn.datasets import make_friedman2
@@ -287,11 +299,11 @@ def toy_data(train_samples, valid_samples, problem=2):
         inputs, outputs = _f(inputs), _f(outputs)
         outputs = np.expand_dims(outputs, -1)
 
-        normalize_data(inputs,
-          [5.00e+01, 9.45e+02, 5.01e-01, 5.98e+00],
-          [2.89e+01, 4.72e+02, 2.89e-01, 2.87e+00])
+        nod(inputs,
+            [5.00e+01, 9.45e+02, 5.01e-01, 5.98e+00],
+            [2.89e+01, 4.72e+02, 2.89e-01, 2.87e+00])
 
-        normalize_data(outputs, [482], [380])
+        nod(outputs, [482], [380])
 
     elif problem == 3:
         from sklearn.datasets import make_friedman3
@@ -299,11 +311,11 @@ def toy_data(train_samples, valid_samples, problem=2):
         inputs, outputs = _f(inputs), _f(outputs)
         outputs = np.expand_dims(outputs, -1)
 
-        normalize_data(inputs,
-          [4.98e+01, 9.45e+02, 4.99e-01, 6.02e+00],
-          [2.88e+01, 4.73e+02, 2.90e-01, 2.87e+00])
+        nod(inputs,
+            [4.98e+01, 9.45e+02, 4.99e-01, 6.02e+00],
+            [2.88e+01, 4.73e+02, 2.90e-01, 2.87e+00])
 
-        normalize_data(outputs, [1.32327931], [0.31776295])
+        nod(outputs, [1.32327931], [0.31776295])
 
     else:
         raise Exception("unknown toy data set", problem)
@@ -341,9 +353,6 @@ def model_from_config(config, input_features, output_features, callbacks):
 
     #
 
-    # FIXME: unused variable
-    training = config.epochs > 0 and config.restarts >= 0
-
     if config.fn_load is not None:
         log('loading weights', config.fn_load)
         model.load_weights(config.fn_load)
@@ -390,7 +399,8 @@ def model_from_config(config, input_features, output_features, callbacks):
     elif config.learner == 'dumb':
         learner = DumbLearner(optim, epochs=config.epochs, rate=config.learn,
                               halve_every=config.learn_halve_every,
-                              restarts=config.restarts, restart_advance=config.learn_restart_advance,
+                              restarts=config.restarts,
+                              restart_advance=config.learn_restart_advance,
                               callback=rscb)
         log("final learning rate", "{:10.8f}".format(learner.final_rate))
     elif config.learner == 'sgd':
@@ -430,11 +440,12 @@ def model_from_config(config, input_features, output_features, callbacks):
 
     #
 
-    return model, learner, ritual, (loss, mloss)
+    return model, learner, ritual
 
 # main program {{{1
 
-def run(program, args=[]):
+def run(program, args=None):
+    args = args if args else []
 
     np.random.seed(42069)
 
@@ -469,7 +480,7 @@ def run(program, args=[]):
         epochs = 24,
         restarts = 2,
         restart_decay = 0.25, # only used with SGDR
-        expando = lambda i: i + 1,
+        expando = lambda i: 24 * i,
 
         # misc
         init = 'he_normal',
@@ -484,15 +495,17 @@ def run(program, args=[]):
            # training/validation pairs for each problem (starting from problem 0):
            #(5.08e-05, 6.78e-05),
            (7.577717e-04, 1.255284e-03),
-           (3.032806e-06, 3.963775e-06),
-           (3.676451e-07, 4.495362e-07),
-           (1.854613e-05, 1.623881e-05)
+           # 1080 epochs on these...
+           (1.790511e-07, 2.785208e-07),
+           (2.233277e-08, 3.580281e-08),
+           (5.266719e-07, 5.832677e-06), # overfitting? bad valid set?
         ),
 
         unsafe = True, # aka gotta go fast mode
     )
 
-    for k in ['parallel_style', 'activation', 'optim', 'learner', 'init', 'loss', 'mloss', 'ritual']:
+    for k in ['parallel_style', 'activation', 'optim', 'learner',
+              'init', 'loss', 'mloss', 'ritual']:
         config[k] = config[k].lower()
     config.pprint()
@@ -507,20 +520,16 @@ def run(program, args=[]):
 
     callbacks = Dummy()
 
-    model, learner, ritual, (loss, mloss) = \
+    model, learner, ritual = \
         model_from_config(config, input_features, output_features, callbacks)
 
     # Model Information
 
-    if 0:
-        node_names = ' '.join([str(node) for node in model.ordered_nodes])
-        log('{} nodes'.format(len(model.ordered_nodes)), node_names)
-    else:
-        for node in model.ordered_nodes:
-            children = [str(n) for n in node.children]
-            if len(children) > 0:
-                sep = '->'
-                print(str(node)+sep+('\n'+str(node)+sep).join(children))
+    for node in model.ordered_nodes:
+        children = [str(n) for n in node.children]
+        if children:
+            sep = '->'
+            print(str(node)+sep+('\n'+str(node)+sep).join(children))
 
     log('parameters', model.param_count)
 
     # Training {{{2
@@ -534,10 +543,8 @@ def run(program, args=[]):
         predicted = model.forward(inputs)
         err = ritual.measure(predicted, outputs)
         log(name + " loss", "{:12.6e}".format(err))
-        # TODO: print logarithmic difference as it might be more meaningful
-        # (fewer results stuck around -99%)
         if comparison:
-            log("improvement", "{:+7.2f}%".format((comparison / err - 1) * 100))
+            log("improvement", "10**({:+7.4f}) times".format(np.log10(comparison / err)))
         return err
 
     train_err = print_error("train",
@@ -551,10 +558,13 @@ def run(program, args=[]):
 
     callbacks.restart = measure_error
 
-    measure_error()
+    training = config.epochs > 0 and config.restarts >= 0
+
+    if training:
+        measure_error()
 
     ritual.prepare(model)
 
-    while learner.next():
+    while training and learner.next():
         indices = np.arange(inputs.shape[0])
         np.random.shuffle(indices)
         shuffled_inputs = inputs[indices]
@@ -573,11 +583,11 @@ def run(program, args=[]):
 
     measure_error()
 
-    if config.fn_save is not None:
+    if training and config.fn_save is not None:
         log('saving weights', config.fn_save)
         model.save_weights(config.fn_save, overwrite=True)
 
-    if config.log_fn is not None:
+    if training and config.log_fn is not None:
         log('saving losses', config.log_fn)
         np.savez_compressed(config.log_fn,
                             batch_losses=np.array(batch_losses, dtype=_f),
@@ -592,5 +602,4 @@ def run(program, args=[]):
 # run main program {{{1
 
 if __name__ == '__main__':
-    import sys
     sys.exit(run(sys.argv[0], sys.argv[1:]))
diff --git a/optim_nn_core.py b/optim_nn_core.py
index 266241b..3beacc6 100644
--- a/optim_nn_core.py
+++ b/optim_nn_core.py
@@ -48,19 +48,20 @@ class CategoricalCrossentropy(Loss):
         # TODO: assert dimensionality and p > 0 (if not self.unsafe?)
         p = np.clip(p, self.eps, 1 - self.eps)
         f = np.sum(-y * np.log(p) - (1 - y) * np.log(1 - p), axis=-1)
-        return np.mean(f, axis=-1)
+        return np.mean(f)
 
     def dF(self, p, y):
         p = np.clip(p, self.eps, 1 - self.eps)
         df = (p - y) / (p * (1 - p))
-        return df / y.shape[-1]
+        return df / len(y)
 
 class ResidualLoss(Loss):
     def F(self, p, y): # mean
         return np.mean(self.f(p - y))
 
     def dF(self, p, y): # dmean
-        return self.df(p - y) / y.shape[-1]
+        ret = self.df(p - y) / len(y)
+        return ret
 
 class Squared(ResidualLoss):
     def f(self, r):
@@ -80,7 +81,7 @@ class Absolute(ResidualLoss):
 
 class Optimizer:
     def __init__(self, alpha=0.1):
-        self.alpha = _f(alpha)
+        self.alpha = _f(alpha) # learning rate
         self.reset()
 
     def reset(self):
@@ -97,12 +98,11 @@ class Optimizer:
 
 class Momentum(Optimizer):
     def __init__(self, alpha=0.01, lamb=0, mu=0.9, nesterov=False):
-        self.alpha = _f(alpha) # learning rate
         self.lamb = _f(lamb) # weight decay
         self.mu = _f(mu) # momentum
         self.nesterov = bool(nesterov)
 
-        self.reset()
+        super().__init__(alpha)
 
     def reset(self):
         self.dWprev = None
@@ -116,8 +116,7 @@ class Momentum(Optimizer):
             self.dWprev[:] = V
         if self.nesterov: # TODO: is this correct? looks weird
            return self.mu * V - self.alpha * (dW + W * self.lamb)
-        else:
-            return V
+        return V
 
 class RMSprop(Optimizer):
     # RMSprop generalizes* Adagrad, etc.
@@ -127,7 +126,6 @@ class RMSprop(Optimizer):
     # RMSprop.mu == 1
 
     def __init__(self, alpha=0.0001, mu=0.99, eps=1e-8):
-        self.alpha = _f(alpha) # learning rate
         self.mu = _f(mu) # decay term
         self.eps = _f(eps)
 
@@ -138,7 +136,7 @@ class RMSprop(Optimizer):
         # an input decays to 1/e its original amplitude over 99.5 epochs.
         # (this is from DSP, so how relevant it is in SGD is debatable)
 
-        self.reset()
+        super().__init__(alpha)
 
     def reset(self):
         self.g = None
@@ -168,14 +166,13 @@ class Adam(Optimizer):
     # Adam.b2_t == 0
 
     def __init__(self, alpha=0.001, b1=0.9, b2=0.999, b1_t=0.9, b2_t=0.999, eps=1e-8):
-        self.alpha = _f(alpha) # learning rate
         self.b1 = _f(b1) # decay term
         self.b2 = _f(b2) # decay term
         self.b1_t_default = _f(b1_t) # decay term power t
         self.b2_t_default = _f(b2_t) # decay term power t
         self.eps = _f(eps)
 
-        self.reset()
+        super().__init__(alpha)
 
     def reset(self):
         self.mt = None
@@ -249,14 +246,7 @@ class Layer:
     def dmulti(self, dB):
         if len(dB) == 1:
             return self.dF(dB[0])
-        else:
-            dX = None
-            for dY in dB:
-                if dX is None:
-                    dX = self.dF(dY)
-                else:
-                    dX += self.dF(dY)
-            return dX
+        return sum((self.dF(dY) for dY in dB))
 
     # general utility methods:
 
@@ -267,10 +257,7 @@ class Layer:
         if shape is None:
             return False
         self.input_shape = shape
-        if np.all(self.input_shape == parent.output_shape):
-            return True
-        else:
-            return False
+        return np.all(self.input_shape == parent.output_shape)
 
     def feed(self, child):
         if not child.compatible(self):
@@ -288,7 +275,7 @@ class Layer:
 
     def forward(self, lut):
         if not self.unsafe:
-            assert len(self.parents) > 0, self
+            assert self.parents, self
         B = []
         for parent in self.parents:
             # TODO: skip over irrelevant nodes (if any)
@@ -303,7 +290,7 @@ class Layer:
 
     def backward(self, lut):
         if not self.unsafe:
-            assert len(self.children) > 0, self
+            assert self.children, self
         dB = []
         for child in self.children:
             # TODO: skip over irrelevant nodes (if any)
@@ -643,8 +630,7 @@ class Ritual: # i'm just making up names at this point
         avg_loss = cumsum_loss / _f(batch_count)
         if return_losses:
             return avg_loss, losses
-        else:
-            return avg_loss
+        return avg_loss
 
 # Learners {{{1
 
@@ -734,12 +720,12 @@ class SGDR(Learner):
         self.restarts = int(restarts)
         self.restart_callback = callback
         # TODO: rename expando to something not insane
-        self.expando = expando if expando is not None else lambda i: 1
+        self.expando = expando if expando is not None else lambda i: i
 
         self.splits = []
         epochs = 0
         for i in range(0, self.restarts + 1):
-            split = epochs + int(self.restart_epochs * self.expando(i))
+            split = epochs + self.restart_epochs + int(self.expando(i))
             self.splits.append(split)
             epochs = split
         super().__init__(optim, epochs, rate)