#!/usr/bin/env python3

# external packages required for full functionality:
# numpy scipy h5py sklearn dotmap

import numpy as np

# ugly shorthand:
nf = np.float32
nfa = lambda x: np.array(x, dtype=nf)
ni = np.int
nia = lambda x: np.array(x, dtype=ni)

# just for speed, not strictly essential:
from scipy.special import expit as sigmoid

# used for numbering layers like Keras:
from collections import defaultdict
_layer_counters = defaultdict(lambda: 0)

# Initializations

# note: these are currently only implemented for 2D shapes.

def init_he_normal(size, ins, outs):
    s = np.sqrt(2 / ins)
    return np.random.normal(0, s, size=size)

def init_he_uniform(size, ins, outs):
    s = np.sqrt(6 / ins)
    return np.random.uniform(-s, s, size=size)

# Loss functions

class Loss:
    def mean(self, r):
        return np.average(self.f(r))

    def dmean(self, r):
        d = self.df(r)
        return d / len(d)

class Squared(Loss):
    def f(self, r):
        return np.square(r)

    def df(self, r):
        return 2 * r

class SquaredHalved(Loss):
    def f(self, r):
        return np.square(r) / 2

    def df(self, r):
        return r

class Absolute(Loss):
    def f(self, r):
        return np.abs(r)

    def df(self, r):
        return np.sign(r)

class SomethingElse(Loss):
    # generalizes Absolute and SquaredHalved (|dx| = 1)
    # plot: https://www.desmos.com/calculator/fagjg9vuz7
    def __init__(self, a=4/3):
        assert 1 <= a <= 2, "parameter out of range"
        self.a = nf(a / 2)
        self.b = nf(2 / a)
        self.c = nf(2 / a - 1)

    def f(self, r):
        return self.a * np.abs(r)**self.b

    def df(self, r):
        return np.sign(r) * np.abs(r)**self.c

# Optimizers

class Optimizer:
    def __init__(self, alpha=0.1):
        self.alpha = nf(alpha)
        self.reset()

    def reset(self):
        pass

    def compute(self, dW, W):
        return -self.alpha * dW

    def update(self, dW, W):
        W += self.compute(dW, W)

# the following optimizers are blatantly lifted from tiny-dnn:
# https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/optimizers/optimizer.h

class Momentum(Optimizer):
    def __init__(self, alpha=0.01, lamb=0, mu=0.9, nesterov=False):
        self.alpha = np.asfarray(alpha) # learning rate
        self.lamb = np.asfarray(lamb)   # weight decay
        self.mu = np.asfarray(mu)       # momentum
        self.nesterov = bool(nesterov)

        self.reset()

    def reset(self):
        self.dWprev = None

    def compute(self, dW, W):
        if self.dWprev is None:
            #self.dWprev = np.zeros_like(dW)
            self.dWprev = np.copy(dW)

        V = self.mu * self.dWprev - self.alpha * (dW + W * self.lamb)
        self.dWprev[:] = V
        if self.nesterov: # TODO: is this correct? looks weird
            return self.mu * V - self.alpha * (dW + W * self.lamb)
        else:
            return V

class Adam(Optimizer):
    def __init__(self, alpha=0.001, b1=0.9, b2=0.999, b1_t=0.9, b2_t=0.999,
                 eps=1e-8):
        self.alpha = nf(alpha)       # learning rate
        self.b1 = nf(b1)             # decay term
        self.b2 = nf(b2)             # decay term
        self.b1_t_default = nf(b1_t) # decay term power t
        self.b2_t_default = nf(b2_t) # decay term power t
        self.eps = nf(eps)

        self.reset()

    def reset(self):
        self.mt = None
        self.vt = None
        self.b1_t = self.b1_t_default
        self.b2_t = self.b2_t_default

    def compute(self, dW, W):
        if self.mt is None:
            self.mt = np.zeros_like(W)
        if self.vt is None:
            self.vt = np.zeros_like(W)

        # decay
        self.b1_t *= self.b1
        self.b2_t *= self.b2

        self.mt[:] = self.b1 * self.mt + (1 - self.b1) * dW
        self.vt[:] = self.b2 * self.vt + (1 - self.b2) * dW * dW

        return -self.alpha * (self.mt / (1 - self.b1_t)) \
               / np.sqrt((self.vt / (1 - self.b2_t)) + self.eps)

# Abstract Layers

class Layer:
    def __init__(self):
        self.parents = []
        self.children = []
        self.input_shape = None
        self.output_shape = None
        kind = self.__class__.__name__
        global _layer_counters
        _layer_counters[kind] += 1
        self.name = "{}_{}".format(kind, _layer_counters[kind])
        self.size = None # total weight count (if any)
        self.unsafe = False # disables assertions for better performance

    def __str__(self):
        return self.name

    # methods we might want to override:

    def F(self, X):
        raise NotImplementedError("unimplemented", self)

    def dF(self, dY):
        raise NotImplementedError("unimplemented", self)

    def do_feed(self, child):
        self.children.append(child)

    def be_fed(self, parent):
        self.parents.append(parent)

    def make_shape(self, shape):
        if not self.unsafe:
            assert shape is not None
        if self.output_shape is None:
            self.output_shape = shape
        return shape

    # TODO: rename this multi and B crap to something actually relevant.

    def multi(self, B):
        if not self.unsafe:
            assert len(B) == 1, self
        return self.F(B[0])

    def dmulti(self, dB):
        if len(dB) == 1:
            return self.dF(dB[0])
        else:
            dX = None
            for dY in dB:
                if dX is None:
                    dX = self.dF(dY)
                else:
                    dX += self.dF(dY)
            return dX

    # general utility methods:

    def compatible(self, parent):
        if self.input_shape is None:
            # inherit shape from output
            shape = self.make_shape(parent.output_shape)
            if shape is None:
                return False
            self.input_shape = shape
        if np.all(self.input_shape == parent.output_shape):
            return True
        else:
            return False

    def feed(self, child):
        if not child.compatible(self):
            fmt = "{} is incompatible with {}: shape mismatch: {} vs. {}"
            raise Exception(fmt.format(self, child,
                                       self.output_shape, child.input_shape))
        self.do_feed(child)
        child.be_fed(self)
        return child

    def validate_input(self, X):
        assert X.shape[1:] == self.input_shape, \
            (str(self), X.shape[1:], self.input_shape)

    def validate_output(self, Y):
        assert Y.shape[1:] == self.output_shape, \
            (str(self), Y.shape[1:], self.output_shape)

    def forward(self, lut):
        if not self.unsafe:
            assert len(self.parents) > 0, self
        B = []
        for parent in self.parents:
            # TODO: skip over irrelevant nodes (if any)
            X = lut[parent]
            if not self.unsafe:
                self.validate_input(X)
            B.append(X)
        Y = self.multi(B)
        if not self.unsafe:
            self.validate_output(Y)
        return Y

    def backward(self, lut):
        if not self.unsafe:
            assert len(self.children) > 0, self
        dB = []
        for child in self.children:
            # TODO: skip over irrelevant nodes (if any)
            dY = lut[child]
            if not self.unsafe:
                self.validate_output(dY)
            dB.append(dY)
        dX = self.dmulti(dB)
        if not self.unsafe:
            self.validate_input(dX)
        return dX

# Final Layers

class Sum(Layer):
    def multi(self, B):
        return np.sum(B, axis=0)

    def dmulti(self, dB):
        #assert len(dB) == 1, "unimplemented"
        return dB[0] # TODO: does this always work?

class Input(Layer):
    def __init__(self, shape):
        assert shape is not None
        super().__init__()
        self.shape = tuple(shape)
        self.input_shape = self.shape
        self.output_shape = self.shape

    def F(self, X):
        return X

    def dF(self, dY):
        #self.dY = dY
        return np.zeros_like(dY)

class Affine(Layer):
    def __init__(self, a=1, b=0):
        super().__init__()
        self.a = nf(a)
        self.b = nf(b)

    def F(self, X):
        return self.a * X + self.b

    def dF(self, dY):
        return dY * self.a

class Sigmoid(Layer): # aka Logistic
    def F(self, X):
        self.sig = sigmoid(X)
        return self.sig

    def dF(self, dY):
        return dY * self.sig * (1 - self.sig)

class Tanh(Layer):
    def F(self, X):
        self.sig = np.tanh(X)
        return self.sig

    def dF(self, dY):
        return dY * (1 - self.sig * self.sig)

class Relu(Layer):
    def F(self, X):
        self.cond = X >= 0
        return np.where(self.cond, X, 0)

    def dF(self, dY):
        return np.where(self.cond, dY, 0)

class Elu(Layer):
    # paper: https://arxiv.org/abs/1511.07289

    def __init__(self, alpha=1):
        super().__init__()
        self.alpha = nf(alpha) # note: currently unused; F and dF assume alpha == 1.

    def F(self, X):
        self.cond = X >= 0
        self.neg = np.exp(X) - 1
        return np.where(self.cond, X, self.neg)

    def dF(self, dY):
        return dY * np.where(self.cond, 1, self.neg + 1)

class GeluApprox(Layer):
    # paper: https://arxiv.org/abs/1606.08415
    # plot: https://www.desmos.com/calculator/ydzgtccsld
    def F(self, X):
        self.a = 1.704 * X
        self.sig = sigmoid(self.a)
        return X * self.sig

    def dF(self, dY):
        return dY * self.sig * (1 + self.a * (1 - self.sig))

class Dense(Layer):
    def __init__(self, dim, init=init_he_uniform):
        super().__init__()
        self.dim = ni(dim)
        self.output_shape = (dim,)
        self.weight_init = init
        self.size = None

    def make_shape(self, shape):
        super().make_shape(shape)
        if len(shape) != 1:
            return False
        self.nW = self.dim * shape[0]
        self.nb = self.dim
        self.size = self.nW + self.nb
        return shape

    def init(self, W, dW):
        ins, outs = self.input_shape[0], self.output_shape[0]

        self.W = W
        self.dW = dW

        self.coeffs = self.W[:self.nW].reshape(ins, outs)
        self.biases = self.W[self.nW:].reshape(1, outs)
        self.dcoeffs = self.dW[:self.nW].reshape(ins, outs)
        self.dbiases = self.dW[self.nW:].reshape(1, outs)

        self.coeffs.flat = self.weight_init(self.nW, ins, outs)
        self.biases.flat = 0

        self.std = np.std(self.W)

    def F(self, X):
        self.X = X
        Y = X.dot(self.coeffs) + self.biases
        return Y

    def dF(self, dY):
        dX = dY.dot(self.coeffs.T)
        self.dcoeffs[:] = self.X.T.dot(dY)
        self.dbiases[:] = dY.sum(0, keepdims=True)
        return dX
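
# note on Dense parameter packing (a descriptive aside, not original to the
# code): a Dense(dim) layer fed from an (ins,)-shaped parent reserves
# size == ins*dim + dim entries in the model's flat W/dW arrays; init() views
# the first ins*dim entries as the (ins, dim) weight matrix and the remaining
# dim entries as the (1, dim) bias row, so optimizers can update the whole
# network through the single flat W array. rough sketch of the resulting
# shapes (hypothetical sizes; assumes the layer has been wired up via feed()
# and Model.make_weights() so input_shape and the W/dW slices exist):
#
#   d = Dense(4)    # output_shape == (4,); fed from a (3,)-shaped layer:
#   # d.nW == 12, d.nb == 4, d.size == 16
#   # d.coeffs.shape == (3, 4), d.biases.shape == (1, 4)
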
class DenseOneLess(Dense):
    def init(self, W, dW):
        super().init(W, dW)
        ins, outs = self.input_shape[0], self.output_shape[0]
        assert ins == outs, (ins, outs)

    def F(self, X):
        np.fill_diagonal(self.coeffs, 0)
        self.X = X
        Y = X.dot(self.coeffs) + self.biases
        return Y

    def dF(self, dY):
        dX = dY.dot(self.coeffs.T)
        self.dcoeffs[:] = self.X.T.dot(dY)
        self.dbiases[:] = dY.sum(0, keepdims=True)
        np.fill_diagonal(self.dcoeffs, 0)
        return dX

class LayerNorm(Layer):
    # paper: https://arxiv.org/abs/1607.06450
    # my implementation may be incorrect.

    def __init__(self, eps=1e-3, axis=-1):
        super().__init__()
        self.eps = nf(eps)
        self.axis = int(axis)

    def F(self, X):
        self.center = X - np.mean(X, axis=self.axis, keepdims=True)
        #self.var = np.var(X, axis=self.axis, keepdims=True) + self.eps
        self.var = np.mean(np.square(self.center), axis=self.axis,
                           keepdims=True) + self.eps
        self.std = np.sqrt(self.var) + self.eps
        Y = self.center / self.std
        return Y

    def dF(self, dY):
        length = self.input_shape[self.axis]

        dstd = dY * (-self.center / self.var)
        dvar = dstd * (0.5 / self.std)
        dcenter2 = dvar * (1 / length)
        dcenter = dY * (1 / self.std)
        dcenter += dcenter2 * (2 * self.center)
        dX = dcenter - dcenter / length

        return dX

# Model

class Model:
    def __init__(self, x, y, unsafe=False):
        assert isinstance(x, Layer), x
        assert isinstance(y, Layer), y
        self.x = x
        self.y = y
        self.ordered_nodes = self.traverse([], self.y)
        self.make_weights()
        for node in self.ordered_nodes:
            node.unsafe = unsafe

    def make_weights(self):
        self.param_count = 0
        for node in self.ordered_nodes:
            if node.size is not None:
                self.param_count += node.size
        self.W = np.zeros(self.param_count, dtype=nf)
        self.dW = np.zeros(self.param_count, dtype=nf)

        offset = 0
        for node in self.ordered_nodes:
            if node.size is not None:
                end = offset + node.size
                node.init(self.W[offset:end], self.dW[offset:end])
                offset += node.size

    def traverse(self, nodes, node):
        if node == self.x:
            return [node]
        for parent in node.parents:
            if parent not in nodes:
                new_nodes = self.traverse(nodes, parent)
                for new_node in new_nodes:
                    if new_node not in nodes:
                        nodes.append(new_node)
        if nodes:
            nodes.append(node)
        return nodes

    def forward(self, X):
        lut = dict()
        input_node = self.ordered_nodes[0]
        output_node = self.ordered_nodes[-1]
        lut[input_node] = input_node.multi(np.expand_dims(X, 0))
        for node in self.ordered_nodes[1:]:
            lut[node] = node.forward(lut)
        return lut[output_node]

    def backward(self, error):
        lut = dict()
        input_node = self.ordered_nodes[0]
        output_node = self.ordered_nodes[-1]
        lut[output_node] = output_node.dmulti(np.expand_dims(error, 0))
        for node in reversed(self.ordered_nodes[:-1]):
            lut[node] = node.backward(lut)
        #return lut[input_node] # meaningless value
        return self.dW

    def load_weights(self, fn):
        # seemingly compatible with keras' Dense layers.
        # ignores any non-Dense layer types.
        # TODO: assert file actually exists
        import h5py
        f = h5py.File(fn)
        weights = {}
        def visitor(name, obj):
            if isinstance(obj, h5py.Dataset):
                weights[name.split('/')[-1]] = nfa(obj[:])
        f.visititems(visitor)
        f.close()

        denses = [node for node in self.ordered_nodes
                  if isinstance(node, Dense)]
        for i in range(len(denses)):
            a, b = i, i + 1
            b_name = "dense_{}".format(b)
            # TODO: write a Dense method instead of assigning directly
            denses[a].coeffs[:] = weights[b_name+'_W']
            denses[a].biases[:] = np.expand_dims(weights[b_name+'_b'], 0)

    def save_weights(self, fn, overwrite=False):
        import h5py
        f = h5py.File(fn, 'w')
        denses = [node for node in self.ordered_nodes
                  if isinstance(node, Dense)]
        for i in range(len(denses)):
            a, b = i, i + 1
            b_name = "dense_{}".format(b)
            # TODO: write a Dense method instead of assigning directly
            grp = f.create_group(b_name)
            data = grp.create_dataset(b_name+'_W', denses[a].coeffs.shape,
                                      dtype=nf)
            data[:] = denses[a].coeffs
            data = grp.create_dataset(b_name+'_b', denses[a].biases.shape,
                                      dtype=nf)
            data[:] = denses[a].biases
        f.close()

class Ritual: # i'm just making up names at this point
    def __init__(self, learner=None, loss=None, mloss=None):
        self.learner = learner if learner is not None else Learner(Optimizer())
        self.loss = loss if loss is not None else Squared()
        self.mloss = mloss if mloss is not None else loss

    def reset(self):
        self.learner.reset(optim=True)

    def measure(self, residual):
        return self.mloss.mean(residual)

    def derive(self, residual):
        return self.loss.dmean(residual)

    def learn(self, inputs, outputs):
        predicted = self.model.forward(inputs)
        residual = predicted - outputs
        self.model.backward(self.derive(residual))
        return residual

    def update(self):
        self.learner.optim.update(self.model.dW, self.model.W)

    def prepare(self, model):
        self.en = 0
        self.bn = 0
        self.model = model

    def train_batched(self, inputs, outputs, batch_size, return_losses=False):
        self.en += 1
        cumsum_loss = 0
        batch_count = inputs.shape[0] // batch_size
        losses = []
        for b in range(batch_count):
            self.bn += 1
            bi = b * batch_size
            batch_inputs  = inputs[ bi:bi+batch_size]
            batch_outputs = outputs[bi:bi+batch_size]

            if self.learner.per_batch:
                self.learner.batch(b / batch_count)

            residual = self.learn(batch_inputs, batch_outputs)
            self.update()

            batch_loss = self.measure(residual)
            if np.isnan(batch_loss):
                raise Exception("nan")
            cumsum_loss += batch_loss
            if return_losses:
                losses.append(batch_loss)

        avg_loss = cumsum_loss / batch_count
        if return_losses:
            return avg_loss, losses
        else:
            return avg_loss

def stochastic_multiply(W, gamma=0.5, allow_negation=True):
    # paper: https://arxiv.org/abs/1606.01981
    assert W.ndim == 1, W.ndim
    assert 0 < gamma < 1, gamma
    size = len(W)
    alpha = np.max(np.abs(W))
    # NOTE: numpy gives [low, high) but the paper advocates [low, high]
    mult = np.random.uniform(gamma, 1/gamma, size=size)
    if allow_negation:
        # TODO: verify this is correct. seems to wreak havoc.
        prob = (W / alpha + 1) / 2
        samples = np.random.random_sample(size=size)
        mult *= np.where(samples < prob, 1, -1)
    np.multiply(W, mult, out=W)

class StochMRitual(Ritual):
    # paper: https://arxiv.org/abs/1606.01981
    # this probably doesn't make sense for regression problems,
    # let alone small models, but here it is anyway!
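    # how this works (a descriptive note, not part of the original comments):
    # before each learn() step, every Dense layer's coefficients are scaled
    # elementwise by factors drawn from [gamma, 1/gamma] (optionally negated
    # with a weight-dependent probability) via stochastic_multiply(); the
    # gradient is computed against those perturbed weights, and the original
    # weights are restored before update() applies the optimizer step.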
    def __init__(self, learner=None, loss=None, mloss=None, gamma=0.5):
        super().__init__(learner, loss, mloss)
        self.gamma = nf(gamma)

    def prepare(self, model):
        self.W = np.copy(model.W)
        super().prepare(model)

    def learn(self, inputs, outputs):
        # an experiment:
        #assert self.learner.rate < 10, self.learner.rate
        #self.gamma = 1 - 1/2**(1 - np.log10(self.learner.rate))

        self.W[:] = self.model.W
        for layer in self.model.ordered_nodes:
            if isinstance(layer, Dense):
                stochastic_multiply(layer.coeffs.ravel(), gamma=self.gamma,
                                    allow_negation=True)
        residual = super().learn(inputs, outputs)
        self.model.W[:] = self.W
        return residual

    def update(self):
        super().update()
        f = 0.5
        for layer in self.model.ordered_nodes:
            if isinstance(layer, Dense):
                np.clip(layer.W, -layer.std * f, layer.std * f, out=layer.W)
                #np.clip(layer.W, -1, 1, out=layer.W)

class NoisyRitual(Ritual):
    def __init__(self, learner=None, loss=None, mloss=None,
                 input_noise=0, output_noise=0, gradient_noise=0):
        self.input_noise = nf(input_noise)   # TODO: implement
        self.output_noise = nf(output_noise) # TODO: implement
        self.gradient_noise = nf(gradient_noise)
        super().__init__(learner, loss, mloss)

    def learn(self, inputs, outputs):
        # this is pretty crude
        s = self.input_noise
        noisy_inputs = inputs + np.random.normal(0, s, size=inputs.shape)
        s = self.output_noise
        noisy_outputs = outputs + np.random.normal(0, s, size=outputs.shape)
        return super().learn(noisy_inputs, noisy_outputs)

    def update(self):
        # gradient noise paper: https://arxiv.org/abs/1511.06807
        if self.gradient_noise > 0:
            size = len(self.model.dW)
            gamma = 0.55
            s = self.gradient_noise / (1 + self.bn) ** gamma
            # experiments:
            #s = np.sqrt(self.learner.rate)
            #s = np.square(self.learner.rate)
            #s = self.learner.rate / self.en
            self.model.dW += np.random.normal(0, s, size=size)
        super().update()

class Learner:
    per_batch = False

    def __init__(self, optim, epochs=100, rate=None):
        assert isinstance(optim, Optimizer)
        self.optim = optim
        self.start_rate = optim.alpha if rate is None else float(rate)
        self.epochs = int(epochs)
        self.reset()

    def reset(self, optim=False):
        self.started = False
        self.epoch = 0
        if optim:
            self.optim.reset()

    @property
    def epoch(self):
        return self._epoch

    @epoch.setter
    def epoch(self, new_epoch):
        self._epoch = int(new_epoch)
        self.rate = self.rate_at(self._epoch)

    @property
    def rate(self):
        return self.optim.alpha

    @rate.setter
    def rate(self, new_rate):
        self.optim.alpha = new_rate

    def rate_at(self, epoch):
        return self.start_rate

    def next(self):
        # prepares the next epoch. returns whether or not to continue training.
        if self.epoch + 1 >= self.epochs:
            return False
        if self.started:
            self.epoch += 1
        else:
            self.started = True
            self.epoch = self.epoch # poke property setter just in case
        return True

    def batch(self, progress): # TODO: rename
        # interpolates rates between epochs.
        # unlike epochs, we do not store batch number as a state.
        # i.e. calling next() will not respect progress.
        assert 0 <= progress <= 1
        self.rate = self.rate_at(self._epoch + progress)

    @property
    def final_rate(self):
        return self.rate_at(self.epochs - 1)

class AnnealingLearner(Learner):
    def __init__(self, optim, epochs=100, rate=None, halve_every=10):
        self.halve_every = float(halve_every)
        self.anneal = 0.5**(1/self.halve_every)
        super().__init__(optim, epochs, rate)

    def rate_at(self, epoch):
        return self.start_rate * self.anneal**epoch

class DumbLearner(AnnealingLearner):
    # this is my own awful contraption. it's not really "SGD with restarts".
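    # a rough reading of the schedule below (illustrative numbers, not from
    # the original): with epochs=100 and restarts=2 the learner runs 300
    # epochs total; within each 100-epoch block the rate anneals as in
    # AnnealingLearner, and each restart additionally advances the anneal by
    # `restart_advance` epochs' worth of decay.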
    def __init__(self, optim, epochs=100, rate=None, halve_every=10,
                 restarts=0, restart_advance=20, callback=None):
        self.restart_epochs = int(epochs)
        self.restarts = int(restarts)
        self.restart_advance = float(restart_advance)
        self.restart_callback = callback
        epochs = self.restart_epochs * (self.restarts + 1)
        super().__init__(optim, epochs, rate, halve_every)

    def rate_at(self, epoch):
        sub_epoch = epoch % self.restart_epochs
        restart = epoch // self.restart_epochs
        return super().rate_at(sub_epoch) * (self.anneal**self.restart_advance)**restart

    def next(self):
        if not super().next():
            return False
        sub_epoch = self.epoch % self.restart_epochs
        restart = self.epoch // self.restart_epochs
        if restart > 0 and sub_epoch == 0:
            if self.restart_callback is not None:
                self.restart_callback(restart)
        return True

def cosmod(x):
    # plot: https://www.desmos.com/calculator/hlgqmyswy2
    return (1 + np.cos((x % 1) * np.pi)) / 2

class SGDR(Learner):
    # Stochastic Gradient Descent with Restarts
    # paper: https://arxiv.org/abs/1608.03983
    # NOTE: this is missing a couple features.

    per_batch = True

    def __init__(self, optim, epochs=100, rate=None,
                 restarts=0, restart_decay=0.5, callback=None,
                 expando=None):
        self.restart_epochs = int(epochs)
        self.decay = float(restart_decay)
        self.restarts = int(restarts)
        self.restart_callback = callback
        # TODO: rename expando to something not insane
        self.expando = expando if expando is not None else lambda i: 1

        self.splits = []
        epochs = 0
        for i in range(0, self.restarts + 1):
            split = epochs + int(self.restart_epochs * self.expando(i))
            self.splits.append(split)
            epochs = split
        super().__init__(optim, epochs, rate)

    def split_num(self, epoch):
        shit = [0] + self.splits # hack
        for i in range(0, len(self.splits)):
            if epoch < self.splits[i]:
                sub_epoch = epoch - shit[i]
                next_restart = self.splits[i] - shit[i]
                return i, sub_epoch, next_restart
        raise Exception('this should never happen.')

    def rate_at(self, epoch):
        restart, sub_epoch, next_restart = self.split_num(epoch)
        x = sub_epoch / next_restart
        return self.start_rate * self.decay**restart * cosmod(x)

    def next(self):
        if not super().next():
            return False
        restart, sub_epoch, next_restart = self.split_num(self.epoch)
        if restart > 0 and sub_epoch == 0:
            if self.restart_callback is not None:
                self.restart_callback(restart)
        return True

def multiresnet(x, width, depth, block=2, multi=1,
                activation=Relu, style='batchless',
                init=init_he_normal):
    y = x
    last_size = x.output_shape[0]

    FC = lambda size: Dense(size, init)
    #FC = lambda size: DenseOneLess(size, init)

    for d in range(depth):
        size = width

        if last_size != size:
            y = y.feed(Dense(size, init))

        if style == 'batchless':
            skip = y
            merger = Sum()
            skip.feed(merger)
            z_start = skip.feed(activation())
            for i in range(multi):
                z = z_start
                for i in range(block):
                    if i > 0:
                        z = z.feed(activation())
                    z = z.feed(FC(size))
                z.feed(merger)
            y = merger
        elif style == 'onelesssum':
            # this is my own awful contraption.
            is_last = d + 1 == depth
            needs_sum = not is_last or multi > 1
            skip = y
            if needs_sum:
                merger = Sum()
            if not is_last:
                skip.feed(merger)
            z_start = skip.feed(activation())
            for i in range(multi):
                z = z_start
                for i in range(block):
                    if i > 0:
                        z = z.feed(activation())
                    z = z.feed(FC(size))
                if needs_sum:
                    z.feed(merger)
            if needs_sum:
                y = merger
            else:
                y = z
        else:
            raise Exception('unknown resnet style', style)

        last_size = size

    return y

inits = dict(he_normal=init_he_normal, he_uniform=init_he_uniform)
activations = dict(sigmoid=Sigmoid, tanh=Tanh, relu=Relu, elu=Elu,
                   gelu=GeluApprox)

def normalize_data(data, mean=None, std=None):
    # in-place
    if mean is None or std is None:
        mean = np.mean(data, axis=0)
        std = np.std(data, axis=0)
        # TODO: construct function call string for copy-paste convenience
        print('mean:', mean)
        print('std: ', std)
        import sys
        sys.exit(1)
    data -= mean
    data /= std

def toy_data(train_samples, valid_samples, problem=2):
    total_samples = train_samples + valid_samples

    if problem == 1:
        from sklearn.datasets import make_friedman1
        inputs, outputs = make_friedman1(total_samples)
        outputs = np.expand_dims(outputs, -1)

        normalize_data(inputs, 0.5, 1/np.sqrt(12))
        normalize_data(outputs, 14.4, 4.9)
    elif problem == 2:
        from sklearn.datasets import make_friedman2
        inputs, outputs = make_friedman2(total_samples)
        outputs = np.expand_dims(outputs, -1)

        normalize_data(inputs,
                       [5.00e+01, 9.45e+02, 5.01e-01, 5.98e+00],
                       [2.89e+01, 4.72e+02, 2.89e-01, 2.87e+00])
        normalize_data(outputs, [482], [380])
    elif problem == 3:
        from sklearn.datasets import make_friedman3
        inputs, outputs = make_friedman3(total_samples)
        outputs = np.expand_dims(outputs, -1)

        normalize_data(inputs,
                       [4.98e+01, 9.45e+02, 4.99e-01, 6.02e+00],
                       [2.88e+01, 4.73e+02, 2.90e-01, 2.87e+00])
        normalize_data(outputs, [1.32327931], [0.31776295])
    else:
        raise Exception("unknown toy data set", problem)

    # split off a validation set
    indices = np.arange(inputs.shape[0])
    np.random.shuffle(indices)
    valid_inputs  = inputs[indices][-valid_samples:]
    valid_outputs = outputs[indices][-valid_samples:]
    inputs  = inputs[indices][:-valid_samples]
    outputs = outputs[indices][:-valid_samples]

    return (inputs, outputs), (valid_inputs, valid_outputs)

def run(program, args=[]):
    import sys
    lament = lambda *args, **kwargs: print(*args, file=sys.stderr, **kwargs)
    def log(left, right):
        lament("{:>20}: {}".format(left, right))

    # Config

    from dotmap import DotMap
    config = DotMap(
        fn_load = None,
        fn_save = 'optim_nn.h5',
        log_fn = 'losses.npz',

        # multi-residual network parameters
        res_width = 28,
        res_depth = 2,
        res_block = 3, # normally 2 for plain resnet
        res_multi = 2, # normally 1 for plain resnet

        # style of resnet (order of layers, which layers, etc.)
        parallel_style = 'onelesssum',
        activation = 'gelu',

        optim = 'adam',
        nesterov = False, # only used with SGD or Adam
        momentum = 0.50,  # only used with SGD

        batch_size = 64,

        # learning parameters
        learner = 'sgdr',
        learn = 1e-2,
        learn_halve_every = 16,     # unused with SGDR
        learn_restart_advance = 16, # unused with SGDR
        epochs = 24,
        restarts = 2,
        restart_decay = 0.25, # only used with SGDR
        expando = lambda i: i + 1,

        # misc
        init = 'he_normal',
        loss = 'mse',
        mloss = 'mse',
        ritual = 'default',
        restart_optim = False, # restarts also reset internal state of optimizer
        problem = 3,

        # best results for ~10,000 parameters
        # (keep these paired; update both at the same time!)
        train_compare = 1.854613e-05,
        valid_compare = 1.623881e-05,

        unsafe = True, # aka gotta go fast mode
    )

    for k in ['parallel_style', 'activation', 'optim', 'learner', 'init',
              'loss', 'mloss', 'ritual']:
        config[k] = config[k].lower()

    config.pprint()

    # toy data
    # (our model is probably complete overkill for this, so TODO: better data)

    (inputs, outputs), (valid_inputs, valid_outputs) = \
        toy_data(2**14, 2**11, problem=config.problem)
    input_features  = inputs.shape[-1]
    output_features = outputs.shape[-1]

    # Our Test Model

    init = inits[config.init]
    activation = activations[config.activation]

    x = Input(shape=(input_features,))
    y = x
    y = multiresnet(y,
                    config.res_width, config.res_depth,
                    config.res_block, config.res_multi,
                    activation=activation, init=init,
                    style=config.parallel_style)
    if y.output_shape[0] != output_features:
        y = y.feed(Dense(output_features, init))

    model = Model(x, y, unsafe=config.unsafe)

    if 0:
        node_names = ' '.join([str(node) for node in model.ordered_nodes])
        log('{} nodes'.format(len(model.ordered_nodes)), node_names)
    else:
        for node in model.ordered_nodes:
            children = [str(n) for n in node.children]
            if len(children) > 0:
                sep = '->'
                print(str(node)+sep+('\n'+str(node)+sep).join(children))

    log('parameters', model.param_count)

    #

    training = config.epochs > 0 and config.restarts >= 0

    if config.fn_load is not None:
        log('loading weights', config.fn_load)
        model.load_weights(config.fn_load)

    #

    if config.optim == 'adam':
        assert not config.nesterov, "unimplemented"
        optim = Adam()
    elif config.optim == 'sgd':
        if config.momentum != 0:
            optim = Momentum(mu=config.momentum, nesterov=config.nesterov)
        else:
            optim = Optimizer()
    else:
        raise Exception('unknown optimizer', config.optim)

    def rscb(restart):
        measure_error() # declared later...
        log("restarting", restart)
        if config.restart_optim:
            optim.reset()

    #

    if config.learner == 'sgdr':
        expando = config.expando if 'expando' in config else None
        learner = SGDR(optim, epochs=config.epochs, rate=config.learn,
                       restart_decay=config.restart_decay,
                       restarts=config.restarts,
                       callback=rscb, expando=expando)
        # final learning rate isn't of interest here; it's gonna be close to 0.
        log('total epochs', learner.epochs)
    elif config.learner == 'anneal':
        learner = AnnealingLearner(optim, epochs=config.epochs,
                                   rate=config.learn,
                                   halve_every=config.learn_halve_every)
    elif config.learner == 'dumb':
        learner = DumbLearner(optim, epochs=config.epochs, rate=config.learn,
                              halve_every=config.learn_halve_every,
                              restarts=config.restarts,
                              restart_advance=config.learn_restart_advance,
                              callback=rscb)
        log("final learning rate", "{:10.8f}".format(learner.final_rate))
    elif config.learner == 'sgd':
        learner = Learner(optim, epochs=config.epochs, rate=config.learn)
        log("final learning rate", "{:10.8f}".format(learner.final_rate))
    else:
        raise Exception('unknown learner', config.learner)

    #

    def lookup_loss(maybe_name):
        if isinstance(maybe_name, Loss):
            return maybe_name
        elif maybe_name == 'mse':
            return Squared()
        elif maybe_name == 'mshe': # mushy
            return SquaredHalved()
        elif maybe_name == 'mae':
            return Absolute()
        elif maybe_name == 'msee':
            return SomethingElse()
        raise Exception('unknown objective', maybe_name)

    loss = lookup_loss(config.loss)
    mloss = lookup_loss(config.mloss) if config.mloss else loss

    if config.ritual == 'default':
        ritual = Ritual(learner=learner, loss=loss, mloss=mloss)
    elif config.ritual == 'stochm':
        ritual = StochMRitual(learner=learner, loss=loss, mloss=mloss)
    elif config.ritual == 'noisy':
        ritual = NoisyRitual(learner=learner, loss=loss, mloss=mloss,
                             input_noise=1e-1, output_noise=1e-2,
                             gradient_noise=2e-7)
    else:
        raise Exception('unknown ritual', config.ritual)

    # Training

    batch_losses = []
    train_losses = []
    valid_losses = []

    def measure_error():
        def print_error(name, inputs, outputs, comparison=None):
            predicted = model.forward(inputs)
            residual = predicted - outputs
            err = ritual.measure(residual)
            log(name + " loss", "{:12.6e}".format(err))
            # TODO: print logarithmic difference as it might be more meaningful
            # (fewer results stuck around -99%)
            if comparison:
                log("improvement", "{:+7.2f}%".format((comparison / err - 1) * 100))
            return err

        train_err = print_error("train", inputs, outputs,
                                config.train_compare)
        valid_err = print_error("valid", valid_inputs, valid_outputs,
                                config.valid_compare)
        train_losses.append(train_err)
        valid_losses.append(valid_err)

    measure_error()

    assert inputs.shape[0] % config.batch_size == 0, \
        "inputs is not evenly divisible by batch_size" # TODO: lift this restriction

    ritual.prepare(model)
    while learner.next():
        indices = np.arange(inputs.shape[0])
        np.random.shuffle(indices)
        shuffled_inputs = inputs[indices]
        shuffled_outputs = outputs[indices]

        avg_loss, losses = ritual.train_batched(
            shuffled_inputs, shuffled_outputs,
            config.batch_size,
            return_losses=True)
        batch_losses += losses

        #log("learning rate", "{:10.8f}".format(learner.rate))
        #log("average loss", "{:11.7f}".format(avg_loss))
        fmt = "epoch {:4.0f}, rate {:10.8f}, loss {:12.6e}"
        log("info", fmt.format(learner.epoch + 1, learner.rate, avg_loss))

    measure_error()

    if config.fn_save is not None:
        log('saving weights', config.fn_save)
        model.save_weights(config.fn_save, overwrite=True)

    # Evaluation
    # TODO: write this portion again

    if config.log_fn is not None:
        np.savez_compressed(config.log_fn,
                            batch_losses=nfa(batch_losses),
                            train_losses=nfa(train_losses),
                            valid_losses=nfa(valid_losses))

    return 0

if __name__ == '__main__':
    import sys
    sys.exit(run(sys.argv[0], sys.argv[1:]))
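
# minimal usage sketch of the pieces above, outside of run() (kept as a
# comment so importing this module stays side-effect free; train_X/train_Y
# are hypothetical float32 arrays of shape (N, 4) and (N, 1)):
#
#   x = Input(shape=(4,))
#   y = x.feed(Dense(16))
#   y = y.feed(Relu())
#   y = y.feed(Dense(1))
#   model = Model(x, y)
#   ritual = Ritual(learner=Learner(Adam(), epochs=10), loss=Squared())
#   ritual.prepare(model)
#   while ritual.learner.next():
#       ritual.train_batched(train_X, train_Y, batch_size=32)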