diff --git a/.dummy b/.dummy
deleted file mode 100644
index 945c9b4..0000000
--- a/.dummy
+++ /dev/null
@@ -1 +0,0 @@
-.
\ No newline at end of file
diff --git a/optim_nn.py b/optim_nn.py
new file mode 100644
index 0000000..6da6966
--- /dev/null
+++ b/optim_nn.py
@@ -0,0 +1,538 @@
#!/usr/bin/env python3

# imports

import numpy as np
nf = np.float32
nfa = lambda x: np.array(x, dtype=nf)
ni = np.int64  # np.int was just an alias for the builtin int and is gone in newer numpy
nia = lambda x: np.array(x, dtype=ni)

from collections import defaultdict

# Loss functions

class Loss:
    def mean(self, r):
        return np.average(self.f(r))

    def dmean(self, r):
        d = self.df(r)
        return d / len(d)

class SquaredHalved(Loss):
    def f(self, r):
        return np.square(r) / 2

    def df(self, r):
        return r

# Optimizers

class Optimizer:
    def __init__(self, alpha=0.1):
        self.alpha = nfa(alpha)
        self.reset()

    def reset(self):
        pass

    def compute(self, dW, W):
        return -self.alpha * dW

    def update(self, dW, W):
        W += self.compute(dW, W)

# the following optimizers are blatantly ripped from tiny-dnn:
# https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/optimizers/optimizer.h

class Adam(Optimizer):
    def __init__(self, alpha=0.001, b1=0.9, b2=0.999, b1_t=0.9, b2_t=0.999, eps=1e-8):
        self.alpha = nf(alpha) # learning rate
        self.b1 = nf(b1) # decay term
        self.b2 = nf(b2) # decay term
        self.b1_t_default = nf(b1_t) # decay term power t
        self.b2_t_default = nf(b2_t) # decay term power t
        self.eps = nf(eps)

        self.reset()

    def reset(self):
        self.mt = None
        self.vt = None
        self.b1_t = self.b1_t_default
        self.b2_t = self.b2_t_default

    def compute(self, dW, W):
        if self.mt is None:
            self.mt = np.zeros_like(W)
        if self.vt is None:
            self.vt = np.zeros_like(W)

        # decay
        self.b1_t *= self.b1
        self.b2_t *= self.b2

        self.mt = self.b1 * self.mt + (1 - self.b1) * dW
        self.vt = self.b2 * self.vt + (1 - self.b2) * dW * dW

        return -self.alpha * (self.mt / (1 - self.b1_t)) \
               / np.sqrt((self.vt / (1 - self.b2_t)) + self.eps)
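# The config further down mentions plain SGD with momentum/nesterov, but only Adam
# is implemented and the script asserts optim == 'adam'.  The class below is a
# hypothetical sketch (its name and defaults are mine, not part of the original
# file) of what a momentum optimizer with the same interface could look like:
# classical momentum, with the usual optional Nesterov look-ahead variant.

class Momentum(Optimizer):
    def __init__(self, alpha=0.01, mu=0.9, nesterov=False):
        self.alpha = nf(alpha)   # learning rate
        self.mu = nf(mu)         # momentum coefficient
        self.nesterov = bool(nesterov)
        self.reset()

    def reset(self):
        self.v = None            # velocity (running average of past updates)

    def compute(self, dW, W):
        if self.v is None:
            self.v = np.zeros_like(W)
        self.v = self.mu * self.v - self.alpha * dW
        if self.nesterov:
            # look-ahead: apply the momentum term again on top of the gradient step
            return self.mu * self.v - self.alpha * dW
        return self.v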
# Abstract Layers

_layer_counters = defaultdict(lambda: 0)

class Layer:
    def __init__(self):
        self.parents = []
        self.children = []
        self.input_shape = None
        self.output_shape = None
        kind = self.__class__.__name__
        global _layer_counters
        _layer_counters[kind] += 1
        self.name = "{}_{}".format(kind, _layer_counters[kind])
        self.size = None # total weight count (if any)

    def __str__(self):
        return self.name

    # methods we might want to override:

    def F(self, X):
        raise NotImplementedError("unimplemented", self)

    def dF(self, dY):
        raise NotImplementedError("unimplemented", self)

    def do_feed(self, child):
        pass

    def be_fed(self, parent):
        self.parents.append(parent)

    def make_shape(self, shape):
        assert shape is not None
        if self.output_shape is None:
            self.output_shape = shape
        return shape

    # TODO: rename this multi and B crap to something actually relevant.

    def multi(self, B):
        assert len(B) == 1, self
        return self.F(B[0].T).T

    def dmulti(self, dB):
        if len(dB) == 1:
            return self.dF(dB[0].T).T
        else:
            dX = None
            for dY in dB:
                if dX is None:
                    dX = self.dF(dY.T).T
                else:
                    dX += self.dF(dY.T).T
            return dX

    # general utility methods:

    def compatible(self, parent):
        if self.input_shape is None:
            # inherit shape from output
            shape = self.make_shape(parent.output_shape)
            if shape is None:
                return False
            self.input_shape = shape
        if np.all(self.input_shape == parent.output_shape):
            return True
        else:
            return False

    def feed(self, child):
        if not child.compatible(self):
            fmt = "{} is incompatible with {}: shape mismatch: {} vs. {}"
            raise Exception(fmt.format(self, child, self.output_shape, child.input_shape))
        self.children.append(child)
        self.do_feed(child)
        child.be_fed(self)
        return child

    def validate_input(self, X):
        assert X.shape[1:] == self.input_shape, (self, X.shape[1:], self.input_shape)

    def validate_output(self, Y):
        assert Y.shape[1:] == self.output_shape, (self, Y.shape[1:], self.output_shape)

    def forward(self, lut):
        assert len(self.parents) > 0, self
        #print(" forwarding", self)
        B = []
        for parent in self.parents:
            # TODO: skip over irrelevant nodes (if any)
            X = lut[parent]
            #print("collected parent", parent)
            self.validate_input(X)
            B.append(X)
        Y = self.multi(B)
        self.validate_output(Y)
        return Y

    def backward(self, lut):
        assert len(self.children) > 0, self
        #print(" backwarding", self)
        dB = []
        for child in self.children:
            # TODO: skip over irrelevant nodes (if any)
            dY = lut[child]
            #print(" collected child", child)
            self.validate_output(dY)
            dB.append(dY)
        dX = self.dmulti(dB)
        self.validate_input(dX)
        return dX

# Final Layers

class Sum(Layer):
    def multi(self, B):
        return np.sum(B, axis=0)

    def dmulti(self, dB):
        # the gradient w.r.t. every parent is the sum of the gradients coming
        # in from the children; with a single child this reduces to dB[0].
        return np.sum(dB, axis=0)

class Input(Layer):
    def __init__(self, shape):
        assert shape is not None
        super().__init__()
        self.shape = tuple(shape)
        self.input_shape = self.shape
        self.output_shape = self.shape

    def F(self, X):
        return X

    def dF(self, dY):
        #self.delta = dY
        return np.zeros_like(dY)

class Affine(Layer):
    def __init__(self, a=1, b=0):
        super().__init__()
        self.a = nf(a)
        self.b = nf(b)

    def F(self, X):
        return self.a * X + self.b

    def dF(self, dY):
        return dY * self.a

class Relu(Layer):
    def F(self, X):
        self.cond = X >= 0
        return np.where(self.cond, X, 0)

    def dF(self, dY):
        return np.where(self.cond, dY, 0)
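# A hypothetical example (not in the original file) of how another activation
# slots into this scheme: a layer only has to implement F for the forward pass
# and dF for the backward pass, caching in F whatever dF will need.  Tanh is
# shown purely as an illustration alongside Relu.

class Tanh(Layer):
    def F(self, X):
        self.Y = np.tanh(X)   # cache the activation for the backward pass
        return self.Y

    def dF(self, dY):
        return dY * (1 - self.Y * self.Y)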
class Dense(Layer):
    def __init__(self, dim):
        super().__init__()
        self.dim = ni(dim)
        self.output_shape = (dim,)
        self.size = None

    def init(self, W, dW):
        ins, outs = self.input_shape[0], self.output_shape[0]

        self.W = W
        self.dW = dW
        # coefficients and biases are views into the model's flat weight arrays
        self.coeffs = self.W[:self.nW].reshape(outs, ins)
        self.biases = self.W[self.nW:].reshape(outs, 1)
        self.dcoeffs = self.dW[:self.nW].reshape(outs, ins)
        self.dbiases = self.dW[self.nW:].reshape(outs)

        # he_normal
        s = np.sqrt(2 / ins)
        self.coeffs.flat = np.random.normal(0, s, size=self.nW)
        self.biases.flat = 0

    def make_shape(self, shape):
        super().make_shape(shape)
        if len(shape) != 1:
            return None # compatible() only checks for None, so don't return False here
        self.nW = self.dim * shape[0]
        self.nb = self.dim
        self.size = self.nW + self.nb
        return shape

    def F(self, X):
        self.X = X
        Y = self.coeffs.dot(X) \
          + self.biases
        return Y

    def dF(self, dY):
        # http://cs231n.github.io/optimization-2/#gradients-for-vectorized-operations
        # note: because we only call dF once (we only have a dF/dY method),
        # we have to compute the dF/dW gradients here too.
        dX = self.coeffs.T.dot(dY)
        self.dcoeffs[:] = dY.dot(self.X.T)
        self.dbiases[:] = np.sum(dY, axis=1)
        return dX
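# Hypothetical helper (not part of the original file): a finite-difference check
# of the F/dF contract for layers without weights (Relu, Affine, ...).  dF is a
# vector-Jacobian product, so for random probe vectors u and v, u . dF(v) should
# match the central difference of v . F(X) along u.
# e.g. check_layer_gradient(Relu(), np.random.randn(4, 8)) should be close to zero.

def check_layer_gradient(layer, X, eps=1e-3):
    u = np.random.randn(*X.shape)            # input-side probe direction
    v = np.random.randn(*layer.F(X).shape)   # output-side probe direction
    layer.F(X)                               # refresh any cached state (e.g. Relu.cond)
    analytic = np.sum(u * layer.dF(v))
    numerical = np.sum(v * (layer.F(X + eps * u) - layer.F(X - eps * u))) / (2 * eps)
    return abs(analytic - numerical)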
# Model

class Model:
    def __init__(self, x, y):
        assert isinstance(x, Layer), x
        assert isinstance(y, Layer), y
        self.x = x
        self.y = y

        self.ordered_nodes = self.traverse([], self.y)
        print([str(node) for node in self.ordered_nodes])
        #print(len(self.ordered_nodes))

        self.make_weights()

    def make_weights(self):
        self.param_count = 0
        for node in self.ordered_nodes:
            if node.size is not None:
                self.param_count += node.size
        print(self.param_count)
        self.W = np.zeros(self.param_count, dtype=nf)
        self.dW = np.zeros(self.param_count, dtype=nf)

        offset = 0
        for node in self.ordered_nodes:
            if node.size is not None:
                end = offset + node.size
                node.init(self.W[offset:end], self.dW[offset:end])
                offset += node.size

        #print(self.W, self.dW)

    def traverse(self, nodes, node):
        if node == self.x:
            return [node]
        for parent in node.parents:
            if parent not in nodes:
                new_nodes = self.traverse(nodes, parent)
                for new_node in new_nodes:
                    if new_node not in nodes:
                        nodes.append(new_node)
        if nodes:
            nodes.append(node)
        return nodes

    def forward(self, X):
        lut = dict()
        input_node = self.ordered_nodes[0]
        output_node = self.ordered_nodes[-1]
        #lut[input_node] = input_node.F(X)
        lut[input_node] = input_node.multi(np.expand_dims(X, 0))
        for node in self.ordered_nodes[1:]:
            lut[node] = node.forward(lut)
        return lut[output_node]

    def backward(self, error):
        lut = dict()
        input_node = self.ordered_nodes[0]
        output_node = self.ordered_nodes[-1]
        #lut[output_node] = output_node.dF(error)
        lut[output_node] = output_node.dmulti(np.expand_dims(error, 0))
        #for node in self.ordered_nodes[-2:0:-1]:
        for node in reversed(self.ordered_nodes[:-1]):
            lut[node] = node.backward(lut)
        #return lut[input_node] # meaningless value

        return self.dW

    def load_model(self, fn):
        # seemingly compatible with keras models at the moment
        import h5py
        f = h5py.File(fn, 'r')
        loadweights = {}
        def visitor(name, obj):
            if isinstance(obj, h5py.Dataset):
                loadweights[name.split('/')[-1]] = nfa(obj[:])
        f.visititems(visitor)
        f.close()

        denses = [node for node in self.ordered_nodes if isinstance(node, Dense)]
        for i in range(len(denses)):
            a, b = i, i + 1
            b_name = "dense_{}".format(b)
            denses[a].coeffs = loadweights[b_name+'_W'].T
            denses[a].biases = np.expand_dims(loadweights[b_name+'_b'], -1)

    def save_model(self, fn, overwrite=False):
        raise NotImplementedError("unimplemented", self)
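# save_model above is still a stub.  The function below is a hypothetical sketch
# (not part of the original file) of the inverse of load_model: it writes one
# dataset per Dense layer using the same "dense_N_W" / "dense_N_b" names that
# load_model reads.  No claim is made that the result is a complete Keras file;
# it only round-trips with load_model above.

def save_dense_weights(model, fn):
    import h5py
    f = h5py.File(fn, 'w')
    denses = [node for node in model.ordered_nodes if isinstance(node, Dense)]
    for i, dense in enumerate(denses):
        name = "dense_{}".format(i + 1)
        # load_model transposes W and expands b, so undo that here
        f.create_dataset(name + '_W', data=dense.coeffs.T)
        f.create_dataset(name + '_b', data=np.squeeze(dense.biases, -1))
    f.close()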
if __name__ == '__main__':
    # Config

    from dotmap import DotMap
    config = DotMap(
        fn = 'ml/cie_mlp_min.h5',

        batch_size = 64,

        res_width = 12,
        res_depth = 3,
        res_block = 2, # normally 2
        res_multi = 4, # normally 1
        activation = 'relu',

        optim = 'adam',
        nesterov = False, # only used with SGD or Adam
        momentum = 0.33, # only used with SGD
        epochs = 6, # 6
        LR = 1e-2,
        restarts = 3, # 3
        LR_halve_every = 2,
        LR_restart_advance = 3,
        init = 'he_normal',
        loss = 'mse',

        parallel_style = 'batchless',
    )

    # toy CIE-2000 data
    from ml.cie_mlp_data import rgbcompare, input_samples, output_samples, x_scale, y_scale

    def read_data(fn):
        data = np.load(fn)
        try:
            inputs, outputs = data['inputs'], data['outputs']
        except KeyError:
            # fall back to the default names np.savez assigns when no keywords are given.
            inputs, outputs = data['arr_0'], data['arr_1']
        return inputs, outputs

    inputs, outputs = read_data("ml/cie_mlp_data.npz")
    valid_data = read_data("ml/cie_mlp_vdata.npz")
    valid_inputs, valid_outputs = valid_data

    # Our Test Model

    x = Input(shape=(input_samples,))
    y = x
    last_size = input_samples

    for blah in range(config.res_depth):
        size = config.res_width

        if last_size != size:
            y = y.feed(Dense(size))

        assert config.parallel_style == 'batchless'
        skip = y
        merger = Sum()
        skip.feed(merger)
        z_start = skip.feed(Relu())
        for i in range(config.res_multi):
            z = z_start
            for j in range(config.res_block):
                if j > 0:
                    z = z.feed(Relu())
                z = z.feed(Dense(size))
            z.feed(merger)
        y = merger

        last_size = size

    if last_size != output_samples:
        y = y.feed(Dense(output_samples))

    model = Model(x, y)

    training = config.epochs > 0 and config.restarts >= 0

    if not training:
        model.load_model(config.fn)

    assert config.optim == 'adam'
    if config.nesterov:
        assert False, "unimplemented"
    else:
        optim = Adam()

    assert config.loss == 'mse'
    loss = SquaredHalved()

    LR = config.LR
    LRprod = 0.5**(1/config.LR_halve_every)

    # Training

    def measure_loss():
        predicted = model.forward(inputs / x_scale)
        residual = predicted - outputs / y_scale
        err = loss.mean(residual)
        print("train loss: {:10.6f}".format(err))

        predicted = model.forward(valid_inputs / x_scale)
        residual = predicted - valid_outputs / y_scale
        err = loss.mean(residual)
        print("valid loss: {:10.6f}".format(err))

    for i in range(config.restarts + 1):
        measure_loss()

        if i > 0:
            print("restarting")

        assert inputs.shape[0] % config.batch_size == 0, \
               "inputs is not evenly divisible by batch_size" # TODO: lift this restriction
        batch_count = inputs.shape[0] // config.batch_size
        for e in range(config.epochs):
            indices = np.arange(len(inputs))
            np.random.shuffle(indices)
            shuffled_inputs = inputs[indices] / x_scale
            shuffled_outputs = outputs[indices] / y_scale

            cumsum_loss = 0
            for b in range(batch_count):
                bi = b * config.batch_size
                batch_inputs  = shuffled_inputs[ bi:bi+config.batch_size]
                batch_outputs = shuffled_outputs[bi:bi+config.batch_size]

                predicted = model.forward(batch_inputs)
                dW = model.backward(np.ones_like(predicted))

                residual = predicted - batch_outputs

                # TODO: try something like this instead?
                #err_dW = np.dot(loss.dmean(residual), np.expand_dims(dW, 0))
                err_dW = loss.df(residual) * dW / len(residual)
                err_dW = np.sum(err_dW, axis=0)
                optim.alpha = LR * LRprod**e
                optim.update(err_dW, model.W)

                # note: we don't actually need this for training, only monitoring.
                cumsum_loss += loss.mean(residual)
            print("avg loss: {:10.6f}".format(cumsum_loss / batch_count))

        LR *= LRprod**config.LR_restart_advance

    measure_loss()

    #if training:
    #    model.save_model(config.fn, overwrite=True)

    # Evaluation

    a = (192, 128, 64)
    b = (64, 128, 192)
    X = np.expand_dims(np.hstack((a, b)), 0) / x_scale
    P = model.forward(X) * y_scale
    print("truth:", rgbcompare(a, b))
    print("network:", np.squeeze(P))
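    # hypothetical extra report (not in the original file): the absolute error
    # between the network's prediction and the CIEDE2000 reference printed above,
    # for this one colour pair.
    print("abs error:", np.abs(np.squeeze(P) - np.squeeze(nfa(rgbcompare(a, b)))))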