diff --git a/optim_nn.py b/optim_nn.py
index eddd8e5..9e52f8d 100755
--- a/optim_nn.py
+++ b/optim_nn.py
@@ -93,9 +93,10 @@ class LayerNorm(Layer):
         super().__init__()
         self.eps = _f(eps)
         self.affine = bool(affine)
-        self.size = None

         if self.affine:
+            self.gamma = self._new_weights('gamma', init=init_ones)
+            self.beta = self._new_weights('beta', init=init_zeros)
             self.serialized = {
                 'gamma': 'gamma',
                 'beta': 'beta',
@@ -104,23 +105,12 @@ class LayerNorm(Layer):
     def make_shape(self, parent):
         shape = parent.output_shape
         self.input_shape = shape
+        self.output_shape = shape
         if len(shape) != 1:
             return False
-        self.features = shape[0]
         if self.affine:
-            self.size = 2 * self.features
-        return shape
-
-    def init(self, W, dW):
-        super().init(W, dW)
-
-        f = self.features
-
-        self.gamma, self.dgamma = self.W[0*f:1*f], self.dW[0*f:1*f]
-        self.beta, self.dbeta = self.W[1*f:2*f], self.dW[1*f:2*f]
-
-        self.gamma[:] = 1
-        self.beta[:] = 0
+            self.gamma.shape = (shape[0],)
+            self.beta.shape = (shape[0],)

     def forward(self, X):
         self.mean = X.mean(0)
@@ -130,16 +120,16 @@ class LayerNorm(Layer):
         self.Xnorm = self.center / self.std

         if self.affine:
-            return self.gamma * self.Xnorm + self.beta
+            return self.gamma.f * self.Xnorm + self.beta.f
         return self.Xnorm

     def backward(self, dY):
         length = dY.shape[0]

         if self.affine:
-            dXnorm = dY * self.gamma
-            self.dgamma[:] = (dY * self.Xnorm).sum(0)
-            self.dbeta[:] = dY.sum(0)
+            dXnorm = dY * self.gamma.f
+            self.gamma.g[:] = (dY * self.Xnorm).sum(0)
+            self.beta.g[:] = dY.sum(0)
         else:
             dXnorm = dY

@@ -163,7 +153,8 @@ class Denses(Layer): # TODO: rename?
         self.dim = int(dim)
         self.weight_init = init
         self.axis = int(axis)
-        self.size = None
+        self.coeffs = self._new_weights('coeffs', init=init)
+        self.biases = self._new_weights('biases', init=init_zeros)

     def make_shape(self, parent):
         shape = parent.output_shape
@@ -178,64 +169,46 @@ class Denses(Layer): # TODO: rename?
         self.output_shape[self.axis] = self.dim
         self.output_shape = tuple(self.output_shape)

-        self.nW = self.dim * np.prod(shape)
-        self.nb = np.prod(self.output_shape)
-        self.size = self.nW + self.nb
-
-        return shape
-
-    def init(self, W, dW):
-        super().init(W, dW)
-
-        ins, outs = np.prod(self.input_shape), np.prod(self.output_shape)
-
         in_rows = self.input_shape[0]
         in_cols = self.input_shape[1]
         out_rows = self.output_shape[0]
         out_cols = self.output_shape[1]

-        self.coeffs = self.W[:self.nW].reshape(in_rows, in_cols, self.dim)
-        self.biases = self.W[self.nW:].reshape(1, out_rows, out_cols)
-        self.dcoeffs = self.dW[:self.nW].reshape(self.coeffs.shape)
-        self.dbiases = self.dW[self.nW:].reshape(self.biases.shape)
-
-        self.coeffs.flat = self.weight_init(self.nW, ins, outs)
-        self.biases.flat = 0
-
-        self.std = np.std(self.W)
+        self.coeffs.shape = (in_rows, in_cols, self.dim)
+        self.biases.shape = (1, out_rows, out_cols)

     def forward(self, X):
         self.X = X
         if self.axis == 0:
-            return np.einsum('ixj,xjk->ikj', X, self.coeffs) + self.biases
+            return np.einsum('ixj,xjk->ikj', X, self.coeffs.f) + self.biases.f
         elif self.axis == 1:
-            return np.einsum('ijx,jxk->ijk', X, self.coeffs) + self.biases
+            return np.einsum('ijx,jxk->ijk', X, self.coeffs.f) + self.biases.f

     def backward(self, dY):
-        self.dbiases[:] = dY.sum(0, keepdims=True)
+        self.biases.g[:] = dY.sum(0, keepdims=True)
         if self.axis == 0:
-            self.dcoeffs[:] = np.einsum('ixj,ikj->xjk', self.X, dY)
-            return np.einsum('ikj,xjk->ixj', dY, self.coeffs)
+            self.coeffs.g[:] = np.einsum('ixj,ikj->xjk', self.X, dY)
+            return np.einsum('ikj,xjk->ixj', dY, self.coeffs.f)
         elif self.axis == 1:
-            self.dcoeffs[:] = np.einsum('ijx,ijk->jxk', self.X, dY)
-            return np.einsum('ijk,jxk->ijx', dY, self.coeffs)
+            self.coeffs.g[:] = np.einsum('ijx,ijk->jxk', self.X, dY)
+            return np.einsum('ijk,jxk->ijx', dY, self.coeffs.f)

 class DenseOneLess(Dense):
-    def init(self, W, dW):
-        super().init(W, dW)
+    def init(self, allocator):
+        super().init(allocator)
         ins, outs = self.input_shape[0], self.output_shape[0]
         assert ins == outs, (ins, outs)

     def forward(self, X):
-        np.fill_diagonal(self.coeffs, 0)
+        np.fill_diagonal(self.coeffs.f, 0)
         self.X = X
-        return X.dot(self.coeffs) + self.biases
+        return X.dot(self.coeffs.f) + self.biases.f

     def backward(self, dY):
-        self.dcoeffs[:] = self.X.T.dot(dY)
-        self.dbiases[:] = dY.sum(0, keepdims=True)
-        np.fill_diagonal(self.dcoeffs, 0)
-        return dY.dot(self.coeffs.T)
+        self.coeffs.g[:] = self.X.T.dot(dY)
+        self.biases.g[:] = dY.sum(0, keepdims=True)
+        np.fill_diagonal(self.coeffs.g, 0)
+        return dY.dot(self.coeffs.f.T)

 class CosineDense(Dense):
     # paper: https://arxiv.org/abs/1702.05870
@@ -250,9 +223,9 @@ class CosineDense(Dense):
         self.X = X
         self.X_norm = np.sqrt(np.square(X).sum(-1, keepdims=True) \
           + 1 + self.eps)
-        self.W_norm = np.sqrt(np.square(self.coeffs).sum(0, keepdims=True) \
-          + np.square(self.biases) + self.eps)
-        self.dot = X.dot(self.coeffs) + self.biases
+        self.W_norm = np.sqrt(np.square(self.coeffs.f).sum(0, keepdims=True) \
+          + np.square(self.biases.f) + self.eps)
+        self.dot = X.dot(self.coeffs.f) + self.biases.f
         Y = self.dot / (self.X_norm * self.W_norm)
         return Y

@@ -261,11 +234,11 @@ class CosineDense(Dense):
         dX_norm = -(dY * self.dot / self.W_norm).sum(-1, keepdims=True) / self.X_norm**2
         dW_norm = -(dY * self.dot / self.X_norm).sum( 0, keepdims=True) / self.W_norm**2

-        self.dcoeffs[:] = self.X.T.dot(ddot) \
-          + dW_norm / self.W_norm * self.coeffs
-        self.dbiases[:] = ddot.sum(0, keepdims=True) \
-          + dW_norm / self.W_norm * self.biases
-        dX = ddot.dot(self.coeffs.T) + dX_norm / self.X_norm * self.X
+        self.coeffs.g[:] = self.X.T.dot(ddot) \
+          + dW_norm / self.W_norm * self.coeffs.f
+        self.biases.g[:] = ddot.sum(0, keepdims=True) \
+          + dW_norm / self.W_norm * self.biases.f
+        dX = ddot.dot(self.coeffs.f.T) + dX_norm / self.X_norm * self.X

         return dX

@@ -817,7 +790,7 @@ def run(program, args=None):

     ritual.prepare(model)

-    if training and config.warmup:
+    if training and config.warmup and not config.fn_load:
         log("warming", "up")

         # use plain SGD in warmup to prevent (or possibly cause?) numeric issues
diff --git a/optim_nn_core.py b/optim_nn_core.py
index f39ee60..8cf9bb9 100644
--- a/optim_nn_core.py
+++ b/optim_nn_core.py
@@ -4,8 +4,8 @@ _f = np.float32
 # just for speed, not strictly essential:
 from scipy.special import expit as sigmoid

-# used for numbering layers like Keras:
-from collections import defaultdict
+# used for numbering layers like Keras, and keeping initialization consistent:
+from collections import defaultdict, OrderedDict
 _layer_counters = defaultdict(lambda: 0)

 def _check(a):
@@ -28,6 +28,12 @@ class LayerIncompatibility(Exception):

 # note: these are currently only implemented for 2D shapes.

+def init_zeros(size, ins=None, outs=None):
+    return np.zeros(size)
+
+def init_ones(size, ins=None, outs=None):
+    return np.ones(size)
+
 def init_he_normal(size, ins, outs):
     s = np.sqrt(2 / ins)
     return np.random.normal(0, s, size=size)
@@ -264,19 +270,57 @@ class Nadam(Optimizer):

         return -self.alpha * mt_bar / (np.sqrt(vtp) + self.eps)

+# Weight container {{{1
+
+class Weights:
+    # we may or may not contain weights -- or any information, for that matter.
+
+    def __init__(self, **kwargs):
+        self.f = None # forward weights
+        self.g = None # backward weights (gradients)
+        self.shape = None
+        self.init = None
+        self.allocator = None
+
+        self.configure(**kwargs)
+
+    def configure(self, **kwargs):
+        for k, v in kwargs.items():
+            getattr(self, k) # ensures the key already exists
+            setattr(self, k, v)
+
+    @property
+    def size(self):
+        assert self.shape is not None
+        return np.prod(self.shape)
+
+    def allocate(self, *args, **kwargs):
+        self.configure(**kwargs)
+
+        # intentionally not using isinstance
+        assert type(self.shape) == tuple, self.shape
+
+        f, g = self.allocator(self.size)
+        assert len(f) == self.size, "{} != {}".format(f.shape, self.size)
+        assert len(g) == self.size, "{} != {}".format(g.shape, self.size)
+        f[:] = self.init(self.size, *args)
+        g[:] = self.init(self.size, *args)
+        self.f = f.reshape(self.shape)
+        self.g = g.reshape(self.shape)
+
 # Abstract Layers {{{1

 class Layer:
     def __init__(self):
         self.parents = []
         self.children = []
+        self.weights = OrderedDict()
         self.input_shape = None
         self.output_shape = None
         kind = self.__class__.__name__
         global _layer_counters
         _layer_counters[kind] += 1
         self.name = "{}_{}".format(kind, _layer_counters[kind])
-        self.size = None # total weight count (if any)
         self.unsafe = False # disables assertions for better performance

     def __str__(self):
@@ -335,11 +379,20 @@ class Layer:
     def validate_output(self, Y):
         assert Y.shape[1:] == self.output_shape, (str(self), Y.shape[1:], self.output_shape)

-    def init(self, W, dW):
-        assert W.ndim == 1 and W.shape[0] == self.size, W.shape
-        assert dW.ndim == 1 and dW.shape[0] == self.size, dW.shape
-        self.W = W
-        self.dW = dW
+    def _new_weights(self, name, **kwargs):
+        w = Weights(**kwargs)
+        assert name not in self.weights, name
+        self.weights[name] = w
+        return w
+
+    @property
+    def size(self):
+        return sum((w.size for w in self.weights.values()))
+
+    def init(self, allocator):
+        ins, outs = self.input_shape[0], self.output_shape[0]
+        for k, w in self.weights.items():
+            w.allocate(ins, outs, allocator=allocator)

     def propagate(self, values):
         if not self.unsafe:
@@ -407,7 +460,6 @@ class Flatten(Layer):
         shape = parent.output_shape
         self.input_shape = shape
         self.output_shape = (np.prod(shape),)
-        return shape

     def forward(self, X):
         self.batch_size = X.shape[0]
@@ -527,42 +579,25 @@ class Dense(Layer):
         super().__init__()
         self.dim = int(dim)
         self.output_shape = (dim,)
-        self.weight_init = init
-        self.size = None
+        self.coeffs = self._new_weights('coeffs', init=init)
+        self.biases = self._new_weights('biases', init=init_zeros)

     def make_shape(self, parent):
         shape = parent.output_shape
         self.input_shape = shape
         if len(shape) != 1:
             return False
-        self.nW = self.dim * shape[0]
-        self.nb = self.dim
-        self.size = self.nW + self.nb
-        return shape
-
-    def init(self, W, dW):
-        super().init(W, dW)
-
-        ins, outs = self.input_shape[0], self.output_shape[0]
-
-        self.coeffs = self.W[:self.nW].reshape(ins, outs)
-        self.biases = self.W[self.nW:].reshape(1, outs)
-        self.dcoeffs = self.dW[:self.nW].reshape(ins, outs)
-        self.dbiases = self.dW[self.nW:].reshape(1, outs)
-
-        self.coeffs.flat = self.weight_init(self.nW, ins, outs)
-        self.biases.flat = 0
-
-        self.std = np.std(self.W)
+        self.coeffs.shape = (shape[0], self.dim)
+        self.biases.shape = (1, self.dim)

     def forward(self, X):
         self.X = X
-        return X.dot(self.coeffs) + self.biases
+        return X.dot(self.coeffs.f) + self.biases.f

     def backward(self, dY):
-        self.dcoeffs[:] = self.X.T.dot(dY)
-        self.dbiases[:] = dY.sum(0, keepdims=True)
-        return dY.dot(self.coeffs.T)
+        self.coeffs.g[:] = self.X.T.dot(dY)
+        self.biases.g[:] = dY.sum(0, keepdims=True)
+        return dY.dot(self.coeffs.f.T)

 # Models {{{1

@@ -578,18 +613,30 @@ class Model:
             node.unsafe = unsafe

     def make_weights(self):
-        self.param_count = 0
-        for node in self.ordered_nodes:
-            if node.size is not None:
-                self.param_count += node.size
+        self.param_count = sum((node.size for node in self.ordered_nodes))
         self.W = np.zeros(self.param_count, dtype=_f)
         self.dW = np.zeros(self.param_count, dtype=_f)

         offset = 0
         for node in self.ordered_nodes:
-            if node.size is not None:
+            if node.size > 0:
                 end = offset + node.size
-                node.init(self.W[offset:end], self.dW[offset:end])
+                inner_offset = 0
+
+                def allocate(size):
+                    nonlocal inner_offset
+                    o = offset + inner_offset
+                    ret = self.W[o:o+size], self.dW[o:o+size]
+                    inner_offset += size
+                    assert len(ret[0]) == len(ret[1])
+                    assert size == len(ret[0]), (size, len(ret[0]))
+                    return ret
+
+                node.init(allocate)
+                assert inner_offset <= node.size, "Layer {} allocated more weights than it said it would".format(node)
+                # i don't care if "less" is grammatically incorrect.
+                # you're mom is grammatically incorrect.
+                assert inner_offset >= node.size, "Layer {} allocated less weights than it said it would".format(node)
                 offset += node.size

     def traverse(self, nodes, node):
@@ -638,14 +685,14 @@ class Model:
         for k in weights.keys():
             used[k] = False

-        nodes = [node for node in self.ordered_nodes if node.size is not None]
+        nodes = [node for node in self.ordered_nodes if node.size > 0]
         for node in nodes:
             full_name = str(node).lower()
             for s_name, o_name in node.serialized.items():
                 key = full_name + '_' + s_name
                 data = weights[key]
                 target = getattr(node, o_name)
-                target[:] = data
+                target.f[:] = data
                 used[key] = True

         for k, v in used.items():
@@ -658,7 +705,7 @@ class Model:

         counts = defaultdict(lambda: 0)

-        nodes = [node for node in self.ordered_nodes if node.size is not None]
+        nodes = [node for node in self.ordered_nodes if node.size > 0]
         for node in nodes:
             full_name = str(node).lower()
             grp = f.create_group(full_name)
@@ -666,7 +713,7 @@ class Model:
                 key = full_name + '_' + s_name
                 target = getattr(node, o_name)
                 data = grp.create_dataset(key, target.shape, dtype=_f)
-                data[:] = target
+                data[:] = target.f
                 counts[key] += 1
                 if counts[key] > 1:
                     lament("WARNING: rewrote weight", key)
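
A minimal sketch (not part of the patch above) of how the new Weights container and the per-layer allocator fit together, assuming the interfaces added in optim_nn_core.py; the demo_allocate helper and the (3, 4) shapes stand in for what Model.make_weights and a Dense(4) layer on a 3-dimensional input would do, and are only illustrative.

    import numpy as np
    from optim_nn_core import Weights, init_he_normal, init_zeros

    # stand-in for Model.make_weights: hand out consecutive slices of flat W/dW buffers
    W = np.zeros(16, dtype=np.float32)   # 3*4 coeffs + 1*4 biases
    dW = np.zeros(16, dtype=np.float32)
    offset = 0

    def demo_allocate(size):
        global offset
        f, g = W[offset:offset+size], dW[offset:offset+size]
        offset += size
        return f, g

    # make_shape() records the shapes, init() then allocates views into W/dW:
    coeffs = Weights(init=init_he_normal)
    coeffs.shape = (3, 4)
    biases = Weights(init=init_zeros)
    biases.shape = (1, 4)
    coeffs.allocate(3, 4, allocator=demo_allocate)  # fills coeffs.f and coeffs.g
    biases.allocate(3, 4, allocator=demo_allocate)
    assert offset == W.size  # every slot of the flat buffers is claimed exactly once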