diff --git a/optim_nn.py b/optim_nn.py
index 9a6d6ca..f1e0989 100644
--- a/optim_nn.py
+++ b/optim_nn.py
@@ -113,6 +113,70 @@ class LayerNorm(Layer):
 
         return dX
 
+class Denses(Layer): # TODO: rename?
+    # acts as a separate Dense for each row or column. only for 2D arrays.
+
+    def __init__(self, dim, init=init_he_uniform, axis=-1):
+        super().__init__()
+        self.dim = int(dim)
+        self.weight_init = init
+        self.axis = int(axis)
+        self.size = None
+
+    def make_shape(self, shape):
+        super().make_shape(shape)
+        if len(shape) != 2:
+            return False
+
+        assert -len(shape) <= self.axis < len(shape)
+        self.axis = self.axis % len(shape)
+
+        self.output_shape = list(shape)
+        self.output_shape[self.axis] = self.dim
+        self.output_shape = tuple(self.output_shape)
+
+        self.nW = self.dim * np.prod(shape)
+        self.nb = np.prod(self.output_shape)
+        self.size = self.nW + self.nb
+
+        return shape
+
+    def init(self, W, dW):
+        super().init(W, dW)
+
+        ins, outs = np.prod(self.input_shape), np.prod(self.output_shape)
+
+        in_rows = self.input_shape[0]
+        in_cols = self.input_shape[1]
+        out_rows = self.output_shape[0]
+        out_cols = self.output_shape[1]
+
+        self.coeffs = self.W[:self.nW].reshape(in_rows, in_cols, self.dim)
+        self.biases = self.W[self.nW:].reshape(1, out_rows, out_cols)
+        self.dcoeffs = self.dW[:self.nW].reshape(self.coeffs.shape)
+        self.dbiases = self.dW[self.nW:].reshape(self.biases.shape)
+
+        self.coeffs.flat = self.weight_init(self.nW, ins, outs)
+        self.biases.flat = 0
+
+        self.std = np.std(self.W)
+
+    def F(self, X):
+        self.X = X
+        if self.axis == 0:
+            return np.einsum('ixj,xjk->ikj', X, self.coeffs) + self.biases
+        elif self.axis == 1:
+            return np.einsum('ijx,jxk->ijk', X, self.coeffs) + self.biases
+
+    def dF(self, dY):
+        self.dbiases[:] = dY.sum(0, keepdims=True)
+        if self.axis == 0:
+            self.dcoeffs[:] = np.einsum('ixj,ikj->xjk', self.X, dY)
+            return np.einsum('ikj,xjk->ixj', dY, self.coeffs)
+        elif self.axis == 1:
+            self.dcoeffs[:] = np.einsum('ijx,ijk->jxk', self.X, dY)
+            return np.einsum('ijk,jxk->ijx', dY, self.coeffs)
+
 class DenseOneLess(Dense):
     def init(self, W, dW):
         super().init(W, dW)
@@ -122,15 +186,13 @@ class DenseOneLess(Dense):
     def F(self, X):
         np.fill_diagonal(self.coeffs, 0)
         self.X = X
-        Y = X.dot(self.coeffs) + self.biases
-        return Y
+        return X.dot(self.coeffs) + self.biases
 
     def dF(self, dY):
-        dX = dY.dot(self.coeffs.T)
         self.dcoeffs[:] = self.X.T.dot(dY)
         self.dbiases[:] = dY.sum(0, keepdims=True)
         np.fill_diagonal(self.dcoeffs, 0)
-        return dX
+        return dY.dot(self.coeffs.T)
 
 class CosineDense(Dense):
     # paper: https://arxiv.org/abs/1702.05870
diff --git a/optim_nn_core.py b/optim_nn_core.py
index 0afb073..f2b8b2f 100644
--- a/optim_nn_core.py
+++ b/optim_nn_core.py
@@ -49,7 +49,7 @@ class Loss:
 
 class CategoricalCrossentropy(Loss):
     # lifted from theano
-    def __init__(self, eps=1e-8):
+    def __init__(self, eps=1e-6):
         self.eps = _f(eps)
 
     def F(self, p, y):
@@ -519,14 +519,20 @@ class Dense(Layer):
 
     def F(self, X):
         self.X = X
-        Y = X.dot(self.coeffs) + self.biases
-        return Y
+        return X.dot(self.coeffs) + self.biases
 
     def dF(self, dY):
-        dX = dY.dot(self.coeffs.T)
+        #Y = np.einsum('ix,xj->ij', X, C)
+        #dX = np.einsum('ix,jx->ij', dY, C)
+        #dC = np.einsum('xi,xj->ij', X, dY)
+        # or rather
+        #Y = np.einsum('ix,xj->ij', X, C)
+        #dX = np.einsum('ij,xj->ix', dY, C)
+        #dC = np.einsum('ix,ij->xj', X, dY)
+        # that makes sense, just move the pairs around
         self.dcoeffs[:] = self.X.T.dot(dY)
         self.dbiases[:] = dY.sum(0, keepdims=True)
-        return dX
+        return dY.dot(self.coeffs.T)
 
 # Models {{{1
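
The axis-dependent einsum strings in Denses.F/Denses.dF are easy to get
wrong, so here is a minimal standalone sanity check of the axis=0 pair.
It is not part of the patch: it assumes only numpy, and the names (rng,
C, f, the probed index [1, 2, 3]) are throwaway choices for illustration.

    import numpy as np

    rng = np.random.RandomState(0)
    i, x, j, k = 2, 3, 4, 5                  # batch, in_rows, in_cols, dim
    X = rng.randn(i, x, j)
    C = rng.randn(x, j, k)                   # same layout as Denses.coeffs

    Y = np.einsum('ixj,xjk->ikj', X, C)      # forward, as in Denses.F
    dY = rng.randn(*Y.shape)                 # arbitrary upstream gradient
    dC = np.einsum('ixj,ikj->xjk', X, dY)    # analytic, as in Denses.dF

    # central-difference check of one coefficient of dC
    f = lambda C: np.einsum('ixj,xjk->ikj', X, C)
    eps = 1e-6
    Cp, Cm = C.copy(), C.copy()
    Cp[1, 2, 3] += eps
    Cm[1, 2, 3] -= eps
    num = ((f(Cp) - f(Cm)) * dY).sum() / (2 * eps)
    assert np.allclose(dC[1, 2, 3], num, atol=1e-4)

The same recipe (perturb one weight, contract the output difference
against dY) extends to the axis=1 strings and to the returned dX.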
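
The commented-out derivation added to Dense.dF can likewise be verified
numerically. A quick sketch, again standalone and assuming only numpy;
X, C, dY here are throwaway arrays, not the layer's attributes:

    import numpy as np

    rng = np.random.RandomState(42)
    X  = rng.randn(5, 3)    # batch of 5, 3 inputs
    C  = rng.randn(3, 4)    # 3 inputs, 4 outputs
    dY = rng.randn(5, 4)    # upstream gradient

    # forward: Y = np.einsum('ix,xj->ij', X, C) == X @ C
    assert np.allclose(np.einsum('ix,xj->ij', X, C), X.dot(C))

    # backward: "move the pairs around"
    dX = np.einsum('ij,xj->ix', dY, C)    # == dY @ C.T
    dC = np.einsum('ix,ij->xj', X, dY)    # == X.T @ dY
    assert np.allclose(dX, dY.dot(C.T))
    assert np.allclose(dC, X.T.dot(dY))

The first triple in the comment block computes the same values with the
summation index renamed; the second triple is the one whose index
placement lines up with the .dot calls the code actually keeps.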