diff --git a/optim_nn.py b/optim_nn.py
index 6da6966..0031d79 100644
--- a/optim_nn.py
+++ b/optim_nn.py
@@ -160,10 +160,10 @@ class Layer:
         return child

     def validate_input(self, X):
-        assert X.shape[1:] == self.input_shape, (self, X.shape[1:], self.input_shape)
+        assert X.shape[1:] == self.input_shape, (str(self), X.shape[1:], self.input_shape)

     def validate_output(self, Y):
-        assert Y.shape[1:] == self.output_shape, (self, Y.shape[1:], self.output_shape)
+        assert Y.shape[1:] == self.output_shape, (str(self), Y.shape[1:], self.output_shape)

     def forward(self, lut):
         assert len(self.parents) > 0, self
@@ -215,7 +215,7 @@ class Input(Layer):
         return X

     def dF(self, dY):
-        #self.delta = dY
+        #self.dY = dY
         return np.zeros_like(dY)

 class Affine(Layer):
@@ -238,6 +238,17 @@ class Relu(Layer):
     def dF(self, dY):
         return np.where(self.cond, dY, 0)

+class GeluApprox(Layer):
+    # refer to https://www.desmos.com/calculator/ydzgtccsld
+    def F(self, X):
+        from scipy.special import expit as sigmoid
+        self.a = 1.704 * X
+        self.sig = sigmoid(self.a)
+        return X * self.sig
+
+    def dF(self, dY):
+        return dY * self.sig * (1 + self.a * (1 - self.sig))
+
 class Dense(Layer):
     def __init__(self, dim):
         super().__init__()
@@ -250,14 +261,12 @@
         self.W = W
         self.dW = dW

-        #self.coeffs = np.random.normal(0, s, size=self.size)
-        #self.biases = np.zeros((self.dim, 1), dtype=nf)
         self.coeffs = self.W[:self.nW].reshape(outs, ins)
         self.biases = self.W[self.nW:].reshape(outs, 1)
         self.dcoeffs = self.dW[:self.nW].reshape(outs, ins)
         self.dbiases = self.dW[self.nW:].reshape(outs)

-        # he_normal
+        # he_normal initialization
         s = np.sqrt(2 / ins)
         self.coeffs.flat = np.random.normal(0, s, size=self.nW)
         self.biases.flat = 0
@@ -278,9 +287,6 @@ class Dense(Layer):
         return Y

     def dF(self, dY):
-        # http://cs231n.github.io/optimization-2/#gradients-for-vectorized-operations
-        # note: because we only call df once (we only have a df/dy method),
-        # we have to do df/dw stuff here too.
         dX = self.coeffs.T.dot(dY)
         self.dcoeffs[:] = dY.dot(self.X.T)
         self.dbiases[:] = np.sum(dY, axis=1)
@@ -296,8 +302,8 @@ class Model:
         self.y = y
         self.ordered_nodes = self.traverse([], self.y)
-        print([str(node) for node in self.ordered_nodes])
-        #print(len(self.ordered_nodes))
+        node_names = ' '.join([str(node) for node in self.ordered_nodes])
+        print('{} nodes: {}'.format(len(self.ordered_nodes), node_names))

         self.make_weights()
@@ -336,7 +342,6 @@ class Model:
         lut = dict()
         input_node = self.ordered_nodes[0]
         output_node = self.ordered_nodes[-1]
-        #lut[input_node] = input_node.F(X)
         lut[input_node] = input_node.multi(np.expand_dims(X, 0))
         for node in self.ordered_nodes[1:]:
             lut[node] = node.forward(lut)
@@ -346,13 +351,10 @@
         lut = dict()
         input_node = self.ordered_nodes[0]
         output_node = self.ordered_nodes[-1]
-        #lut[output_node] = output_node.dF(error)
         lut[output_node] = output_node.dmulti(np.expand_dims(error, 0))
-        #for node in self.ordered_nodes[-2:0:-1]:
         for node in reversed(self.ordered_nodes[:-1]):
             lut[node] = node.backward(lut)
         #return lut[input_node] # meaningless value
-
         return self.dW

     def load_model(self, fn):
@@ -389,7 +391,7 @@ if __name__ == '__main__':
        res_depth = 3,
        res_block = 2, # normally 2
        res_multi = 4, # normally 1
-       activation = 'relu',
+       activation = 'gelu',
        optim = 'adam',
        nesterov = False, # only used with SGD or Adam
@@ -427,6 +429,9 @@
     y = x
     last_size = input_samples

+    activations = dict(relu=Relu, gelu=GeluApprox)
+    activation = activations[config.activation]
+
     for blah in range(config.res_depth):
         size = config.res_width
@@ -437,12 +442,12 @@
         skip = y
         merger = Sum()
         skip.feed(merger)
-        z_start = skip.feed(Relu())
+        z_start = skip.feed(activation())
         for i in range(config.res_multi):
             z = z_start
             for i in range(config.res_block):
                 if i > 0:
-                    z = z.feed(Relu())
+                    z = z.feed(activation())
                 z = z.feed(Dense(size))
             z.feed(merger)
         y = merger
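Note on the new GeluApprox layer (a standalone sketch for review, not part of the patch): F(x) = x * sigmoid(1.704 * x), so with a = 1.704 * x and s = sigmoid(a), the derivative is F'(x) = s * (1 + a * (1 - s)), which is what dF computes. The snippet below checks that against a central difference; the helper names gelu_approx and gelu_approx_grad are made up here for illustration.

# standalone check of the GeluApprox math above; not part of optim_nn.py
import numpy as np
from scipy.special import expit as sigmoid

def gelu_approx(x):
    # same function as GeluApprox.F
    return x * sigmoid(1.704 * x)

def gelu_approx_grad(x):
    # closed-form derivative, matching GeluApprox.dF (with dY == 1)
    a = 1.704 * x
    s = sigmoid(a)
    return s * (1 + a * (1 - s))

# central-difference check of the analytic gradient
x = np.linspace(-4, 4, 1001)
eps = 1e-5
numeric = (gelu_approx(x + eps) - gelu_approx(x - eps)) / (2 * eps)
assert np.allclose(gelu_approx_grad(x), numeric, atol=1e-6)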