This commit is contained in:
Connor Olding 2017-01-09 04:35:28 -08:00
parent 3cd65a749c
commit 8baa7a267a

View file

@@ -160,10 +160,10 @@ class Layer:
return child return child
def validate_input(self, X): def validate_input(self, X):
assert X.shape[1:] == self.input_shape, (self, X.shape[1:], self.input_shape) assert X.shape[1:] == self.input_shape, (str(self), X.shape[1:], self.input_shape)
def validate_output(self, Y): def validate_output(self, Y):
assert Y.shape[1:] == self.output_shape, (self, Y.shape[1:], self.output_shape) assert Y.shape[1:] == self.output_shape, (str(self), Y.shape[1:], self.output_shape)
def forward(self, lut): def forward(self, lut):
assert len(self.parents) > 0, self assert len(self.parents) > 0, self
@@ -215,7 +215,7 @@ class Input(Layer):
return X return X
def dF(self, dY): def dF(self, dY):
#self.delta = dY #self.dY = dY
return np.zeros_like(dY) return np.zeros_like(dY)
class Affine(Layer): class Affine(Layer):
@@ -238,6 +238,17 @@ class Relu(Layer):
def dF(self, dY): def dF(self, dY):
return np.where(self.cond, dY, 0) return np.where(self.cond, dY, 0)
class GeluApprox(Layer):
    """Sigmoid-based approximation of the GELU activation.

    Computes F(X) = X * sigmoid(1.704 * X); the derivative reuses the
    cached sigmoid from the forward pass.
    interactive plot: https://www.desmos.com/calculator/ydzgtccsld
    """

    def F(self, X):
        from scipy.special import expit
        # cache the scaled input and its sigmoid; dF needs both
        self.a = 1.704 * X
        self.sig = expit(self.a)
        return X * self.sig

    def dF(self, dY):
        sig = self.sig
        # d/dX [X * sigmoid(aX)] expressed via the cached sigmoid
        return dY * sig * (1 + self.a * (1 - sig))
class Dense(Layer): class Dense(Layer):
def __init__(self, dim): def __init__(self, dim):
super().__init__() super().__init__()
@@ -250,14 +261,12 @@ class Dense(Layer):
self.W = W self.W = W
self.dW = dW self.dW = dW
#self.coeffs = np.random.normal(0, s, size=self.size)
#self.biases = np.zeros((self.dim, 1), dtype=nf)
self.coeffs = self.W[:self.nW].reshape(outs, ins) self.coeffs = self.W[:self.nW].reshape(outs, ins)
self.biases = self.W[self.nW:].reshape(outs, 1) self.biases = self.W[self.nW:].reshape(outs, 1)
self.dcoeffs = self.dW[:self.nW].reshape(outs, ins) self.dcoeffs = self.dW[:self.nW].reshape(outs, ins)
self.dbiases = self.dW[self.nW:].reshape(outs) self.dbiases = self.dW[self.nW:].reshape(outs)
# he_normal # he_normal initialization
s = np.sqrt(2 / ins) s = np.sqrt(2 / ins)
self.coeffs.flat = np.random.normal(0, s, size=self.nW) self.coeffs.flat = np.random.normal(0, s, size=self.nW)
self.biases.flat = 0 self.biases.flat = 0
@@ -278,9 +287,6 @@ class Dense(Layer):
return Y return Y
def dF(self, dY): def dF(self, dY):
# http://cs231n.github.io/optimization-2/#gradients-for-vectorized-operations
# note: because we only call df once (we only have a df/dy method),
# we have to do df/dw stuff here too.
dX = self.coeffs.T.dot(dY) dX = self.coeffs.T.dot(dY)
self.dcoeffs[:] = dY.dot(self.X.T) self.dcoeffs[:] = dY.dot(self.X.T)
self.dbiases[:] = np.sum(dY, axis=1) self.dbiases[:] = np.sum(dY, axis=1)
@@ -296,8 +302,8 @@ class Model:
self.y = y self.y = y
self.ordered_nodes = self.traverse([], self.y) self.ordered_nodes = self.traverse([], self.y)
print([str(node) for node in self.ordered_nodes]) node_names = ' '.join([str(node) for node in self.ordered_nodes])
#print(len(self.ordered_nodes)) print('{} nodes: {}'.format(len(self.ordered_nodes), node_names))
self.make_weights() self.make_weights()
@@ -336,7 +342,6 @@ class Model:
lut = dict() lut = dict()
input_node = self.ordered_nodes[0] input_node = self.ordered_nodes[0]
output_node = self.ordered_nodes[-1] output_node = self.ordered_nodes[-1]
#lut[input_node] = input_node.F(X)
lut[input_node] = input_node.multi(np.expand_dims(X, 0)) lut[input_node] = input_node.multi(np.expand_dims(X, 0))
for node in self.ordered_nodes[1:]: for node in self.ordered_nodes[1:]:
lut[node] = node.forward(lut) lut[node] = node.forward(lut)
@@ -346,13 +351,10 @@ class Model:
lut = dict() lut = dict()
input_node = self.ordered_nodes[0] input_node = self.ordered_nodes[0]
output_node = self.ordered_nodes[-1] output_node = self.ordered_nodes[-1]
#lut[output_node] = output_node.dF(error)
lut[output_node] = output_node.dmulti(np.expand_dims(error, 0)) lut[output_node] = output_node.dmulti(np.expand_dims(error, 0))
#for node in self.ordered_nodes[-2:0:-1]:
for node in reversed(self.ordered_nodes[:-1]): for node in reversed(self.ordered_nodes[:-1]):
lut[node] = node.backward(lut) lut[node] = node.backward(lut)
#return lut[input_node] # meaningless value #return lut[input_node] # meaningless value
return self.dW return self.dW
def load_model(self, fn): def load_model(self, fn):
@@ -389,7 +391,7 @@ if __name__ == '__main__':
res_depth = 3, res_depth = 3,
res_block = 2, # normally 2 res_block = 2, # normally 2
res_multi = 4, # normally 1 res_multi = 4, # normally 1
activation = 'relu', activation = 'gelu',
optim = 'adam', optim = 'adam',
nesterov = False, # only used with SGD or Adam nesterov = False, # only used with SGD or Adam
@@ -427,6 +429,9 @@ if __name__ == '__main__':
y = x y = x
last_size = input_samples last_size = input_samples
activations = dict(relu=Relu, gelu=GeluApprox)
activation = activations[config.activation]
for blah in range(config.res_depth): for blah in range(config.res_depth):
size = config.res_width size = config.res_width
@@ -437,12 +442,12 @@ if __name__ == '__main__':
skip = y skip = y
merger = Sum() merger = Sum()
skip.feed(merger) skip.feed(merger)
z_start = skip.feed(Relu()) z_start = skip.feed(activation())
for i in range(config.res_multi): for i in range(config.res_multi):
z = z_start z = z_start
for i in range(config.res_block): for i in range(config.res_block):
if i > 0: if i > 0:
z = z.feed(Relu()) z = z.feed(activation())
z = z.feed(Dense(size)) z = z.feed(Dense(size))
z.feed(merger) z.feed(merger)
y = merger y = merger