This commit is contained in:
parent 3cd65a749c
commit 8baa7a267a

1 changed file with 23 additions and 18 deletions

optim_nn.py (41 lines changed: +23, -18)
@@ -160,10 +160,10 @@ class Layer:
         return child
 
     def validate_input(self, X):
-        assert X.shape[1:] == self.input_shape, (self, X.shape[1:], self.input_shape)
+        assert X.shape[1:] == self.input_shape, (str(self), X.shape[1:], self.input_shape)
 
     def validate_output(self, Y):
-        assert Y.shape[1:] == self.output_shape, (self, Y.shape[1:], self.output_shape)
+        assert Y.shape[1:] == self.output_shape, (str(self), Y.shape[1:], self.output_shape)
 
     def forward(self, lut):
         assert len(self.parents) > 0, self
@@ -215,7 +215,7 @@ class Input(Layer):
         return X
 
     def dF(self, dY):
-        #self.delta = dY
+        #self.dY = dY
         return np.zeros_like(dY)
 
 class Affine(Layer):
@@ -238,6 +238,17 @@ class Relu(Layer):
     def dF(self, dY):
         return np.where(self.cond, dY, 0)
 
+class GeluApprox(Layer):
+    # refer to https://www.desmos.com/calculator/ydzgtccsld
+    def F(self, X):
+        from scipy.special import expit as sigmoid
+        self.a = 1.704 * X
+        self.sig = sigmoid(self.a)
+        return X * self.sig
+
+    def dF(self, dY):
+        return dY * self.sig * (1 + self.a * (1 - self.sig))
+
 class Dense(Layer):
     def __init__(self, dim):
         super().__init__()
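Note (commentary, not part of the diff): the derivative in GeluApprox.dF follows from F(x) = x * sigmoid(1.704 * x). With a = 1.704 * x and s = sigmoid(a), F'(x) = s + x * 1.704 * s * (1 - s) = s * (1 + a * (1 - s)), which is exactly the factor applied to dY. A minimal numerical check, assuming only numpy and scipy:

    import numpy as np
    from scipy.special import expit as sigmoid

    x = np.linspace(-3.0, 3.0, 7)
    a = 1.704 * x
    s = sigmoid(a)
    analytic = s * (1 + a * (1 - s))           # same expression as dF (with dY = 1)

    f = lambda v: v * sigmoid(1.704 * v)       # F(x) from GeluApprox
    eps = 1e-6
    numeric = (f(x + eps) - f(x - eps)) / (2 * eps)

    print(np.max(np.abs(analytic - numeric)))  # tiny, on the order of 1e-9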
@@ -250,14 +261,12 @@ class Dense(Layer):
 
         self.W = W
         self.dW = dW
-        #self.coeffs = np.random.normal(0, s, size=self.size)
-        #self.biases = np.zeros((self.dim, 1), dtype=nf)
         self.coeffs = self.W[:self.nW].reshape(outs, ins)
         self.biases = self.W[self.nW:].reshape(outs, 1)
         self.dcoeffs = self.dW[:self.nW].reshape(outs, ins)
         self.dbiases = self.dW[self.nW:].reshape(outs)
 
-        # he_normal
+        # he_normal initialization
         s = np.sqrt(2 / ins)
         self.coeffs.flat = np.random.normal(0, s, size=self.nW)
         self.biases.flat = 0
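Note (commentary, not part of the diff): the `# he_normal initialization` block draws weights with standard deviation sqrt(2 / fan_in), the He/Kaiming-normal scheme that keeps activation variance roughly constant through ReLU-like layers. A standalone sketch of the same idea, with made-up dimensions:

    import numpy as np

    ins, outs = 128, 256                 # hypothetical fan-in / fan-out
    s = np.sqrt(2 / ins)                 # He-normal standard deviation
    coeffs = np.random.normal(0, s, size=(outs, ins))
    biases = np.zeros((outs, 1))

    print(coeffs.std(), s)               # empirical std should be close to s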
@@ -278,9 +287,6 @@ class Dense(Layer):
         return Y
 
     def dF(self, dY):
-        # http://cs231n.github.io/optimization-2/#gradients-for-vectorized-operations
-        # note: because we only call df once (we only have a df/dy method),
-        # we have to do df/dw stuff here too.
         dX = self.coeffs.T.dot(dY)
         self.dcoeffs[:] = dY.dot(self.X.T)
         self.dbiases[:] = np.sum(dY, axis=1)
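Note (commentary, not part of the diff): the gradients in Dense.dF are the usual ones for Y = coeffs @ X + biases, with coeffs of shape (outs, ins) and a column-per-sample X of shape (ins, batch): dX = coeffs.T @ dY, dcoeffs = dY @ X.T, and dbiases sums dY over the batch axis. A quick shape check with arbitrary sizes:

    import numpy as np

    ins, outs, batch = 4, 3, 5           # hypothetical sizes
    coeffs = np.random.randn(outs, ins)
    X = np.random.randn(ins, batch)
    dY = np.random.randn(outs, batch)    # upstream gradient dL/dY

    dX = coeffs.T.dot(dY)                # (ins, batch)
    dcoeffs = dY.dot(X.T)                # (outs, ins), same shape as coeffs
    dbiases = np.sum(dY, axis=1)         # (outs,)

    print(dX.shape, dcoeffs.shape, dbiases.shape)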
@@ -296,8 +302,8 @@ class Model:
         self.y = y
 
         self.ordered_nodes = self.traverse([], self.y)
-        print([str(node) for node in self.ordered_nodes])
-        #print(len(self.ordered_nodes))
+        node_names = ' '.join([str(node) for node in self.ordered_nodes])
+        print('{} nodes: {}'.format(len(self.ordered_nodes), node_names))
 
         self.make_weights()
 
@@ -336,7 +342,6 @@ class Model:
         lut = dict()
         input_node = self.ordered_nodes[0]
         output_node = self.ordered_nodes[-1]
-        #lut[input_node] = input_node.F(X)
         lut[input_node] = input_node.multi(np.expand_dims(X, 0))
         for node in self.ordered_nodes[1:]:
             lut[node] = node.forward(lut)
@@ -346,13 +351,10 @@ class Model:
         lut = dict()
         input_node = self.ordered_nodes[0]
         output_node = self.ordered_nodes[-1]
-        #lut[output_node] = output_node.dF(error)
         lut[output_node] = output_node.dmulti(np.expand_dims(error, 0))
-        #for node in self.ordered_nodes[-2:0:-1]:
         for node in reversed(self.ordered_nodes[:-1]):
             lut[node] = node.backward(lut)
         #return lut[input_node] # meaningless value
 
         return self.dW
 
     def load_model(self, fn):
@@ -389,7 +391,7 @@ if __name__ == '__main__':
         res_depth = 3,
         res_block = 2, # normally 2
         res_multi = 4, # normally 1
-        activation = 'relu',
+        activation = 'gelu',
 
         optim = 'adam',
         nesterov = False, # only used with SGD or Adam
@@ -427,6 +429,9 @@ if __name__ == '__main__':
     y = x
     last_size = input_samples
 
+    activations = dict(relu=Relu, gelu=GeluApprox)
+    activation = activations[config.activation]
+
     for blah in range(config.res_depth):
         size = config.res_width
 
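Note (commentary, not part of the diff): the two added lines form a small name-to-class registry, so the `activation = 'gelu'` config string selects the layer class once, and each later `activation()` call in the network-building loop constructs a fresh instance. The same pattern in isolation, with stand-in classes:

    # stand-in classes; in the real script these are Relu and GeluApprox
    class Relu: pass
    class GeluApprox: pass

    activations = dict(relu=Relu, gelu=GeluApprox)
    activation = activations['gelu']     # e.g. config.activation == 'gelu'
    layer = activation()                 # a new GeluApprox instance per call
    print(type(layer).__name__)          # GeluApprox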
@@ -437,12 +442,12 @@ if __name__ == '__main__':
         skip = y
         merger = Sum()
         skip.feed(merger)
-        z_start = skip.feed(Relu())
+        z_start = skip.feed(activation())
         for i in range(config.res_multi):
             z = z_start
             for i in range(config.res_block):
                 if i > 0:
-                    z = z.feed(Relu())
+                    z = z.feed(activation())
                 z = z.feed(Dense(size))
             z.feed(merger)
         y = merger