This commit is contained in:
parent e28eb0cf06
commit 5b2a293b8c

1 changed file with 37 additions and 30 deletions: optim_nn.py (67 lines changed)
@@ -18,6 +18,13 @@ class Loss:
         d = self.df(r)
         return d / len(d)

+class Squared(Loss):
+    def f(self, r):
+        return np.square(r)
+
+    def df(self, r):
+        return 2 * r
+
 class SquaredHalved(Loss):
     def f(self, r):
         return np.square(r) / 2
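The two loss subclasses above differ only by the factor of 1/2. A quick standalone sketch (not part of the diff), assuming Loss.mean/dmean average f/df over the batch as the "return d / len(d)" context suggests, and that SquaredHalved.df simply returns r:

    import numpy as np

    r = np.array([0.5, -1.0, 2.0])   # residuals, i.e. predicted - target
    print(np.square(r))              # Squared.f: elementwise r**2
    print(2 * r)                     # Squared.df: 2*r
    print(np.square(r) / 2)          # SquaredHalved.f: r**2 / 2
    print(r)                         # SquaredHalved.df is presumably r, since d(r**2/2)/dr = r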
@@ -144,18 +151,18 @@ class Layer:

     def multi(self, B):
         assert len(B) == 1, self
-        return self.F(B[0].T).T
+        return self.F(B[0])

     def dmulti(self, dB):
         if len(dB) == 1:
-            return self.dF(dB[0].T).T
+            return self.dF(dB[0])
         else:
             dX = None
             for dY in dB:
                 if dX is None:
-                    dX = self.dF(dY.T).T
+                    dX = self.dF(dY)
                 else:
-                    dX += self.dF(dY.T).T
+                    dX += self.dF(dY)
             return dX

     # general utility methods:
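With the .T pairs removed, multi/dmulti hand activations to F/dF in their natural orientation instead of transposing on the way in and out. A small shape sketch (not part of the diff), assuming each entry of B is a (batch, features) array:

    import numpy as np

    B = [np.zeros((32, 128))]          # one upstream output: batch of 32, 128 features
    # old: self.F(B[0].T).T  -> F received (128, 32), result transposed back
    # new: self.F(B[0])      -> F receives (32, 128) directly
    print(B[0].shape, B[0].T.shape)    # (32, 128) (128, 32)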
@@ -300,10 +307,10 @@ class Dense(Layer):

         self.W = W
         self.dW = dW
-        self.coeffs = self.W[:self.nW].reshape(outs, ins)
-        self.biases = self.W[self.nW:].reshape(outs, 1)
-        self.dcoeffs = self.dW[:self.nW].reshape(outs, ins)
-        self.dbiases = self.dW[self.nW:].reshape(outs)
+        self.coeffs = self.W[:self.nW].reshape(ins, outs)
+        self.biases = self.W[self.nW:].reshape(1, outs)
+        self.dcoeffs = self.dW[:self.nW].reshape(ins, outs)
+        self.dbiases = self.dW[self.nW:].reshape(1, outs)

         # he_normal initialization
         s = np.sqrt(2 / ins)
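The parameter views are flipped to match that row-major convention. A sketch of the resulting layout (not part of the diff; nW = ins * outs is implied by the reshape):

    import numpy as np

    ins, outs = 128, 64
    nW = ins * outs
    W = np.zeros(nW + outs)
    coeffs = W[:nW].reshape(ins, outs)   # was (outs, ins)
    biases = W[nW:].reshape(1, outs)     # was (outs, 1); (1, outs) broadcasts over batch rows
    print(coeffs.shape, biases.shape)    # (128, 64) (1, 64)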
@@ -321,14 +328,14 @@ class Dense(Layer):

     def F(self, X):
         self.X = X
-        Y = self.coeffs.dot(X) \
+        Y = X.dot(self.coeffs) \
           + self.biases
         return Y

     def dF(self, dY):
-        dX = self.coeffs.T.dot(dY)
-        self.dcoeffs[:] = dY.dot(self.X.T)
-        self.dbiases[:] = np.sum(dY, axis=1)
+        dX = dY.dot(self.coeffs.T)
+        self.dcoeffs[:] = self.X.T.dot(dY)
+        self.dbiases[:] = np.sum(dY, axis=0, keepdims=True)
         return dX

 # Model
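A standalone shape check of the rewritten forward/backward pass (sketch only, not part of the diff; the names mirror the diff, the random data is hypothetical):

    import numpy as np

    batch, ins, outs = 32, 128, 64
    X = np.random.randn(batch, ins)
    coeffs = np.random.randn(ins, outs)
    biases = np.zeros((1, outs))

    Y = X.dot(coeffs) + biases                     # forward: (batch, outs)
    dY = np.ones_like(Y)                           # upstream gradient
    dX = dY.dot(coeffs.T)                          # (batch, ins), same shape as X
    dcoeffs = X.T.dot(dY)                          # (ins, outs), same shape as coeffs
    dbiases = np.sum(dY, axis=0, keepdims=True)    # (1, outs), same shape as biases
    print(Y.shape, dX.shape, dcoeffs.shape, dbiases.shape)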
@@ -411,8 +418,8 @@ class Model:
         for i in range(len(denses)):
             a, b = i, i + 1
             b_name = "dense_{}".format(b)
-            denses[a].coeffs = weights[b_name+'_W'].T
-            denses[a].biases = np.expand_dims(weights[b_name+'_b'], -1)
+            denses[a].coeffs = weights[b_name+'_W']
+            denses[a].biases = np.expand_dims(weights[b_name+'_b'], 0)

     def save_weights(self, fn, overwrite=False):
         raise NotImplementedError("unimplemented", self)
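The load_weights change follows the same layout flip: the saved dense_N_W kernels (Keras-style naming; presumably stored as (ins, outs)) are used without a transpose, and the bias vector is expanded along axis 0 instead of -1. Sketch with hypothetical shapes (not part of the diff):

    import numpy as np

    w = np.zeros((128, 64))          # saved kernel, assumed (ins, outs)
    b = np.zeros(64)                 # saved bias vector
    coeffs = w                       # no .T needed under the new layout
    biases = np.expand_dims(b, 0)    # (1, 64) rather than the old (64, 1)
    print(coeffs.shape, biases.shape)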
@@ -433,7 +440,7 @@ if __name__ == '__main__':
         # style of resnet
         # only one is implemented so far
         parallel_style = 'batchless',
-        activation = 'gelu',
+        activation = 'relu',

         optim = 'adam',
         nesterov = False, # only used with SGD or Adam
@@ -465,8 +472,7 @@ if __name__ == '__main__':
         return inputs, outputs

     inputs, outputs = read_data("ml/cie_mlp_data.npz")
-    valid_data = read_data("ml/cie_mlp_vdata.npz")
-    valid_inputs, valid_outputs = valid_data
+    valid_inputs, valid_outputs = read_data("ml/cie_mlp_vdata.npz")


     # Our Test Model
@@ -520,8 +526,12 @@ if __name__ == '__main__':
     else:
         raise Exception('unknown optimizer', config.optim)

-    assert config.loss == 'mse', 'unknown loss function'
-    loss = SquaredHalved()
+    if config.loss == 'mse':
+        loss = Squared()
+    elif config.loss == 'mshe': # mushy
+        loss = SquaredHalved()
+    else:
+        raise Exception('unknown objective', config.loss)

     LR = config.LR
     LRprod = 0.5**(1/config.LR_halve_every)
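Note that the old 'mse' path constructed SquaredHalved while the new one constructs Squared, so the reported loss for the same residuals doubles. A quick check (not part of the diff), assuming Loss.mean averages f over the batch:

    import numpy as np

    r = np.array([0.01, -0.02, 0.03])
    print(np.mean(np.square(r)))        # Squared mean: new 'mse' behaviour
    print(np.mean(np.square(r) / 2))    # SquaredHalved mean: old 'mse' behaviour, exactly half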
@@ -532,12 +542,14 @@ if __name__ == '__main__':
         predicted = model.forward(inputs / x_scale)
         residual = predicted - outputs / y_scale
         err = loss.mean(residual)
-        print("train loss: {:10.6f}".format(err))
+        print("train loss: {:11.7f}".format(err))
+        print("improvement: {:+7.2f}%".format((0.0007031 / err - 1) * 100))

         predicted = model.forward(valid_inputs / x_scale)
         residual = predicted - valid_outputs / y_scale
         err = loss.mean(residual)
-        print("valid loss: {:10.6f}".format(err))
+        print("valid loss: {:11.7f}".format(err))
+        print("improvement: {:+7.2f}%".format((0.0007159 / err - 1) * 100))

     for i in range(config.restarts + 1):
         measure_loss()
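The two new prints report progress against fixed reference values (0.0007031 for train, 0.0007159 for valid; presumably losses from an earlier baseline run). Worked example of the formatting (not part of the diff):

    err = 0.00070
    print("improvement: {:+7.2f}%".format((0.0007031 / err - 1) * 100))
    # 0.0007031 / 0.00070 - 1 ≈ 0.0044, so this prints "improvement:   +0.44%"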
@@ -554,6 +566,8 @@ if __name__ == '__main__':
         shuffled_inputs = inputs[indices] / x_scale
         shuffled_outputs = outputs[indices] / y_scale

+        optim.alpha = LR * LRprod**e
+
         cumsum_loss = 0
         for b in range(batch_count):
             bi = b * config.batch_size
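optim.alpha is now set once per epoch here instead of once per batch (the per-batch assignment is removed in the next hunk); LRprod = 0.5**(1/LR_halve_every), so the rate halves every LR_halve_every epochs. Worked example with hypothetical values (not part of the diff):

    LR = 0.01
    LR_halve_every = 10
    LRprod = 0.5 ** (1 / LR_halve_every)   # ≈ 0.9330
    for e in (0, 10, 20):
        print(e, LR * LRprod ** e)         # -> 0.01, ~0.005, ~0.0025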
@@ -561,16 +575,9 @@ if __name__ == '__main__':
             batch_outputs = shuffled_outputs[bi:bi+config.batch_size]

             predicted = model.forward(batch_inputs)
-            dW = model.backward(np.ones_like(predicted))
-
             residual = predicted - batch_outputs
-            # TODO: try something like this instead?
-            #err_dW = np.dot(loss.dmean(residual), np.expand_dims(dW, 0))
-            err_dW = loss.df(residual) * dW / len(residual)
-            err_dW = np.sum(err_dW, axis=0)
-            optim.alpha = LR * LRprod**e
-            optim.update(err_dW, model.W)
-
+            dW = model.backward(loss.dmean(residual))
+            optim.update(dW, model.W)

             # note: we don't actually need this for training, only monitoring.
             cumsum_loss += loss.mean(residual)
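The rewritten batch step collapses the manual bookkeeping into a single chain-rule step: the seed passed to model.backward() is d(mean loss)/d(predicted), i.e. loss.dmean(residual). A sketch of what that seed evaluates to for Squared (not part of the diff; shapes are hypothetical):

    import numpy as np

    residual = np.random.randn(16, 3)      # (batch, outs)
    seed = 2 * residual / len(residual)    # Squared.df(r) / len(r), i.e. loss.dmean(residual)
    print(seed.shape)                      # (16, 3): same shape as the predictions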