allow weight sharing; disableable gradient clearing
parent 89fcd25962
commit 4d2251f69f
2 changed files with 26 additions and 14 deletions
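What the change does: every layer's backward() now accumulates into its gradient buffers with += instead of overwriting them via [:] =, so a weight object reused by several layers (or used several times in a graph) ends up holding the sum of all contributions. Because gradients are no longer reset implicitly, the commit adds Layer.clear_grad() and Model.clear_grad(), and Ritual.train_batched() gains a clear_grad=True flag that zeroes them before each batch. The following is a minimal sketch, using toy stand-in classes rather than the library's real ones, of why accumulation is what makes sharing work.

# Toy sketch: W and TinyDense are illustrative stand-ins, not the library's API.
import numpy as np

class W:
    def __init__(self, shape):
        self.f = np.random.randn(*shape)  # weight values
        self.g = np.zeros(shape)          # gradient buffer

class TinyDense:
    def __init__(self, w):
        self.w = w                        # possibly shared with another layer
    def forward(self, X):
        self.X = X
        return X.dot(self.w.f)
    def backward(self, dY):
        self.w.g += self.X.T.dot(dY)      # accumulate, as in this commit
        return dY.dot(self.w.f.T)

shared = W((3, 3))
a, b = TinyDense(shared), TinyDense(shared)    # two layers, one weight object

X = np.random.randn(5, 3)
Y = b.forward(a.forward(X))                    # X -> a -> b
shared.g[:] = 0                                # what Model.clear_grad() now does up front
a.backward(b.backward(np.ones_like(Y)))        # each backward pass adds into shared.g
# shared.g now holds the sum of both layers' contributions; with "[:] =" the
# second backward() would have silently overwritten the first.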
onn.py (23 changes)

@@ -541,9 +541,9 @@ class Conv1Dper(Layer):
         return convolved
 
     def backward(self, dY):
-        self.coeffs.g[:] = (dY[:,:,None] * self.cols).sum(0)[:,::-1].sum(0, keepdims=True)
+        self.coeffs.g += (dY[:,:,None] * self.cols).sum(0)[:,::-1].sum(0, keepdims=True)
         if self.bias:
-            self.biases.g[:] = dY.sum(0, keepdims=True)
+            self.biases.g += dY.sum(0, keepdims=True)
         return (dY[:,:,None] * self.coeffs.f[:,::-1]).sum(2)
 
 class LayerNorm(Layer):
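A quick note on the two forms swapped throughout this diff: both mutate the existing gradient array in place (so an optimizer holding a reference to it still sees the result); the difference is only overwrite versus accumulate, as this small NumPy snippet shows.

import numpy as np

g = np.zeros(3)
g[:] = np.ones(3)   # overwrite in place: g == [1., 1., 1.] regardless of prior contents
g += np.ones(3)     # accumulate in place: g == [2., 2., 2.]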
@@ -588,8 +588,8 @@ class LayerNorm(Layer):
 
         if self.affine:
             dXnorm = dY * self.gamma.f
-            self.gamma.g[:] = (dY * self.Xnorm).sum(0)
-            self.beta.g[:] = dY.sum(0)
+            self.gamma.g += (dY * self.Xnorm).sum(0)
+            self.beta.g += dY.sum(0)
         else:
             dXnorm = dY
 
@@ -644,12 +644,12 @@ class Denses(Layer): # TODO: rename?
         return np.einsum('ijx,jxk->ijk', X, self.coeffs.f) + self.biases.f
 
     def backward(self, dY):
-        self.biases.g[:] = dY.sum(0, keepdims=True)
+        self.biases.g += dY.sum(0, keepdims=True)
         if self.axis == 0:
-            self.coeffs.g[:] = np.einsum('ixj,ikj->xjk', self.X, dY)
+            self.coeffs.g += np.einsum('ixj,ikj->xjk', self.X, dY)
             return np.einsum('ikj,xjk->ixj', dY, self.coeffs.f)
         elif self.axis == 1:
-            self.coeffs.g[:] = np.einsum('ijx,ijk->jxk', self.X, dY)
+            self.coeffs.g += np.einsum('ijx,ijk->jxk', self.X, dY)
             return np.einsum('ijk,jxk->ijx', dY, self.coeffs.f)
 
 class DenseOneLess(Dense):
@@ -664,8 +664,9 @@ class DenseOneLess(Dense):
         return X.dot(self.coeffs.f) + self.biases
 
     def backward(self, dY):
-        self.coeffs.g[:] = self.X.T.dot(dY)
-        self.biases.g[:] = dY.sum(0, keepdims=True)
+        self.coeffs.g += self.X.T.dot(dY)
+        self.biases.g += dY.sum(0, keepdims=True)
+        # FIXME: might not be desireable if weights are being shared.
         np.fill_diagonal(self.coeffs.g, 0)
         return dY.dot(self.coeffs.f.T)
 
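The FIXME added above concerns sharing: fill_diagonal() zeroes the whole accumulated diagonal, not just this layer's own contribution, so whatever another layer sharing the same coefficients had already added there is lost. A small NumPy illustration with made-up values:

import numpy as np

g = np.zeros((3, 3))
g += np.ones((3, 3))     # contribution accumulated earlier by a layer sharing the weights
g += 2.0 * np.eye(3)     # this layer's own contribution
np.fill_diagonal(g, 0)   # intended to drop only the second term's diagonal,
                         # but it also wipes the sharing layer's diagonal entries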
@@ -693,9 +694,9 @@ class CosineDense(Dense):
         dX_norm = -(dY * self.dot / self.W_norm).sum(-1, keepdims=True) / self.X_norm**2
         dW_norm = -(dY * self.dot / self.X_norm).sum( 0, keepdims=True) / self.W_norm**2
 
-        self.coeffs.g[:] = self.X.T.dot(ddot) \
+        self.coeffs.g += self.X.T.dot(ddot) \
           + dW_norm / self.W_norm * self.coeffs.f
-        self.biases.g[:] = ddot.sum(0, keepdims=True) \
+        self.biases.g += ddot.sum(0, keepdims=True) \
           + dW_norm / self.W_norm * self.biases.f
         dX = ddot.dot(self.coeffs.f.T) + dX_norm / self.X_norm * self.X
 
onn_core.py (17 changes)

@@ -551,6 +551,10 @@ class Layer:
         self.weights[name] = w
         return w
 
+    def clear_grad(self):
+        for name, w in self.weights.items():
+            w.g[:] = 0
+
     @property
     def size(self):
         return sum((w.size for w in self.weights.values()))
@@ -834,8 +838,8 @@ class Dense(Layer):
         return X.dot(self.coeffs.f) + self.biases.f
 
     def backward(self, dY):
-        self.coeffs.g[:] = self.X.T.dot(dY)
-        self.biases.g[:] = dY.sum(0, keepdims=True)
+        self.coeffs.g += self.X.T.dot(dY)
+        self.biases.g += dY.sum(0, keepdims=True)
         return dY.dot(self.coeffs.f.T)
 
 # Models {{{1
@@ -902,6 +906,10 @@ class Model:
             values[node] = node.backpropagate(values)
         return self.dW
 
+    def clear_grad(self):
+        for node in self.nodes:
+            node.clear_grad()
+
     def regulate_forward(self):
         loss = _0
         for node in self.nodes:
@@ -1034,7 +1042,8 @@ class Ritual: # i'm just making up names at this point.
 
     def train_batched(self, inputs_or_generator, outputs_or_batch_count,
                       batch_size=None,
-                      return_losses=False, test_only=False, shuffle=True):
+                      return_losses=False, test_only=False, shuffle=True,
+                      clear_grad=True):
         assert isinstance(return_losses, bool) or return_losses == 'both'
 
         gen = isinstance(inputs_or_generator, types.GeneratorType)
@@ -1082,6 +1091,8 @@ class Ritual: # i'm just making up names at this point.
             batch_inputs  = inputs[ bi:bi+batch_size]
             batch_outputs = outputs[bi:bi+batch_size]
 
+            if clear_grad:
+                self.model.clear_grad()
             self._train_batch(batch_inputs, batch_outputs, b, batch_count,
                               test_only, return_losses=='both', return_losses)
 
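To round this off, a hedged usage sketch of the new flag; the ritual, model, inputs and outputs objects are assumed to exist already, and batch_size=32 is an arbitrary example value. By default train_batched() zeroes every gradient before each batch; with clear_grad=False the caller takes over and decides when to call model.clear_grad() in their own loop.

# default: gradients are cleared before every batch
ritual.train_batched(inputs, outputs, batch_size=32)

# opting out: the caller clears explicitly at a point of their choosing
model.clear_grad()
ritual.train_batched(inputs, outputs, batch_size=32, clear_grad=False)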