commit fe577eb7f4 (parent 81f6e72da0)

1 changed file with 96 additions and 30 deletions:

optim_nn.py (126 lines changed)
@@ -48,6 +48,67 @@ class SomethingElse(ResidualLoss):
 
 # Parametric Layers {{{1
 
+class LayerNorm(Layer):
+    # paper: https://arxiv.org/abs/1607.06450
+    # note: nonparametric when affine == False
+
+    def __init__(self, eps=1e-5, affine=True):
+        super().__init__()
+        self.eps = _f(eps)
+        self.affine = bool(affine)
+        self.size = None
+
+    def make_shape(self, shape):
+        super().make_shape(shape)
+        if len(shape) != 1:
+            return False
+        self.features = shape[0]
+        if self.affine:
+            self.size = 2 * self.features
+        return shape
+
+    def init(self, W, dW):
+        # TODO: move this little bit into super(), also assert against self.size
+        self.W = W
+        self.dW = dW
+
+        f = self.features
+
+        self.gamma, self.dgamma = self.W[0*f:1*f], self.dW[0*f:1*f]
+        self.beta, self.dbeta = self.W[1*f:2*f], self.dW[1*f:2*f]
+
+        self.gamma[:] = 1
+        self.beta[:] = 0
+
+    def F(self, X):
+        self.mean = X.mean(0)
+        self.center = X - self.mean
+        self.var = self.center.var(0) + self.eps
+        self.std = np.sqrt(self.var)
+
+        self.Xnorm = self.center / self.std
+        if self.affine:
+            return self.gamma * self.Xnorm + self.beta
+        return self.Xnorm
+
+    def dF(self, dY):
+        length = dY.shape[0]
+
+        if self.affine:
+            # Y = gamma * Xnorm + beta
+            dXnorm = dY * self.gamma
+            self.dgamma[:] = (dY * self.Xnorm).sum(0)
+            self.dbeta[:] = dY.sum(0)
+        else:
+            dXnorm = dY
+
+        dstd = (dXnorm * self.center).sum(0) / -self.var
+        dcenter = dXnorm / self.std + dstd / self.std * self.center / length
+        dmean = -dcenter.sum(0)
+        dX = dcenter + dmean / length
+
+        return dX
+
 class DenseOneLess(Dense):
     def init(self, W, dW):
         super().init(W, dW)
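Reviewer note: the new backward pass is easy to sanity-check with a central-difference gradient check. Below is a minimal standalone sketch of the non-affine path — it re-implements F/dF above in plain NumPy; `Layer`, `_f`, and the rest of the framework are omitted, and all names here are illustrative, not from this file.

import numpy as np

def ln_forward(X, eps=1e-5):
    # mirrors F() above with affine == False
    center = X - X.mean(0)
    return center / np.sqrt(center.var(0) + eps)

def ln_backward(X, dY, eps=1e-5):
    # mirrors dF() above with affine == False
    length = dY.shape[0]
    center = X - X.mean(0)
    var = center.var(0) + eps
    std = np.sqrt(var)
    dstd = (dY * center).sum(0) / -var
    dcenter = dY / std + dstd / std * center / length
    return dcenter - dcenter.sum(0) / length

X = np.random.randn(8, 5)
dY = np.random.randn(8, 5)
numeric = np.zeros_like(X)
h = 1e-6
for i in np.ndindex(*X.shape):
    Xp, Xm = X.copy(), X.copy()
    Xp[i] += h
    Xm[i] -= h
    # numerical gradient of sum(Y * dY) with respect to X[i]
    numeric[i] = ((ln_forward(Xp) - ln_forward(Xm)) * dY).sum() / (2 * h)
print(np.allclose(ln_backward(X, dY), numeric, atol=1e-5))  # expect True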
@@ -67,35 +128,6 @@ class DenseOneLess(Dense):
         np.fill_diagonal(self.dcoeffs, 0)
         return dX
 
-class LayerNorm(Layer):
-    # paper: https://arxiv.org/abs/1607.06450
-    # my implementation may be incorrect.
-
-    def __init__(self, eps=1e-3, axis=-1):
-        super().__init__()
-        self.eps = _f(eps)
-        self.axis = int(axis)
-
-    def F(self, X):
-        self.center = X - np.mean(X, axis=self.axis, keepdims=True)
-        #self.var = np.var(X, axis=self.axis, keepdims=True) + self.eps
-        self.var = np.mean(np.square(self.center), axis=self.axis, keepdims=True) + self.eps
-        self.std = np.sqrt(self.var) + self.eps
-        Y = self.center / self.std
-        return Y
-
-    def dF(self, dY):
-        length = self.input_shape[self.axis]
-
-        dstd = dY * (-self.center / self.var)
-        dvar = dstd * (0.5 / self.std)
-        dcenter2 = dvar * (1 / length)
-        dcenter = dY * (1 / self.std)
-        dcenter += dcenter2 * (2 * self.center)
-        dX = dcenter - dcenter / length
-
-        return dX
-
 # Rituals {{{1
 
 def stochastic_multiply(W, gamma=0.5, allow_negation=True):
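For reference, the gradient the replacement computes: with N the batch length, c_i = x_i - mu, v = var(c) + eps, s = sqrt(v), the standard derivation gives

\frac{\partial L}{\partial s} = -\frac{1}{v}\sum_j \frac{\partial L}{\partial y_j}\, c_j, \qquad
\frac{\partial L}{\partial c_i} = \frac{1}{s}\,\frac{\partial L}{\partial y_i} + \frac{\partial L}{\partial s}\,\frac{c_i}{s N}, \qquad
\frac{\partial L}{\partial x_k} = \frac{\partial L}{\partial c_k} - \frac{1}{N}\sum_j \frac{\partial L}{\partial c_j}

which matches dstd, dcenter, and dX in the new dF. The removed version's final step, dX = dcenter - dcenter / length, subtracts each element's own dcenter rather than the batch mean of dcenter; note also that it normalized along axis=-1 where the replacement normalizes along axis 0.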
@@ -205,6 +237,28 @@ class DumbLearner(AnnealingLearner):
             self.restart_callback(restart)
         return True
 
+# Components {{{1
+
+def _mr_make_norm(norm):
+    def _mr_norm(y, width, depth, block, multi, activation, style, FC, d):
+        skip = y
+        merger = Sum()
+        skip.feed(merger)
+        z_start = skip
+        z_start = z_start.feed(norm())
+        z_start = z_start.feed(activation())
+        for _ in range(multi):
+            z = z_start
+            for j in range(block):
+                if j > 0:
+                    z = z.feed(norm())
+                    z = z.feed(activation())
+                z = z.feed(FC())
+            z.feed(merger)
+        y = merger
+        return y
+    return _mr_norm
+
 def _mr_batchless(y, width, depth, block, multi, activation, style, FC, d):
     skip = y
     merger = Sum()
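The factory above builds the usual pre-activation residual wiring: an identity path plus `multi` parallel branches of `block` dense layers each, all collected by one Sum() merger. Straight-lined for multi=1, block=2, the closure wires the following (a sketch; it relies on feed() returning the downstream node, as the chained assignments above do):

skip = y
merger = Sum()
skip.feed(merger)                                    # identity path into the sum
z = skip.feed(norm()).feed(activation()).feed(FC())  # j = 0 (normed/activated once, up front)
z = z.feed(norm()).feed(activation()).feed(FC())     # j = 1
z.feed(merger)                                       # residual path into the sum
y = merger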
@@ -242,9 +296,11 @@ def _mr_onelesssum(y, width, depth, block, multi, activation, style, FC, d):
         y = merger
     else:
         y = z
+    #y = y.feed(LayerNorm())
     return y
 
 _mr_styles = dict(
+    lnorm=_mr_make_norm(LayerNorm),
     batchless=_mr_batchless,
     onelesssum=_mr_onelesssum,
 )
@@ -507,8 +563,9 @@ def run(program, args=None):
         mloss = 'mse',
         ritual = 'default',
         restart_optim = False, # restarts also reset internal state of optimizer
+        warmup = True,
 
-        problem = 3,
+        problem = 2,
         compare = (
             # best results for ~10,000 parameters
             # training/validation pairs for each problem (starting from problem 0):
@@ -583,6 +640,15 @@ def run(program, args=None):
     measure_error()
 
     ritual.prepare(model)
+
+    if training and config.warmup:
+        log("warming", "up")
+        ritual.train_batched(
+            np.random.normal(0, 1, size=inputs.shape),
+            np.random.normal(0, 1, size=outputs.shape),
+            config.batch_size)
+        measure_error()
 
     while training and learner.next():
         indices = np.arange(inputs.shape[0])
         np.random.shuffle(indices)
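Reviewer note on the warmup path: training a few batches on unit-Gaussian inputs and targets shaped like the real data presumably primes the optimizer's internal state before fitting begins; measure_error() is then re-run so the logged baseline reflects the warmed-up weights. The same idea outside this framework, as a sketch (model, train_step, and the shape arguments are stand-ins, not names from this file):

import numpy as np

def warm_up(model, train_step, in_shape, out_shape, steps=1):
    # feed pure noise with the same shapes as the real data
    for _ in range(steps):
        x = np.random.normal(0, 1, size=in_shape)
        t = np.random.normal(0, 1, size=out_shape)
        train_step(model, x, t)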