Connor Olding 2017-02-17 13:51:15 -08:00
parent 81f6e72da0
commit fe577eb7f4


@@ -48,6 +48,67 @@ class SomethingElse(ResidualLoss):
# Parametric Layers {{{1
class LayerNorm(Layer):
    # paper: https://arxiv.org/abs/1607.06450
    # note: nonparametric when affine == False
    def __init__(self, eps=1e-5, affine=True):
        super().__init__()
        self.eps = _f(eps)
        self.affine = bool(affine)
        self.size = None
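    # make_shape only accepts 1-D (features,) inputs; when affine, it asks
    # for 2 * features parameters (gamma and beta) via self.size.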
    def make_shape(self, shape):
        super().make_shape(shape)
        if len(shape) != 1:
            return False
        self.features = shape[0]
        if self.affine:
            self.size = 2 * self.features
        return shape
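    # gamma (scale) and beta (shift) are views into the shared flat parameter
    # and gradient buffers, initialized to the identity transform.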
    def init(self, W, dW):
        # TODO: move this little bit into super(), also assert against self.size
        self.W = W
        self.dW = dW
        f = self.features
        self.gamma, self.dgamma = self.W[0*f:1*f], self.dW[0*f:1*f]
        self.beta, self.dbeta = self.W[1*f:2*f], self.dW[1*f:2*f]
        self.gamma[:] = 1
        self.beta[:] = 0
    def F(self, X):
        self.mean = X.mean(0)
        self.center = X - self.mean
        self.var = self.center.var(0) + self.eps
        self.std = np.sqrt(self.var)
        self.Xnorm = self.center / self.std
        if self.affine:
            return self.gamma * self.Xnorm + self.beta
        return self.Xnorm
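    # backward pass: with Xnorm = center / std, std = sqrt(var), and
    # var = mean(center**2) + eps taken over axis 0 (length N):
    #   dXnorm  = dY * gamma                      (just dY when not affine)
    #   dstd    = -sum(dXnorm * center) / var
    #   dcenter = dXnorm / std + (dstd / std) * center / N
    #   dX      = dcenter - mean(dcenter)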
    def dF(self, dY):
        length = dY.shape[0]
        if self.affine:
            # Y = gamma * Xnorm + beta
            dXnorm = dY * self.gamma
            self.dgamma[:] = (dY * self.Xnorm).sum(0)
            self.dbeta[:] = dY.sum(0)
        else:
            dXnorm = dY
        dstd = (dXnorm * self.center).sum(0) / -self.var
        dcenter = dXnorm / self.std + dstd / self.std * self.center / length
        dmean = -dcenter.sum(0)
        dX = dcenter + dmean / length
        return dX
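# a minimal standalone sanity check of the normalization math above (a sketch,
# not part of the layer API): it assumes only NumPy, covers the non-affine
# case, and uses loss = sum(Xnorm) so that dY is all ones.
def _layernorm_gradcheck(N=8, features=5, eps=1e-5, h=1e-6):
    import numpy as np
    def forward(X):
        center = X - X.mean(0)
        var = center.var(0) + eps
        std = np.sqrt(var)
        return center / std, center, var, std
    X = np.random.randn(N, features)
    Xnorm, center, var, std = forward(X)
    dY = np.ones_like(Xnorm)
    # analytic gradient, same formulas as LayerNorm.dF
    dstd = (dY * center).sum(0) / -var
    dcenter = dY / std + dstd / std * center / N
    dX = dcenter - dcenter.sum(0) / N
    # central finite differences of sum(Xnorm) with respect to each entry of X
    num = np.zeros_like(X)
    for i in np.ndindex(*X.shape):
        Xp, Xm = X.copy(), X.copy()
        Xp[i] += h
        Xm[i] -= h
        num[i] = (forward(Xp)[0].sum() - forward(Xm)[0].sum()) / (2 * h)
    return np.allclose(dX, num, atol=1e-4)  # expect True if the math holds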
class DenseOneLess(Dense):
    def init(self, W, dW):
        super().init(W, dW)
@@ -67,35 +128,6 @@ class DenseOneLess(Dense):
        np.fill_diagonal(self.dcoeffs, 0)
        return dX
class LayerNorm(Layer):
    # paper: https://arxiv.org/abs/1607.06450
    # my implementation may be incorrect.
    def __init__(self, eps=1e-3, axis=-1):
        super().__init__()
        self.eps = _f(eps)
        self.axis = int(axis)
    def F(self, X):
        self.center = X - np.mean(X, axis=self.axis, keepdims=True)
        #self.var = np.var(X, axis=self.axis, keepdims=True) + self.eps
        self.var = np.mean(np.square(self.center), axis=self.axis, keepdims=True) + self.eps
        self.std = np.sqrt(self.var) + self.eps
        Y = self.center / self.std
        return Y
    def dF(self, dY):
        length = self.input_shape[self.axis]
        dstd = dY * (-self.center / self.var)
        dvar = dstd * (0.5 / self.std)
        dcenter2 = dvar * (1 / length)
        dcenter = dY * (1 / self.std)
        dcenter += dcenter2 * (2 * self.center)
        dX = dcenter - dcenter / length
        return dX
# Rituals {{{1
def stochastic_multiply(W, gamma=0.5, allow_negation=True):
@@ -205,6 +237,28 @@ class DumbLearner(AnnealingLearner):
            self.restart_callback(restart)
        return True
# Components {{{1
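# _mr_make_norm builds a residual group generator in the pre-activation style:
# the skip connection feeds a Sum() merger directly, and each of `multi`
# branches applies norm -> activation before every FC (the first FC reuses the
# shared z_start) before feeding its result into the same merger.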
def _mr_make_norm(norm):
    def _mr_norm(y, width, depth, block, multi, activation, style, FC, d):
        skip = y
        merger = Sum()
        skip.feed(merger)
        z_start = skip
        z_start = z_start.feed(norm())
        z_start = z_start.feed(activation())
        for _ in range(multi):
            z = z_start
            for j in range(block):
                if j > 0:
                    z = z.feed(norm())
                    z = z.feed(activation())
                z = z.feed(FC())
            z.feed(merger)
        y = merger
        return y
    return _mr_norm
def _mr_batchless(y, width, depth, block, multi, activation, style, FC, d):
    skip = y
    merger = Sum()
@@ -242,9 +296,11 @@ def _mr_onelesssum(y, width, depth, block, multi, activation, style, FC, d):
        y = merger
    else:
        y = z
    #y = y.feed(LayerNorm())
    return y
_mr_styles = dict(
    lnorm=_mr_make_norm(LayerNorm),
    batchless=_mr_batchless,
    onelesssum=_mr_onelesssum,
)
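# the 'lnorm' style instantiates the generic builder above with LayerNorm as
# its normalizer.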
@@ -507,8 +563,9 @@ def run(program, args=None):
        mloss = 'mse',
        ritual = 'default',
        restart_optim = False, # restarts also reset internal state of optimizer
        warmup = True,
        problem = 3,
        problem = 2,
        compare = (
            # best results for ~10,000 parameters
            # training/validation pairs for each problem (starting from problem 0):
@@ -583,6 +640,15 @@ def run(program, args=None):
    measure_error()
    ritual.prepare(model)
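    # optional warmup: when config.warmup is set, run one pass of batched
    # training on standard-normal noise shaped like the real data, then
    # re-measure the error before the real training loop starts.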
    if training and config.warmup:
        log("warming", "up")
        ritual.train_batched(
            np.random.normal(0, 1, size=inputs.shape),
            np.random.normal(0, 1, size=outputs.shape),
            config.batch_size)
        measure_error()
    while training and learner.next():
        indices = np.arange(inputs.shape[0])
        np.random.shuffle(indices)