Connor Olding 2017-02-03 22:15:54 -08:00
parent 259ef8336e
commit b5a6bb0f19


@@ -375,10 +375,11 @@ class Dense(Layer):
        self.coeffs.flat = self.weight_init(self.nW, ins, outs)
        self.biases.flat = 0
        self.std = np.std(self.W)
    def F(self, X):
        self.X = X
        Y = X.dot(self.coeffs) \
          + self.biases
        Y = X.dot(self.coeffs) + self.biases
        return Y
    def dF(self, dY):
@@ -396,8 +397,7 @@ class DenseOneLess(Dense):
    def F(self, X):
        np.fill_diagonal(self.coeffs, 0)
        self.X = X
        Y = X.dot(self.coeffs) \
          + self.biases
        Y = X.dot(self.coeffs) + self.biases
        return Y
    def dF(self, dY):
@@ -546,7 +546,19 @@ class Ritual: # i'm just making up names at this point
    def derive(self, residual):
        return self.loss.dmean(residual)
    def train_batched(self, model, inputs, outputs, batch_size, return_losses=False):
    def learn(self, inputs, outputs):
        predicted = self.model.forward(inputs)
        residual = predicted - outputs
        self.model.backward(self.derive(residual))
        return residual
    def update(self):
        self.learner.optim.update(self.model.dW, self.model.W)
    def prepare(self, model):
        self.model = model
    def train_batched(self, inputs, outputs, batch_size, return_losses=False):
        cumsum_loss = 0
        batch_count = inputs.shape[0] // batch_size
        losses = []
@@ -558,11 +570,8 @@ class Ritual: # i'm just making up names at this point
            if self.learner.per_batch:
                self.learner.batch(b / batch_count)
            predicted = model.forward(batch_inputs)
            residual = predicted - batch_outputs
            model.backward(self.derive(residual))
            self.learner.optim.update(model.dW, model.W)
            residual = self.learn(batch_inputs, batch_outputs)
            self.update()
            batch_loss = self.measure(residual)
            if np.isnan(batch_loss):
@@ -576,6 +585,55 @@ class Ritual: # i'm just making up names at this point
        else:
            return avg_loss
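The refactor above splits each training step of Ritual into prepare(), learn(), and update(), so a subclass can override a single phase without copying the batching loop. A minimal sketch of that hook pattern, assuming only the Ritual class from this file; the subclass name NoisyRitual and its noise_scale parameter are made up for illustration:

import numpy as np

class NoisyRitual(Ritual):
    # hypothetical subclass: perturb the weights before the forward/backward
    # pass, then restore them so update() steps from the clean weights.
    def __init__(self, learner=None, loss=None, mloss=None, noise_scale=0.01):
        super().__init__(learner, loss, mloss)
        self.noise_scale = noise_scale
    def learn(self, inputs, outputs):
        saved = np.copy(self.model.W)
        self.model.W += np.random.normal(scale=self.noise_scale,
                                         size=self.model.W.shape)
        residual = super().learn(inputs, outputs)
        self.model.W[:] = saved
        return residual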
def stochastic_multiply(W, gamma=0.5, allow_negation=True):
    # paper: https://arxiv.org/abs/1606.01981
    assert W.ndim == 1, W.ndim
    assert 0 < gamma < 1, gamma
    size = len(W)
    alpha = np.max(np.abs(W))
    # NOTE: numpy gives [low, high) but the paper advocates [low, high]
    mult = np.random.uniform(gamma, 1/gamma, size=size)
    if allow_negation: # TODO: verify this is correct. seems to wreak havoc.
        prob = (W / alpha + 1) / 2
        samples = np.random.random_sample(size=size)
        mult *= np.where(samples < prob, 1, -1)
    np.multiply(W, mult, out=W)
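A quick usage sketch of stochastic_multiply as defined above; the seed and vector are arbitrary, and with allow_negation=False every scaling factor should land inside [gamma, 1/gamma]:

import numpy as np

np.random.seed(42)
w = np.random.normal(scale=0.1, size=8)
before = w.copy()
stochastic_multiply(w, gamma=0.5, allow_negation=False)  # scales w in place
print(w / before)  # elementwise factors, each within [0.5, 2.0)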
class StochMRitual(Ritual):
    # paper: https://arxiv.org/abs/1606.01981
    # this probably doesn't make sense for regression problems,
    # let alone small models, but here it is anyway!
    def __init__(self, learner=None, loss=None, mloss=None, gamma=0.5):
        super().__init__(learner, loss, mloss)
        self.gamma = nf(gamma)
    def prepare(self, model):
        self.W = np.copy(model.W)
        super().prepare(model)
    def learn(self, inputs, outputs):
        # an experiment:
        #assert self.learner.rate < 10, self.learner.rate
        #self.gamma = 1 - 1/2**(1 - np.log10(self.learner.rate))
        self.W[:] = self.model.W
        for layer in self.model.ordered_nodes:
            if isinstance(layer, Dense):
                stochastic_multiply(layer.coeffs.ravel(), gamma=self.gamma,
                                    allow_negation=True)
        residual = super().learn(inputs, outputs)
        self.model.W[:] = self.W
        return residual
    def update(self):
        super().update()
        f = 0.5
        for layer in self.model.ordered_nodes:
            if isinstance(layer, Dense):
                np.clip(layer.W, -layer.std * f, layer.std * f, out=layer.W)
                # np.clip(layer.W, -1, 1, out=layer.W)
class Learner:
    per_batch = False
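For context, wiring StochMRitual into a run looks roughly like the changes to run() further down; a hedged sketch, assuming model, learner, inputs, and outputs are already constructed by the surrounding script (none of those names are defined here):

loss = SomethingElse()
ritual = StochMRitual(learner=learner, loss=loss, mloss=Squared(), gamma=0.5)
ritual.prepare(model)      # snapshots model.W and remembers the model
while learner.next():
    avg_loss = ritual.train_batched(inputs, outputs, batch_size=64)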
@@ -801,12 +859,14 @@ def run(program, args=[]):
        # misc
        batch_size = 64,
        init = 'he_normal',
        loss = SomethingElse(),
        loss = 'msee',
        mloss = 'mse',
        restart_optim = False, # restarts also reset internal state of optimizer
        unsafe = True, # aka gotta go fast mode
        train_compare = None,
        valid_compare = 0.0000946,
        train_compare = 0.0000508,
        valid_compare = 0.0000678,
        ritual = None,
    )
    config.pprint()
@@ -874,7 +934,6 @@ def run(program, args=[]):
    #
    if config.learner == 'SGDR':
        #decay = 0.5**(1/(config.epochs / config.learn_halve_every))
        learner = SGDR(optim, epochs=config.epochs, rate=config.learn,
                       restart_decay=config.learn_decay, restarts=config.restarts,
                       callback=rscb)
@@ -895,12 +954,19 @@ def run(program, args=[]):
            return Squared()
        elif maybe_name == 'mshe': # mushy
            return SquaredHalved()
        elif maybe_name == 'msee':
            return SomethingElse()
        raise Exception('unknown objective', maybe_name)
    loss = lookup_loss(config.loss)
    mloss = lookup_loss(config.mloss) if config.mloss else loss
    ritual = Ritual(learner=learner, loss=loss, mloss=mloss)
    if config.ritual is None:
        ritual = Ritual(learner=learner, loss=loss, mloss=mloss)
    elif config.ritual == 'stochm':
        ritual = StochMRitual(learner=learner, loss=loss, mloss=mloss)
    else:
        raise Exception('unknown ritual', config.ritual)
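The new config.ritual key drives this selection; a small sketch of the two accepted values, keeping only the relevant keys from the defaults block above:

config = dict(loss='msee', mloss='mse', ritual='stochm')  # opts into StochMRitual
# ritual = None (the default) keeps the plain Ritual;
# any other value raises Exception('unknown ritual', ...).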
    # Training
@@ -931,13 +997,14 @@ def run(program, args=[]):
    assert inputs.shape[0] % config.batch_size == 0, \
        "inputs is not evenly divisible by batch_size" # TODO: lift this restriction
    ritual.prepare(model)
    while learner.next():
        indices = np.arange(inputs.shape[0])
        np.random.shuffle(indices)
        shuffled_inputs = inputs[indices] / x_scale
        shuffled_outputs = outputs[indices] / y_scale
        avg_loss, losses = ritual.train_batched(model,
        avg_loss, losses = ritual.train_batched(
            shuffled_inputs, shuffled_outputs,
            config.batch_size,
            return_losses=True)