.
This commit is contained in:
parent
205d64a8a0
commit
0306b6f1e0
2 changed files with 37 additions and 16 deletions
33
optim_nn.py
33
optim_nn.py
|
@ -239,7 +239,7 @@ class CosineDense(Dense):
|
||||||
|
|
||||||
# Rituals {{{1
|
# Rituals {{{1
|
||||||
|
|
||||||
def stochastic_multiply(W, gamma=0.5, allow_negation=True):
|
def stochastic_multiply(W, gamma=0.5, allow_negation=False):
|
||||||
# paper: https://arxiv.org/abs/1606.01981
|
# paper: https://arxiv.org/abs/1606.01981
|
||||||
|
|
||||||
assert W.ndim == 1, W.ndim
|
assert W.ndim == 1, W.ndim
|
||||||
|
@ -248,7 +248,11 @@ def stochastic_multiply(W, gamma=0.5, allow_negation=True):
|
||||||
alpha = np.max(np.abs(W))
|
alpha = np.max(np.abs(W))
|
||||||
# NOTE: numpy gives [low, high) but the paper advocates [low, high]
|
# NOTE: numpy gives [low, high) but the paper advocates [low, high]
|
||||||
mult = np.random.uniform(gamma, 1/gamma, size=size)
|
mult = np.random.uniform(gamma, 1/gamma, size=size)
|
||||||
if allow_negation: # TODO: verify this is correct. seems to wreak havoc.
|
if allow_negation:
|
||||||
|
# NOTE: i have yet to see this do anything but cause divergence.
|
||||||
|
# i've referenced the paper several times yet still don't understand
|
||||||
|
# what i'm doing wrong, so i'm disabling it by default in my code.
|
||||||
|
# maybe i just need *a lot* more weights to compensate.
|
||||||
prob = (W / alpha + 1) / 2
|
prob = (W / alpha + 1) / 2
|
||||||
samples = np.random.random_sample(size=size)
|
samples = np.random.random_sample(size=size)
|
||||||
mult *= np.where(samples < prob, 1, -1)
|
mult *= np.where(samples < prob, 1, -1)
|
||||||
|
@ -275,8 +279,7 @@ class StochMRitual(Ritual):
|
||||||
self.W[:] = self.model.W
|
self.W[:] = self.model.W
|
||||||
for layer in self.model.ordered_nodes:
|
for layer in self.model.ordered_nodes:
|
||||||
if isinstance(layer, Dense):
|
if isinstance(layer, Dense):
|
||||||
stochastic_multiply(layer.coeffs.ravel(), gamma=self.gamma,
|
stochastic_multiply(layer.coeffs.ravel(), gamma=self.gamma)
|
||||||
allow_negation=True)
|
|
||||||
residual = super().learn(inputs, outputs)
|
residual = super().learn(inputs, outputs)
|
||||||
self.model.W[:] = self.W
|
self.model.W[:] = self.W
|
||||||
return residual
|
return residual
|
||||||
|
@ -299,23 +302,25 @@ class NoisyRitual(Ritual):
|
||||||
|
|
||||||
def learn(self, inputs, outputs):
|
def learn(self, inputs, outputs):
|
||||||
# this is pretty crude
|
# this is pretty crude
|
||||||
s = self.input_noise
|
if self.input_noise > 0:
|
||||||
noisy_inputs = inputs + np.random.normal(0, s, size=inputs.shape)
|
s = self.input_noise
|
||||||
s = self.output_noise
|
inputs = inputs + np.random.normal(0, s, size=inputs.shape)
|
||||||
noisy_outputs = outputs + np.random.normal(0, s, size=outputs.shape)
|
if self.output_noise > 0:
|
||||||
return super().learn(noisy_inputs, noisy_outputs)
|
s = self.output_noise
|
||||||
|
outputs = outputs + np.random.normal(0, s, size=outputs.shape)
|
||||||
|
return super().learn(inputs, outputs)
|
||||||
|
|
||||||
def update(self):
|
def update(self):
|
||||||
# gradient noise paper: https://arxiv.org/abs/1511.06807
|
# gradient noise paper: https://arxiv.org/abs/1511.06807
|
||||||
if self.gradient_noise > 0:
|
if self.gradient_noise > 0:
|
||||||
size = len(self.model.dW)
|
size = len(self.model.dW)
|
||||||
gamma = 0.55
|
gamma = 0.55
|
||||||
s = self.gradient_noise / (1 + self.bn) ** gamma
|
#s = self.gradient_noise / (1 + self.bn) ** gamma
|
||||||
# experiments:
|
# experiments:
|
||||||
#s = np.sqrt(self.learner.rate)
|
s = self.gradient_noise * np.sqrt(self.learner.rate)
|
||||||
#s = np.square(self.learner.rate)
|
#s = np.square(self.learner.rate)
|
||||||
#s = self.learner.rate / self.en
|
#s = self.learner.rate / self.en
|
||||||
self.model.dW += np.random.normal(0, s, size=size)
|
self.model.dW += np.random.normal(0, max(s, 1e-8), size=size)
|
||||||
super().update()
|
super().update()
|
||||||
|
|
||||||
# Learners {{{1
|
# Learners {{{1
|
||||||
|
@ -607,8 +612,6 @@ def ritual_from_config(config, learner, loss, mloss):
|
||||||
return ritual
|
return ritual
|
||||||
|
|
||||||
def model_from_config(config, input_features, output_features, callbacks):
|
def model_from_config(config, input_features, output_features, callbacks):
|
||||||
# Our Test Model
|
|
||||||
|
|
||||||
init = inits[config.init]
|
init = inits[config.init]
|
||||||
activation = activations[config.activation]
|
activation = activations[config.activation]
|
||||||
|
|
||||||
|
@ -725,6 +728,8 @@ def run(program, args=None):
|
||||||
input_features = inputs.shape[-1]
|
input_features = inputs.shape[-1]
|
||||||
output_features = outputs.shape[-1]
|
output_features = outputs.shape[-1]
|
||||||
|
|
||||||
|
# Our Test Model
|
||||||
|
|
||||||
callbacks = Dummy()
|
callbacks = Dummy()
|
||||||
|
|
||||||
model, learner, ritual = \
|
model, learner, ritual = \
|
||||||
|
|
|
@ -78,6 +78,20 @@ class Accuracy(Loss):
|
||||||
def backward(self, p, y):
|
def backward(self, p, y):
|
||||||
raise NotImplementedError("cannot take the gradient of Accuracy")
|
raise NotImplementedError("cannot take the gradient of Accuracy")
|
||||||
|
|
||||||
|
class Confidence(Loss):
|
||||||
|
def forward(self, p, y):
|
||||||
|
categories = y.shape[-1]
|
||||||
|
#confidence = (p - 1/categories) / (1 - categories)
|
||||||
|
#confidence = 1 - np.min(p, axis=-1) * categories
|
||||||
|
confidence = (np.max(p, axis=-1) - 1/categories) / (1 - 1/categories)
|
||||||
|
# there's also an upper bound on confidence
|
||||||
|
# due to the exponent in softmax,
|
||||||
|
# but we don't compensate for that. keep it simple.
|
||||||
|
return np.mean(confidence)
|
||||||
|
|
||||||
|
def backward(self, p, y):
|
||||||
|
raise NotImplementedError("this is probably a bad idea")
|
||||||
|
|
||||||
class ResidualLoss(Loss):
|
class ResidualLoss(Loss):
|
||||||
def forward(self, p, y):
|
def forward(self, p, y):
|
||||||
return np.mean(self.f(p - y))
|
return np.mean(self.f(p - y))
|
||||||
|
@ -725,8 +739,10 @@ class Ritual: # i'm just making up names at this point
|
||||||
if not test_only and self.learner.per_batch:
|
if not test_only and self.learner.per_batch:
|
||||||
self.learner.batch(b / batch_count)
|
self.learner.batch(b / batch_count)
|
||||||
|
|
||||||
predicted = self.learn(batch_inputs, batch_outputs)
|
if test_only:
|
||||||
if not test_only:
|
predicted = self.model.forward(batch_inputs)
|
||||||
|
else:
|
||||||
|
predicted = self.learn(batch_inputs, batch_outputs)
|
||||||
self.update()
|
self.update()
|
||||||
|
|
||||||
if return_losses == 'both':
|
if return_losses == 'both':
|
||||||
|
|
Loading…
Reference in a new issue