import numpy as np

from .float import _f, _0, _1
from .layer import Layer
from .loss import Loss
from .optimizer import Optimizer
from .ritual import Ritual
from .learner import Learner
from .parametric import Dense
from .regularizer import Regularizer


class AddSignClip(Optimizer):
    # paper: https://arxiv.org/abs/1709.07417
    # with heavy-handed gradient clipping of my own concoction.

    def __init__(self, lr=0.01, mu=0.9, alpha=1.0, clip=1.0):
        self.mu = _f(mu)
        self.alpha = _f(alpha)
        self.clip = _f(clip)

        super().__init__(lr)

    def reset(self):
        self.accum = None

    def compute(self, dW, W):
        if self.accum is None:
            self.accum = np.zeros_like(dW)

        self.accum[:] = self.accum * self.mu + dW

        signed = np.sign(dW) * np.sign(self.accum)
        # signed *= decay

        inter = dW * (self.alpha + signed)

        total_norm = np.linalg.norm(inter)
        # based on softplus.
        inter /= np.log(1 + np.exp(total_norm / self.clip - 1)) + 1

        return -self.lr * inter
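

# A minimal standalone sketch of AddSignClip on a toy quadratic; illustration
# only, never called anywhere. It assumes nothing from the Optimizer base
# class beyond what the methods above use (self.lr); the toy problem and the
# explicit reset() call are mine, not part of the library's training loop.
def _addsign_clip_sketch():
    W = np.random.randn(10)               # fake weight vector
    opt = AddSignClip(lr=0.1, mu=0.9, clip=1.0)
    opt.reset()                           # ensure the accumulator attribute exists
    for _ in range(100):
        dW = 2 * W                        # gradient of the toy objective sum(W**2)
        W = W + opt.compute(dW, W)        # compute() returns -lr * clipped step
    return W                              # should have shrunk toward zero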


class PowerSignClip(Optimizer):
    # paper: https://arxiv.org/abs/1709.07417
    # with heavy-handed gradient clipping of my own concoction.

    def __init__(self, lr=0.01, mu=0.9, alpha=np.e, clip=1.0):
        self.mu = _f(mu)
        self.alpha = _f(alpha)
        self.use_exp = np.isclose(self.alpha, _f(np.e))
        self.clip = _f(clip)

        super().__init__(lr)

    def reset(self):
        self.accum = None

    def compute(self, dW, W):
        if self.accum is None:
            self.accum = np.zeros_like(dW)

        self.accum[:] = self.accum * self.mu + dW

        signed = np.sign(dW) * np.sign(self.accum)
        # signed *= decay

        if self.use_exp:
            inter = dW * np.exp(signed)
        else:
            inter = dW * np.power(self.alpha, signed)

        total_norm = np.linalg.norm(inter)
        # based on softplus.
        inter /= np.log(1 + np.exp(total_norm / self.clip - 1)) + 1

        return -self.lr * inter
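

# A matching sketch for PowerSignClip; illustration only, never called.
# Same assumptions as the AddSignClip sketch above; the only difference is
# that the step magnitude is scaled by alpha**sign_agreement (the exp() fast
# path when alpha is e) instead of (alpha + sign_agreement).
def _powersign_clip_sketch():
    W = np.random.randn(10)               # fake weight vector
    opt = PowerSignClip(lr=0.1, mu=0.9)   # alpha defaults to e -> use_exp path
    opt.reset()                           # ensure the accumulator attribute exists
    for _ in range(100):
        dW = 2 * W                        # gradient of the toy objective sum(W**2)
        W = W + opt.compute(dW, W)
    return W                              # should have shrunk toward zero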


class L1L2avg(Regularizer):
    def __init__(self, l1=0.0, l2=0.0):
        self.l1 = _f(l1)
        self.l2 = _f(l2)

    def forward(self, X):
        f = _0
        if self.l1:
            f += np.average(self.l1 * np.abs(X))
        if self.l2:
            f += np.average(self.l2 * np.square(X))
        return f

    def backward(self, X):
        df = np.zeros_like(X)
        if self.l1:
            df += self.l1 / len(X) * np.sign(X)
        if self.l2:
            df += self.l2 / len(X) * 2 * X
        return df
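

# A small sketch of L1L2avg on its own; illustration only, never called.
# A 1-D weight vector is used so that len(X) in backward() matches the
# element count that np.average divides by in forward().
def _l1l2avg_sketch():
    W = np.random.randn(100)
    reg = L1L2avg(l1=1e-3, l2=1e-2)
    penalty = reg.forward(W)      # scalar penalty term
    dpenalty = reg.backward(W)    # its gradient w.r.t. W, same shape as W
    return penalty, dpenalty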


class NoiseInjector(Layer):
    def __init__(self, scale=1.0, uniform=False, absolute=False,
                 forwards=True, backwards=False):
        self.scale = _f(scale)
        self.uniform = bool(uniform)
        self.absolute = bool(absolute)
        self.forwards = bool(forwards)
        self.backwards = bool(backwards)

        super().__init__()

    def forward(self, X):
        s = self.scale
        if self.uniform:
            self.noise = np.random.uniform(-s, s, size=X.shape)
        else:
            self.noise = np.random.normal(0, s, size=X.shape)
        if not self.forwards:
            return X
        if self.absolute:
            return X + np.abs(self.noise)
        else:
            return X + self.noise

    def forward_deterministic(self, X):
        return X

    def backward(self, dY):
        if not self.backwards:
            return dY
        if self.absolute:
            return dY + np.abs(self.noise)
        else:
            return dY + self.noise
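

# A quick sketch of NoiseInjector outside a network; illustration only,
# never called. It assumes Layer needs no constructor arguments, matching
# the bare super().__init__() call above.
def _noise_injector_sketch():
    X = np.zeros((4, 8))
    layer = NoiseInjector(scale=0.1)
    noisy = layer.forward(X)                # training path: X plus fresh noise
    clean = layer.forward_deterministic(X)  # inference path: X unchanged
    dX = layer.backward(np.ones_like(X))    # backwards defaults to False: passthrough
    return noisy, clean, dX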


class NoiseMultiplier(Layer):
    def __init__(self, scale=1.0, uniform=False,
                 forwards=True, backwards=True):
        self.scale = _f(scale)
        self.uniform = bool(uniform)

        super().__init__()

    def forward(self, X):
        s = self.scale
        if self.uniform:
            self.noise = np.exp(np.random.uniform(-s, s, size=X.shape))
        else:
            self.noise = np.exp(np.random.normal(0, s, size=X.shape))
        if not self.forwards:
            return X
        return X * self.noise

    def forward_deterministic(self, X):
        return X

    def backward(self, dY):
        if not self.backwards:
            return dY
        return dY * self.noise
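

# The multiplicative counterpart, sketched the same way; illustration only,
# never called. Note that backwards defaults to True here, so the same
# per-element factor rescales the gradient as well.
def _noise_multiplier_sketch():
    X = np.ones((4, 8))
    layer = NoiseMultiplier(scale=0.1)
    noisy = layer.forward(X)              # X * exp(noise); median factor is 1
    dX = layer.backward(np.ones_like(X))  # gradient scaled by the same factor
    return noisy, dX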


class LookupLearner(Learner):
    per_batch = True

    def __init__(self, optim, epochs=1, rates=(1,), lerp=False):
        self.rates = tuple(rates)
        self.lerp = bool(lerp)
        self.per_batch = self.lerp
        super().__init__(optim, epochs, rates[0])

    def rate_at(self, epoch):
        if self.lerp:
            ind = min(max(int(epoch), 1), len(self.rates) - 1)
            t = _f(epoch % 1)
            a = _f(self.rates[ind - 1])
            b = _f(self.rates[ind])
            return (_1 - t) * a + t * b
        else:
            ind = min(int(epoch), len(self.rates) - 1)
            return _f(self.rates[ind])
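

# A sketch of how LookupLearner maps epochs to learning rates; illustration
# only, never called. It assumes Learner.__init__(optim, epochs, rate) needs
# no further setup before rate_at() can be queried; the rate values below
# are arbitrary.
def _lookup_learner_sketch():
    optim = AddSignClip(lr=0.01)
    # without lerp: one rate per whole epoch, clamped at the last entry.
    stepped = LookupLearner(optim, epochs=3, rates=(1.0, 0.1, 0.01))
    # with lerp: linear interpolation between neighbouring rates, per batch.
    smooth = LookupLearner(optim, epochs=3, rates=(1.0, 0.1, 0.01), lerp=True)
    return [stepped.rate_at(e) for e in (0, 1, 2, 5)], smooth.rate_at(1.5)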


class Confidence(Loss):
    # this isn't "confidence" in any meaningful (e.g. Bayesian) sense;
    # it's just a measure of how large the value of the predicted class is.
    # when used as a loss, it acts like a crappy regularizer.
    # it really just measures how much of a hot-shot the network thinks it is.

    def forward(self, p, y=None):
        categories = p.shape[-1]
        confidence = (np.max(p, axis=-1) - 1/categories) / (1 - 1/categories)
        # the exponent in softmax puts a maximum on confidence,
        # but we don't compensate for that. if necessary,
        # it'd be better to use an activation that doesn't have this limit.
        return np.mean(confidence)

    def backward(self, p, y=None):
        # in order to agree with the forward pass,
        # using this backward pass as-is will minimize confidence.
        categories = p.shape[-1]
        detc = p / categories / (1 - 1/categories)
        dmax = p == np.max(p, axis=-1, keepdims=True)
        return detc * dmax
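

# A sketch of the Confidence metric on hand-made "probabilities"; illustration
# only, never called. It assumes Loss can be constructed without arguments.
# A uniform prediction scores 0, a one-hot prediction scores 1, and y is
# ignored entirely.
def _confidence_sketch():
    p = np.array([
        [0.25, 0.25, 0.25, 0.25],  # maximally unsure -> 0.0
        [1.00, 0.00, 0.00, 0.00],  # maximally sure   -> 1.0
    ])
    metric = Confidence()
    return metric.forward(p)  # mean over the batch -> 0.5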
|