optim/onn/activation.py

246 lines
6 KiB
Python
Raw Normal View History

2018-01-21 14:04:25 -08:00
import numpy as np
# just for speed, not strictly essential:
from scipy.special import expit as sigmoid
from .float import _f, _1
2018-01-21 14:04:25 -08:00
from .layer_base import *
2018-01-22 11:40:36 -08:00
class Activation(Layer):
    """Common base class for the activation layers defined in this module.

    Adds no behavior of its own on top of ``Layer``.
    """
class Identity(Activation):
    """Pass-through activation: forward and backward return their argument."""

    def forward(self, X):
        # nothing to compute and nothing to remember for backward().
        return X

    def backward(self, dY):
        # the incoming gradient is handed back untouched.
        return dY
2018-01-22 11:40:36 -08:00
class Sigmoid(Activation):  # aka Logistic, Expit (inverse of Logit)
    """Logistic activation; keeps its output in ``self.sig`` for backward()."""

    def forward(self, X):
        out = sigmoid(X)
        self.sig = out
        return out

    def backward(self, dY):
        # derivative expressed through the cached output: s * (1 - s).
        s = self.sig
        complement = 1 - s
        return dY * s * complement
2018-01-22 11:40:36 -08:00
class Softplus(Activation):
    """Softplus activation, log(1 + exp(x)).

    forward() stores the input in ``self.X``; backward() multiplies the
    incoming gradient by sigmoid(self.X).
    """

    # integral of Sigmoid.

    def forward(self, X):
        self.X = X
        # logaddexp(0, x) = log(exp(0) + exp(x)) = log(1 + exp(x)),
        # but it stays finite where the naive form overflows exp(x) to inf.
        return np.logaddexp(0, X)

    def backward(self, dY):
        # the derivative of softplus is the logistic sigmoid.
        return dY * sigmoid(self.X)
2018-01-22 11:40:36 -08:00
class Tanh(Activation):
    """Hyperbolic tangent; keeps its output in ``self.sig`` for backward()."""

    def forward(self, X):
        out = np.tanh(X)
        self.sig = out
        return out

    def backward(self, dY):
        # derivative expressed through the cached output: 1 - tanh(x)^2.
        t = self.sig
        return dY * (1 - t * t)
2018-01-22 11:40:36 -08:00
class LeCunTanh(Activation):
    """Scaled tanh: 1.7159 * tanh(2/3 * x).

    ``self.sig`` holds the *unscaled* tanh value from the last forward().
    """

    # paper: http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf
    # paper: http://yann.lecun.com/exdb/publis/pdf/lecun-89.pdf
    # scaled such that f([-1, 1]) = [-1, 1].
    # helps preserve an input variance of 1.
    # second derivative peaks around an input of ±1.

    def forward(self, X):
        t = np.tanh(2 / 3 * X)
        self.sig = t
        return 1.7159 * t

    def backward(self, dY):
        # chain rule: outer scale times inner scale times (1 - tanh^2).
        slope = 2 / 3 * 1.7159
        t = self.sig
        return dY * slope * (1 - t * t)
2018-01-22 11:40:36 -08:00
class Relu(Activation):
    """Rectifier: passes values where X >= 0 and outputs 0 elsewhere."""

    def forward(self, X):
        keep = X >= 0
        # the mask is reused by backward() to gate the gradient.
        self.cond = keep
        return np.where(keep, X, 0)

    def backward(self, dY):
        keep = self.cond
        return np.where(keep, dY, 0)
2018-01-22 11:40:36 -08:00
class Elu(Activation):
    """Exponential linear unit.

    Computes x where x >= 0 and alpha * (exp(x) - 1) elsewhere.

    alpha: scale of the negative branch. The default of 1 reproduces the
    previous behavior, when alpha was stored but never applied.
    """

    # paper: https://arxiv.org/abs/1511.07289

    def __init__(self, alpha=1):
        super().__init__()
        self.alpha = _f(alpha)

    def forward(self, X):
        self.cond = X >= 0
        # negative-branch value, cached so backward() can reuse it.
        self.neg = self.alpha * (np.exp(X) - 1)
        return np.where(self.cond, X, self.neg)

    def backward(self, dY):
        # for x < 0 the derivative is alpha * exp(x), which equals neg + alpha.
        return dY * np.where(self.cond, 1, self.neg + self.alpha)
2018-01-22 11:40:36 -08:00
class Swish(Activation):
    """Swish activation: x * sigmoid(scale * x), with a constant scale.

    forward() caches the scaled input in ``self.a`` and its sigmoid in
    ``self.sig``; backward() uses both.
    """

    # paper: https://arxiv.org/abs/1710.05941
    # the beta parameter here is constant instead of trainable.
    # note that Swish generalizes both SiLU and an approximation of GELU.

    def __init__(self, scale=1.0):
        super().__init__()
        self.scale = _f(scale)

    def forward(self, X):
        scaled = self.scale * X
        gate = sigmoid(scaled)
        self.a = scaled
        self.sig = gate
        return X * gate

    def backward(self, dY):
        # d/dx [x * s(a)] with a = scale * x  is  s * (1 + a * (1 - s)).
        gate = self.sig
        inner = 1 + self.a * (1 - gate)
        return dY * gate * inner
2018-01-22 11:40:36 -08:00
2018-03-10 18:34:00 -08:00
class Silu(Swish):
    """Swish with its scale pinned to ``_1``."""

    # paper: https://arxiv.org/abs/1702.03118

    def __init__(self):
        super().__init__(scale=_1)
2018-03-10 18:34:00 -08:00
2018-03-11 18:15:36 -07:00
class GeluApprox(Swish):
    """Swish with a fixed scale of 1.704, used as an approximation of GELU."""

    # paper: https://arxiv.org/abs/1606.08415
    # plot: https://www.desmos.com/calculator/ydzgtccsld
    # NOTE(review): the paper's sigmoid approximation quotes 1.702;
    # 1.704 may be a deliberate fit (see plot) -- confirm before changing.

    def __init__(self):
        super().__init__(scale=_f(1.704))
2018-03-10 18:34:00 -08:00
class Softmax(Activation):
    """Softmax over the last axis; keeps its output in ``self.sm``."""

    def forward(self, X):
        # subtract the per-row maximum before exponentiating.
        peak = np.max(X, axis=-1, keepdims=True)
        exps = np.exp(X - peak)
        total = np.sum(exps, axis=-1, keepdims=True)
        probs = exps / total
        self.sm = probs
        return probs

    def backward(self, dY):
        probs = self.sm
        # per-row sum of dY weighted by the cached output.
        weighted = np.sum(dY * probs, axis=-1, keepdims=True)
        return (dY - weighted) * probs
2018-01-22 11:40:36 -08:00
2018-01-21 14:04:25 -08:00
class LogSoftmax(Softmax):
    """Logarithm of Softmax over the last axis.

    eps: small constant added to the softmax output before taking the log.
    """

    def __init__(self, eps=1e-6):
        super().__init__()
        self.eps = _f(eps)

    def forward(self, X):
        # Softmax.forward also caches the probabilities in self.sm.
        probs = super().forward(X)
        return np.log(probs + self.eps)

    def backward(self, dY):
        # note: this expression does not reference eps.
        row_sum = np.sum(dY, axis=-1, keepdims=True)
        return dY - row_sum * self.sm
2018-01-22 11:40:36 -08:00
class Cos(Activation):
    """Cosine activation; stores the input in ``self.X`` for backward()."""

    # performs well on MNIST for some strange reason.

    def forward(self, X):
        self.X = X
        return np.cos(X)

    def backward(self, dY):
        # d/dx cos(x) = -sin(x)
        slope = -np.sin(self.X)
        return dY * slope
2018-01-22 11:40:36 -08:00
class Selu(Activation):
    """Scaled exponential linear unit.

    Computes lamb * x where x >= 0 and lamb * alpha * (exp(x) - 1) elsewhere.
    """

    # paper: https://arxiv.org/abs/1706.02515

    def __init__(self, alpha=1.67326324, lamb=1.05070099):
        super().__init__()
        self.alpha = _f(alpha)
        self.lamb = _f(lamb)

    def forward(self, X):
        nonneg = X >= 0
        grown = self.alpha * np.exp(X)
        # both are reused by backward().
        self.cond = nonneg
        self.neg = grown
        branch = np.where(nonneg, X, grown - self.alpha)
        return self.lamb * branch

    def backward(self, dY):
        # the negative branch's derivative is alpha * exp(x), cached as neg.
        slope = np.where(self.cond, 1, self.neg)
        return dY * self.lamb * slope
2018-01-22 11:40:36 -08:00
2018-01-21 14:04:25 -08:00
# more
class TanhTest(Activation):
    """Scaled tanh: 2.4004 * tanh(1/2 * x).

    ``self.sig`` holds the *unscaled* tanh value from the last forward().
    """

    def forward(self, X):
        t = np.tanh(1 / 2 * X)
        self.sig = t
        return 2.4004 * t

    def backward(self, dY):
        # chain rule: inner scale times outer scale times (1 - tanh^2).
        slope = 1 / 2 * 2.4004
        t = self.sig
        return dY * slope * (1 - t * t)
2018-01-22 11:40:36 -08:00
class ExpGB(Activation):
    """Output layer computing alpha * exp(x) + beta.

    backward() deliberately returns the incoming gradient unchanged.
    """

    # an output layer for one-hot classification problems.
    # use with MSE (SquaredHalved), not CategoricalCrossentropy!
    # paper: https://arxiv.org/abs/1707.04199

    def __init__(self, alpha=0.1, beta=0.0):
        super().__init__()
        self.alpha = _f(alpha)
        self.beta = _f(beta)

    def forward(self, X):
        scaled = self.alpha * np.exp(X)
        return scaled + self.beta

    def backward(self, dY):
        # this gradient is intentionally incorrect.
        return dY
2018-01-22 11:40:36 -08:00
class CubicGB(Activation):
    """Output layer computing alpha * x**3 + beta.

    backward() deliberately returns the incoming gradient unchanged.
    """

    # an output layer for one-hot classification problems.
    # use with MSE (SquaredHalved), not CategoricalCrossentropy!
    # paper: https://arxiv.org/abs/1707.04199
    # note: in the paper, it's called pow3GB, which is ugly.

    def __init__(self, alpha=0.1, beta=0.0):
        # note: the paper suggests defaults of 0.001 and 0.0,
        # but these didn't seem to work as well in my limited testing.
        super().__init__()
        self.alpha = _f(alpha)
        self.beta = _f(beta)

    def forward(self, X):
        cubed = X**3
        return self.alpha * cubed + self.beta

    def backward(self, dY):
        # this gradient is intentionally incorrect.
        return dY
2018-03-06 16:29:48 -08:00
class Arcsinh(Activation):
    """Inverse hyperbolic sine; stores the input in ``self.X`` for backward()."""

    def forward(self, X):
        self.X = X
        return np.arcsinh(X)

    def backward(self, dY):
        # d/dx arcsinh(x) = 1 / sqrt(x^2 + 1)
        x = self.X
        denom = np.sqrt(x * x + 1)
        return dY / denom
2018-03-07 17:40:42 -08:00
class HardClip(Activation):  # aka HardTanh when at default settings
    """Clamps the input to the range [lower, upper].

    The gradient passes through where the stored input lies within the
    bounds (inclusive) and is zeroed elsewhere.
    """

    def __init__(self, lower=-1.0, upper=1.0):
        super().__init__()
        self.lower = _f(lower)
        self.upper = _f(upper)

    def forward(self, X):
        self.X = X
        return np.clip(X, self.lower, self.upper)

    def backward(self, dY):
        x = self.X
        # boolean mask of the positions that were not clipped.
        inside = (x >= self.lower) & (x <= self.upper)
        return dY * inside