import numpy as np

# just for speed, not strictly essential:
from scipy.special import expit as sigmoid

from .float import *
from .layer_base import *


class Activation(Layer):
    pass
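

# a minimal finite-difference gradient check, just a sketch for sanity-testing
# the layers in this file; `check_activation` is a hypothetical helper, not
# part of the library API. it assumes a layer only needs forward() called
# before backward(), and it can be flaky near the kinks of e.g. Relu/HardClip.
def check_activation(layer, eps=1e-4, tol=1e-3):
    X = np.random.randn(4, 8)
    dY = np.random.randn(4, 8)
    layer.forward(X)
    analytic = layer.backward(dY)
    numeric = np.zeros_like(X)
    for i in np.ndindex(*X.shape):
        Xp, Xm = X.copy(), X.copy()
        Xp[i] += eps
        Xm[i] -= eps
        # chain rule: (J^T dY)[i] = sum(dY * d(forward)/dX[i])
        numeric[i] = np.sum(dY * (layer.forward(Xp) - layer.forward(Xm))) / (2 * eps)
    return np.allclose(analytic, numeric, atol=tol)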


class Identity(Activation):
    def forward(self, X):
        return X

    def backward(self, dY):
        return dY


class Sigmoid(Activation):  # aka Logistic, Expit (inverse of Logit)
    def forward(self, X):
        self.sig = sigmoid(X)
        return self.sig

    def backward(self, dY):
        # d/dx sigmoid(x) = sigmoid(x) * (1 - sigmoid(x)).
        return dY * self.sig * (1 - self.sig)


class Softplus(Activation):
    # integral of Sigmoid.

    def forward(self, X):
        self.X = X
        # equivalent to np.log(1 + np.exp(X)), but doesn't overflow for large X.
        return np.logaddexp(0, X)

    def backward(self, dY):
        return dY * sigmoid(self.X)


class Tanh(Activation):
    def forward(self, X):
        self.sig = np.tanh(X)
        return self.sig

    def backward(self, dY):
        return dY * (1 - self.sig * self.sig)


class LeCunTanh(Activation):
    # paper: http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf
    # paper: http://yann.lecun.com/exdb/publis/pdf/lecun-89.pdf
    # scaled such that f([-1, 1]) = [-1, 1].
    # helps preserve an input variance of 1.
    # second derivative peaks around an input of ±1.

    def forward(self, X):
        self.sig = np.tanh(2 / 3 * X)
        return 1.7159 * self.sig

    def backward(self, dY):
        return dY * (2 / 3 * 1.7159) * (1 - self.sig * self.sig)
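

# a quick numeric aside on the scaling above: np.tanh(2 / 3) ≈ 0.58278 and
# 1.7159 * 0.58278 ≈ 1.0000, so f(1) ≈ 1 and, by oddness, f(-1) ≈ -1.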


class Relu(Activation):
    def forward(self, X):
        self.cond = X >= 0
        return np.where(self.cond, X, 0)

    def backward(self, dY):
        return np.where(self.cond, dY, 0)


class Elu(Activation):
    # paper: https://arxiv.org/abs/1511.07289

    def __init__(self, alpha=1):
        super().__init__()
        self.alpha = _f(alpha)

    def forward(self, X):
        self.cond = X >= 0
        self.neg = self.alpha * (np.exp(X) - 1)
        return np.where(self.cond, X, self.neg)

    def backward(self, dY):
        # note: alpha * exp(X) == self.neg + self.alpha.
        return dY * np.where(self.cond, 1, self.neg + self.alpha)
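

# the alpha handling can be verified against finite differences with the
# hypothetical check_activation sketch near the top of this file, e.g.:
#   assert check_activation(Elu(alpha=0.5))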


class Swish(Activation):
    # paper: https://arxiv.org/abs/1710.05941
    # the beta parameter here is constant instead of trainable.
    # note that Swish generalizes both SiLU and an approximation of GELU.

    def __init__(self, scale=1.0):
        super().__init__()
        self.scale = _f(scale)

    def forward(self, X):
        self.a = self.scale * X
        self.sig = sigmoid(self.a)
        return X * self.sig

    def backward(self, dY):
        # d/dx x * sigmoid(s * x) = sigmoid(s * x) * (1 + s * x * (1 - sigmoid(s * x))).
        return dY * self.sig * (1 + self.a * (1 - self.sig))


class Silu(Swish):
    # paper: https://arxiv.org/abs/1702.03118

    def __init__(self):
        super().__init__()
        self.scale = _1


class GeluApprox(Swish):
    # paper: https://arxiv.org/abs/1606.08415
    # plot: https://www.desmos.com/calculator/ydzgtccsld

    def __init__(self):
        super().__init__()
        self.scale = _f(1.704)
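

# a rough comparison of the approximation against the exact GELU, x * Phi(x),
# where Phi is the standard normal CDF; scipy.stats is assumed here only for
# the comparison:
#   from scipy.stats import norm
#   X = np.linspace(-4, 4, 81)
#   print(np.max(np.abs(GeluApprox().forward(X) - X * norm.cdf(X))))
#   # => on the order of 1e-2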


class Softmax(Activation):
    def forward(self, X):
        # subtracting the max per row keeps exp() from overflowing.
        alpha = np.max(X, axis=-1, keepdims=True)
        num = np.exp(X - alpha)
        den = np.sum(num, axis=-1, keepdims=True)
        self.sm = num / den
        return self.sm

    def backward(self, dY):
        return (dY - np.sum(dY * self.sm, axis=-1, keepdims=True)) * self.sm
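

# the backward above is the softmax Jacobian-vector product in closed form:
# with dS_i/dX_j = sm_i * (delta_ij - sm_j), summing out i gives
# dX_j = sm_j * (dY_j - sum_i(dY_i * sm_i)).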


class LogSoftmax(Softmax):
    def __init__(self, eps=1e-6):
        super().__init__()
        self.eps = _f(eps)

    def forward(self, X):
        return np.log(super().forward(X) + self.eps)

    def backward(self, dY):
        return dY - np.sum(dY, axis=-1, keepdims=True) * self.sm
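

# likewise in closed form: for Y = log(softmax(X)), dY_i/dX_j = delta_ij - sm_j,
# so dX_j = dY_j - sm_j * sum_i(dY_i); eps is neglected here since it only
# guards the log against zeros.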


class Cos(Activation):
    # performs well on MNIST for some strange reason.

    def forward(self, X):
        self.X = X
        return np.cos(X)

    def backward(self, dY):
        return dY * -np.sin(self.X)


class Selu(Activation):
    # paper: https://arxiv.org/abs/1706.02515

    def __init__(self, alpha=1.67326324, lamb=1.05070099):
        super().__init__()
        self.alpha = _f(alpha)
        self.lamb = _f(lamb)

    def forward(self, X):
        self.cond = X >= 0
        self.neg = self.alpha * np.exp(X)
        return self.lamb * np.where(self.cond, X, self.neg - self.alpha)

    def backward(self, dY):
        return dY * self.lamb * np.where(self.cond, 1, self.neg)


# more


class TanhTest(Activation):
    def forward(self, X):
        self.sig = np.tanh(1 / 2 * X)
        return 2.4004 * self.sig

    def backward(self, dY):
        return dY * (1 / 2 * 2.4004) * (1 - self.sig * self.sig)


class ExpGB(Activation):
    # an output layer for one-hot classification problems.
    # use with MSE (SquaredHalved), not CategoricalCrossentropy!
    # paper: https://arxiv.org/abs/1707.04199

    def __init__(self, alpha=0.1, beta=0.0):
        super().__init__()
        self.alpha = _f(alpha)
        self.beta = _f(beta)

    def forward(self, X):
        return self.alpha * np.exp(X) + self.beta

    def backward(self, dY):
        # this gradient is intentionally incorrect.
        return dY


class CubicGB(Activation):
    # an output layer for one-hot classification problems.
    # use with MSE (SquaredHalved), not CategoricalCrossentropy!
    # paper: https://arxiv.org/abs/1707.04199
    # note: in the paper, it's called pow3GB, which is ugly.

    def __init__(self, alpha=0.1, beta=0.0):
        # note: the paper suggests defaults of 0.001 and 0.0,
        # but these didn't seem to work as well in my limited testing.
        super().__init__()
        self.alpha = _f(alpha)
        self.beta = _f(beta)

    def forward(self, X):
        return self.alpha * X**3 + self.beta

    def backward(self, dY):
        # this gradient is intentionally incorrect.
        return dY


class Arcsinh(Activation):
    def forward(self, X):
        self.X = X
        return np.arcsinh(X)

    def backward(self, dY):
        # d/dx arcsinh(x) = 1 / sqrt(x^2 + 1).
        return dY / np.sqrt(self.X * self.X + 1)


class HardClip(Activation):  # aka HardTanh when at default settings
    def __init__(self, lower=-1.0, upper=1.0):
        super().__init__()
        self.lower = _f(lower)
        self.upper = _f(upper)

    def forward(self, X):
        self.X = X
        return np.clip(X, self.lower, self.upper)

    def backward(self, dY):
        # the gradient is 1 inside [lower, upper] and 0 outside.
        return dY * ((self.X >= self.lower) & (self.X <= self.upper))
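

# a possible smoke test over the layers above, again assuming the hypothetical
# check_activation sketch; ExpGB and CubicGB are omitted since their gradients
# are intentionally incorrect, and kinked layers may need a couple of retries:
#   for cls in (Identity, Sigmoid, Softplus, Tanh, LeCunTanh, Relu, Elu,
#               Swish, Silu, GeluApprox, Softmax, LogSoftmax, Cos, Selu,
#               TanhTest, Arcsinh, HardClip):
#       assert check_activation(cls()), cls.__name__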