add Swish and SiLU activations
This commit is contained in:
parent 91cdea3b26
commit cb4e6d3725
1 changed file with 22 additions and 4 deletions
@@ -84,12 +84,16 @@ class Elu(Layer):
         return dY * np.where(self.cond, 1, self.neg + 1)
 
 
-class GeluApprox(Layer):
-    # paper: https://arxiv.org/abs/1606.08415
-    # plot: https://www.desmos.com/calculator/ydzgtccsld
+class Swish(Layer):
+    # paper: https://arxiv.org/abs/1710.05941
+    # the beta parameter here is constant instead of trainable.
+    # note that Swish generalizes both SiLU and an approximation of GELU.
+
+    def __init__(self, scale=1.0):
+        self.scale = _f(scale)
 
     def forward(self, X):
-        self.a = 1.704 * X
+        self.a = self.scale * X
         self.sig = sigmoid(self.a)
         return X * self.sig
 
@@ -97,6 +101,20 @@ class GeluApprox(Layer):
         return dY * self.sig * (1 + self.a * (1 - self.sig))
 
 
+class Silu(Swish):
+    # paper: https://arxiv.org/abs/1702.03118
+
+    def __init__(self):
+        self.scale = _1
+
+
+class GeluApprox(Swish):
+    # paper: https://arxiv.org/abs/1606.08415
+    # plot: https://www.desmos.com/calculator/ydzgtccsld
+
+    def __init__(self):
+        self.scale = _f(1.704)
+
 class Softmax(Layer):
     def forward(self, X):
         alpha = np.max(X, axis=-1, keepdims=True)
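For reference, the backward line follows directly from differentiating the forward pass. With a = beta * x (beta being `self.scale`) and the identity sigma'(a) = sigma(a)(1 - sigma(a)):

\frac{d}{dx}\bigl[x\,\sigma(\beta x)\bigr]
  = \sigma(a) + \beta x\,\sigma(a)\bigl(1 - \sigma(a)\bigr)
  = \sigma(a)\bigl(1 + a\,(1 - \sigma(a))\bigr)

which is exactly the `self.sig * (1 + self.a * (1 - self.sig))` expression above.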
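To illustrate the comment that Swish generalizes both SiLU and an approximation of GELU, here is a minimal self-contained sketch outside the library. It uses plain numpy/scipy stand-ins for the repo's `sigmoid`, `_f`, and `_1` helpers, and the names `swish`/`swish_grad` are hypothetical, not part of the repo; it checks the analytic gradient numerically and compares scale=1.704 against exact GELU.

import numpy as np
from scipy.special import erf  # only used for the exact-GELU reference

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def swish(x, scale=1.0):
    # mirrors Swish.forward: cache a and sig for the backward pass
    a = scale * x
    sig = sigmoid(a)
    return x * sig, (a, sig)

def swish_grad(dy, a, sig):
    # mirrors Swish.backward
    return dy * sig * (1 + a * (1 - sig))

x = np.linspace(-4.0, 4.0, 81)
eps = 1e-6

# numerical gradient check of the backward formula
y, (a, sig) = swish(x, scale=1.704)
numeric = (swish(x + eps, 1.704)[0] - swish(x - eps, 1.704)[0]) / (2 * eps)
analytic = swish_grad(np.ones_like(x), a, sig)
assert np.allclose(numeric, analytic, atol=1e-5)

# scale=1 recovers SiLU; scale=1.704 tracks exact GELU, x * Phi(x)
silu = swish(x, 1.0)[0]
gelu = x * 0.5 * (1.0 + erf(x / np.sqrt(2.0)))
print(np.max(np.abs(swish(x, 1.704)[0] - gelu)))  # small approximation error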