Compare commits
5 Commits
06bbfc6340 ... a8871d4a67
Author | SHA1 | Date
---|---|---
Connor Olding | a8871d4a67 |
Connor Olding | 96dfc904b1 |
Connor Olding | b6fe7f711e |
Connor Olding | 5cd4e8d1c1 |
Connor Olding | 4a5084df48 |
```diff
@@ -262,6 +262,23 @@ class HardClip(Activation): # aka HardTanh when at default settings
         return dY * ((self.X >= self.lower) & (self.X <= self.upper))
 
 
+class ISRLU(Activation):
+    # Inverse Square Root Linear Unit, a faster alternative to ELU
+    # paper: https://arxiv.org/abs/1710.09967
+
+    def __init__(self, alpha=1.0):
+        super().__init__()
+        self.alpha = _f(alpha)
+
+    def forward(self, X):
+        self.memo = np.reciprocal(np.sqrt(1 + X * X * self.alpha))
+        self.cond = X < 0
+        return np.where(self.cond, X * self.memo, X)
+
+    def backward(self, dY):
+        return self.cond * self.memo * self.memo * self.memo * dY
+
+
 class PolyFeat(Layer):
     # i haven't yet decided if this counts as an Activation subclass
     # due to the increased output size, so i'm opting not to inherit it.
```
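For reference, a minimal standalone sketch of the ISRLU math from the linked paper, written as plain NumPy functions rather than the repository's `Activation` class; the function names and the explicit handling of the x ≥ 0 branch (whose derivative is 1) are illustrative assumptions, not code from this change:

```python
import numpy as np

def isrlu(x, alpha=1.0):
    # x / sqrt(1 + alpha * x^2) for x < 0, identity otherwise
    inv_root = 1.0 / np.sqrt(1.0 + alpha * x * x)
    return np.where(x < 0, x * inv_root, x)

def isrlu_grad(x, alpha=1.0):
    # derivative: (1 + alpha * x^2)^(-3/2) for x < 0, and 1 for x >= 0
    inv_root = 1.0 / np.sqrt(1.0 + alpha * x * x)
    return np.where(x < 0, inv_root ** 3, 1.0)

x = np.linspace(-3.0, 3.0, 7)
print(isrlu(x))
print(isrlu_grad(x))
```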
```diff
@@ -12,6 +12,10 @@ class Loss:
 
 
 class NLL(Loss): # Negative Log Likelihood
+    # NOTE: this is a misnomer -- the "log" part is not implemented here.
+    # instead, you should use a Log activation at the end of your network
+    # e.g. LogSoftmax.
+    # TODO: simplify the math that comes about it.
     def forward(self, p, y):
         correct = p * y
         return np.mean(-correct)
```
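A rough standalone illustration of the note added above, assuming one-hot targets and a log-softmax as the network's final activation; the array values are made up for the example:

```python
import numpy as np

logits = np.array([[2.0, 0.5, -1.0]])
y = np.array([[1.0, 0.0, 0.0]])  # one-hot target

# the "log" lives in the network's last layer, e.g. a log-softmax:
log_p = logits - np.log(np.sum(np.exp(logits), axis=1, keepdims=True))

# NLL.forward then just averages the negated element-wise product:
loss = np.mean(-(log_p * y))
print(loss)
```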
```diff
@@ -436,8 +436,8 @@ class Adamlike(Optimizer):
                  debias=True, runmax=False, yogi=False, eps=1e-8):
         self.b1 = _f(b1) # decay term
         self.b2 = _f(b2) # decay term
-        self.b1_t_default = _f(b1) # decay term power t
-        self.b2_t_default = _f(b2) # decay term power t
+        self.b1_t_default = _f(np.abs(b1)) # decay term power t
+        self.b2_t_default = _f(np.abs(b2)) # decay term power t
         self.power = _f(power)
         self.debias = bool(debias)
         self.runmax = bool(runmax)
```
```diff
@@ -487,17 +487,19 @@ class Adamlike(Optimizer):
             delta = mt
         elif self.power == 1:
             delta = mt / (vt + self.eps)
-        elif self.power == 1/2: # TODO: is this actually faster?
+        elif self.power == 1/2:
             delta = mt / (np.sqrt(vt) + self.eps)
-        elif self.power == 1/3: # TODO: is this actually faster?
+        elif self.power == 1/3:
             delta = mt / (np.cbrt(vt) + self.eps)
+        elif self.power == 1/4:
+            delta = mt / (np.sqrt(np.sqrt(vt)) + self.eps)
         else:
             delta = mt / (vt**self.power + self.eps)
 
         if self.debias:
             # decay gain.
-            self.b1_t *= self.b1
-            self.b2_t *= self.b2
+            self.b1_t *= np.abs(self.b1)
+            self.b2_t *= np.abs(self.b2)
 
         return -self.lr * delta
 
```
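A hedged sketch of the update rule these two hunks touch: an Adam-style step whose second-moment root is generalized by `power` (1/2 recovers Adam; 1/4 is the branch special-cased above), with the bias-correction accumulators tracking |b1|^t and |b2|^t so the "decay gain" stays positive even when negative decay terms are passed in. The debiasing formula and the variable handling here are assumptions about intent, not the optimizer's exact internals:

```python
import numpy as np

def adamlike_step(g, m, v, b1_t, b2_t,
                  lr=0.001, b1=0.9, b2=0.999, power=0.25, eps=1e-8):
    # first and second moment estimates
    m = b1 * m + (1 - b1) * g
    v = b2 * v + (1 - b2) * g * g
    # assumed debiasing: b1_t, b2_t hold |b1|^t, |b2|^t (starting at |b1|, |b2|)
    mt = m / (1 - b1_t)
    vt = v / (1 - b2_t)
    if power == 1/4:
        delta = mt / (np.sqrt(np.sqrt(vt)) + eps)  # the newly special-cased branch
    else:
        delta = mt / (vt ** power + eps)
    # decay gain, using abs() as in the change above
    b1_t = b1_t * abs(b1)
    b2_t = b2_t * abs(b2)
    return -lr * delta, m, v, b1_t, b2_t
```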
```diff
@@ -62,6 +62,33 @@ class Dense(Layer):
         return dY @ self.coeffs.f.T
 
 
+class DenseUnbiased(Layer):
+    serialized = {
+        'W': 'coeffs',
+    }
+
+    def __init__(self, dim, init=init_he_uniform, reg_w=None):
+        super().__init__()
+        self.dim = int(dim)
+        self.output_shape = (dim,)
+        self.coeffs = self._new_weights('coeffs', init=init,
+                                        regularizer=reg_w)
+
+    def make_shape(self, parent):
+        shape = parent.output_shape
+        self.input_shape = shape
+        assert len(shape) == 1, shape
+        self.coeffs.shape = (shape[0], self.dim)
+
+    def forward(self, X):
+        self.X = X
+        return X @ self.coeffs.f
+
+    def backward(self, dY):
+        self.coeffs.g += self.X.T @ dY
+        return dY @ self.coeffs.f.T
+
+
 # more
 
 class Conv1Dper(Layer):
```
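A quick standalone check of the DenseUnbiased math above, outside the framework: the forward pass is y = X @ W with no bias term, so the weight gradient is X.T @ dY and the input gradient is dY @ W.T. The shapes and names below are illustrative:

```python
import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((4, 3))    # batch of 4, input dim 3
W = rng.standard_normal((3, 5))    # input dim 3, output dim 5
dY = rng.standard_normal((4, 5))   # upstream gradient

Y = X @ W          # forward: no bias term, unlike Dense
dW = X.T @ dY      # matches `self.coeffs.g += self.X.T @ dY`
dX = dY @ W.T      # matches `return dY @ self.coeffs.f.T`
print(Y.shape, dW.shape, dX.shape)  # (4, 5) (3, 5) (4, 3)
```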
```diff
@@ -265,3 +292,62 @@ class CosineDense(Dense):
         dX = ddot @ self.coeffs.f.T + dX_norm / self.X_norm * self.X
 
         return dX
+
+
+class Sparse(Layer):
+    # (WIP)
+    # roughly implements a structured, sparsely-connected layer.
+    # paper: https://arxiv.org/abs/1812.01164
+
+    # TODO: (re)implement serialization.
+
+    def __init__(self, dim, con, init=init_he_uniform, reg=None):
+        super().__init__()
+        self.dim = int(dim)
+        self.con = int(con)
+        self.output_shape = (dim,)
+        self.coeffs = self._new_weights('coeffs', init=init, regularizer=reg)
+        self.indices = None
+
+    def make_shape(self, parent):
+        shape = parent.output_shape
+        self.input_shape = shape
+        assert len(shape) == 1, shape
+        self.coeffs.shape = (self.con, self.dim)
+        self.size_in = shape[0]
+        self.make_indices(self.size_in, self.con, self.dim)
+
+    def make_indices(self, size_in, connectivity, size_out):
+        basic = np.arange(size_in)
+        indices = []
+        inv_ind = []
+        count = 0
+        desired = size_out * connectivity
+        # TODO: replace with a for loop.
+        while count < desired:
+            np.random.shuffle(basic)
+            indices.append(basic.copy())
+            inverse = np.zeros_like(basic)
+            inverse[basic] = np.arange(len(basic)) + count
+            inv_ind.append(inverse)
+            count += len(basic)
+        self.indices = np.concatenate(indices)[:desired].copy()
+        self.inv_ind = np.concatenate(inv_ind)
+
+    def forward(self, X):
+        self.X = X
+        self.O = X[:,self.indices].reshape(len(X), self.con, self.dim)
+        return np.sum(self.O * self.coeffs.f, 1)
+
+    def backward(self, dY):
+        dY = np.expand_dims(dY, 1)
+        self.coeffs.g += np.sum(dY * self.O, 0)
+        dO = dY * self.coeffs.f
+
+        x = dO
+        batch_size = len(x)
+        x = x.reshape(batch_size, -1)
+        if x.shape[1] % self.size_in != 0:
+            x = np.pad(x, ((0, 0), (0, self.size_in - x.shape[1] % self.size_in)))
+        x = x[:, self.inv_ind].reshape(batch_size, -1, self.size_in)
+        return x.sum(1)
```
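A rough standalone walkthrough of the Sparse forward pass above, assuming (as the code suggests) that `indices` is built from shuffled copies of `arange(size_in)` and maps each (connection, output) pair to one input feature; the sizes below are made up for the example:

```python
import numpy as np

rng = np.random.default_rng(0)
size_in, con, dim = 6, 2, 3

# build indices the same way make_indices appears to: shuffled permutations
# of the input indices, concatenated until con * dim entries are available.
basic = np.arange(size_in)
chunks = []
while sum(len(c) for c in chunks) < con * dim:
    rng.shuffle(basic)
    chunks.append(basic.copy())
indices = np.concatenate(chunks)[:con * dim]

X = rng.standard_normal((4, size_in))        # batch of 4
W = rng.standard_normal((con, dim))          # one weight per connection
O = X[:, indices].reshape(len(X), con, dim)  # gather con inputs per output
Y = np.sum(O * W, axis=1)                    # (4, dim): each output sums its connections
print(Y.shape)
```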