Compare commits

...

5 Commits

Author         SHA1        Message                        Date
Connor Olding  a8871d4a67  comment on NLL implementation  2020-03-17 07:27:39 -07:00
Connor Olding  96dfc904b1  add ISRLU activation           2020-03-17 07:27:03 -07:00
                           (needs testing)
Connor Olding  b6fe7f711e  add DenseUnbiased layer        2020-03-17 07:26:45 -07:00
Connor Olding  5cd4e8d1c1  add preliminary Sparse layer   2020-03-17 07:26:31 -07:00
Connor Olding  4a5084df48  refine Adamlike                2020-03-17 07:26:05 -07:00
4 changed files with 115 additions and 6 deletions

View File

@@ -262,6 +262,23 @@ class HardClip(Activation): # aka HardTanh when at default settings
        return dY * ((self.X >= self.lower) & (self.X <= self.upper))
class ISRLU(Activation):
    # Inverse Square Root Linear Unit, a faster alternative to ELU
    # paper: https://arxiv.org/abs/1710.09967
    def __init__(self, alpha=1.0):
        super().__init__()
        self.alpha = _f(alpha)
    def forward(self, X):
        self.memo = np.reciprocal(np.sqrt(1 + X * X * self.alpha))
        self.cond = X < 0
        return np.where(self.cond, X * self.memo, X)
    def backward(self, dY):
        # the derivative is (1 + alpha * X**2) ** -1.5 where X < 0, and 1 elsewhere,
        # so the gradient must pass through unchanged for X >= 0 instead of being zeroed.
        return dY * np.where(self.cond, self.memo * self.memo * self.memo, 1)
class PolyFeat(Layer):
    # i haven't yet decided if this counts as an Activation subclass
    # due to the increased output size, so i'm opting not to inherit it.
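Since the commit note above says the ISRLU addition still needs testing, here is a minimal standalone gradient check, written against plain numpy rather than the library's Activation interface (the function names below are illustrative, not part of the library):

import numpy as np

def isrlu(x, alpha=1.0):
    # x / sqrt(1 + alpha * x**2) for x < 0, identity otherwise
    memo = np.reciprocal(np.sqrt(1 + alpha * x * x))
    return np.where(x < 0, x * memo, x)

def isrlu_grad(x, alpha=1.0):
    # (1 + alpha * x**2) ** -1.5 for x < 0, and 1 otherwise
    memo = np.reciprocal(np.sqrt(1 + alpha * x * x))
    return np.where(x < 0, memo * memo * memo, 1.0)

x = np.linspace(-3.0, 3.0, 13)
eps = 1e-5
numeric = (isrlu(x + eps) - isrlu(x - eps)) / (2 * eps)
assert np.allclose(numeric, isrlu_grad(x), atol=1e-6)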

View File

@@ -12,6 +12,10 @@ class Loss:
class NLL(Loss): # Negative Log Likelihood
    # NOTE: this is a misnomer -- the "log" part is not implemented here.
    # instead, you should use a Log activation at the end of your network,
    # e.g. LogSoftmax.
    # TODO: simplify the math that follows from this.
    def forward(self, p, y):
        correct = p * y
        return np.mean(-correct)
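A small standalone illustration of the note above: with a Log activation such as LogSoftmax producing log-probabilities p and one-hot targets y, the product p * y keeps only the log-probability of the correct class, and np.mean averages over the batch and class axes together (plain numpy, names here are illustrative):

import numpy as np

def log_softmax(z):
    # what a LogSoftmax activation at the end of the network would produce
    z = z - z.max(axis=1, keepdims=True)
    return z - np.log(np.sum(np.exp(z), axis=1, keepdims=True))

z = np.array([[2.0, 0.5, -1.0],
              [0.1, 0.2, 0.3]])
y = np.array([[1.0, 0.0, 0.0],   # one-hot targets
              [0.0, 0.0, 1.0]])

p = log_softmax(z)
correct = p * y                  # only the true class's log-probability survives
loss = np.mean(-correct)         # note: divides by batch size *and* class count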

View File

@@ -436,8 +436,8 @@ class Adamlike(Optimizer):
                 debias=True, runmax=False, yogi=False, eps=1e-8):
        self.b1 = _f(b1) # decay term
        self.b2 = _f(b2) # decay term
-       self.b1_t_default = _f(b1) # decay term power t
-       self.b2_t_default = _f(b2) # decay term power t
+       self.b1_t_default = _f(np.abs(b1)) # decay term power t
+       self.b2_t_default = _f(np.abs(b2)) # decay term power t
        self.power = _f(power)
        self.debias = bool(debias)
        self.runmax = bool(runmax)
@@ -487,17 +487,19 @@ class Adamlike(Optimizer):
            delta = mt
        elif self.power == 1:
            delta = mt / (vt + self.eps)
-       elif self.power == 1/2: # TODO: is this actually faster?
+       elif self.power == 1/2:
            delta = mt / (np.sqrt(vt) + self.eps)
-       elif self.power == 1/3: # TODO: is this actually faster?
+       elif self.power == 1/3:
            delta = mt / (np.cbrt(vt) + self.eps)
+       elif self.power == 1/4:
+           delta = mt / (np.sqrt(np.sqrt(vt)) + self.eps)
        else:
            delta = mt / (vt**self.power + self.eps)
        if self.debias:
            # decay gain.
-           self.b1_t *= self.b1
-           self.b2_t *= self.b2
+           self.b1_t *= np.abs(self.b1)
+           self.b2_t *= np.abs(self.b2)
        return -self.lr * delta
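The refinement above tracks |b1|^t and |b2|^t rather than b1^t and b2^t, which keeps the bias-correction powers positive even if a negative decay term is passed. A minimal sketch of how these powers typically feed Adam-style debiasing follows; the accumulator updates below are assumptions for illustration, since that code sits outside this hunk:

import numpy as np

b1, b2, lr, eps = 0.9, 0.999, 0.001, 1e-8
b1_t, b2_t = abs(b1), abs(b2)   # starts at |b1|, |b2|, as in b1_t_default above
m = v = 0.0

for g in [0.3, -0.1, 0.2]:      # a few fake scalar gradients
    m = b1 * m + (1 - b1) * g
    v = b2 * v + (1 - b2) * g * g
    mt = m / (1 - b1_t)         # debiased first moment
    vt = v / (1 - b2_t)         # debiased second moment
    step = -lr * mt / (np.sqrt(vt) + eps)   # the power == 1/2 branch
    b1_t *= abs(b1)             # "decay gain", as in the diff
    b2_t *= abs(b2)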

View File

@@ -62,6 +62,33 @@ class Dense(Layer):
        return dY @ self.coeffs.f.T
class DenseUnbiased(Layer):
    serialized = {
        'W': 'coeffs',
    }
    def __init__(self, dim, init=init_he_uniform, reg_w=None):
        super().__init__()
        self.dim = int(dim)
        self.output_shape = (dim,)
        self.coeffs = self._new_weights('coeffs', init=init,
                                        regularizer=reg_w)
    def make_shape(self, parent):
        shape = parent.output_shape
        self.input_shape = shape
        assert len(shape) == 1, shape
        self.coeffs.shape = (shape[0], self.dim)
    def forward(self, X):
        self.X = X
        return X @ self.coeffs.f
    def backward(self, dY):
        self.coeffs.g += self.X.T @ dY
        return dY @ self.coeffs.f.T
# more
class Conv1Dper(Layer):
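DenseUnbiased, added in the hunk above, is Dense with the bias vector removed, so the whole layer is a single matrix product. A standalone sanity check of the shapes and gradients it computes (plain numpy, illustrative names only):

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(8, 5))   # batch of 8, input dim 5
W = rng.normal(size=(5, 3))   # coeffs; no bias term anywhere
Y = X @ W                     # forward
dY = rng.normal(size=Y.shape)
dW = X.T @ dY                 # what backward accumulates into coeffs.g
dX = dY @ W.T                 # what backward returns to the parent layer
assert dW.shape == W.shape and dX.shape == X.shape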
@@ -265,3 +292,62 @@ class CosineDense(Dense):
        dX = ddot @ self.coeffs.f.T + dX_norm / self.X_norm * self.X
        return dX
class Sparse(Layer):
    # (WIP)
    # roughly implements a structured, sparsely-connected layer.
    # paper: https://arxiv.org/abs/1812.01164
    # TODO: (re)implement serialization.
    def __init__(self, dim, con, init=init_he_uniform, reg=None):
        super().__init__()
        self.dim = int(dim)
        self.con = int(con)
        self.output_shape = (dim,)
        self.coeffs = self._new_weights('coeffs', init=init, regularizer=reg)
        self.indices = None
    def make_shape(self, parent):
        shape = parent.output_shape
        self.input_shape = shape
        assert len(shape) == 1, shape
        self.coeffs.shape = (self.con, self.dim)
        self.size_in = shape[0]
        self.make_indices(self.size_in, self.con, self.dim)
    def make_indices(self, size_in, connectivity, size_out):
        basic = np.arange(size_in)
        indices = []
        inv_ind = []
        count = 0
        desired = size_out * connectivity
        # TODO: replace with a for loop.
        while count < desired:
            np.random.shuffle(basic)
            indices.append(basic.copy())
            inverse = np.zeros_like(basic)
            inverse[basic] = np.arange(len(basic)) + count
            inv_ind.append(inverse)
            count += len(basic)
        self.indices = np.concatenate(indices)[:desired].copy()
        self.inv_ind = np.concatenate(inv_ind)
    def forward(self, X):
        self.X = X
        self.O = X[:, self.indices].reshape(len(X), self.con, self.dim)
        return np.sum(self.O * self.coeffs.f, 1)
    def backward(self, dY):
        dY = np.expand_dims(dY, 1)
        self.coeffs.g += np.sum(dY * self.O, 0)
        dO = dY * self.coeffs.f
        x = dO
        batch_size = len(x)
        x = x.reshape(batch_size, -1)
        if x.shape[1] % self.size_in != 0:
            x = np.pad(x, ((0, 0), (0, self.size_in - x.shape[1] % self.size_in)))
        x = x[:, self.inv_ind].reshape(batch_size, -1, self.size_in)
        return x.sum(1)
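To unpack the Sparse layer's indexing: make_indices concatenates shuffled permutations of arange(size_in) until con * dim entries exist, so each output unit reads con inputs and each input feeds roughly con * dim / size_in outputs. A rough standalone check (plain numpy, hypothetical names) that the gather-multiply-sum forward pass matches an equivalent dense weight matrix built by scattering the same coefficients:

import numpy as np

rng = np.random.default_rng(0)
size_in, con, dim = 6, 3, 4
X = rng.normal(size=(2, size_in))
W = rng.normal(size=(con, dim))        # plays the role of coeffs.f

# same scheme as make_indices: shuffled permutations, concatenated, truncated
blocks = -(-con * dim // size_in)      # ceiling division
indices = np.concatenate([rng.permutation(size_in) for _ in range(blocks)])[:con * dim]

O = X[:, indices].reshape(len(X), con, dim)
Y_sparse = np.sum(O * W, axis=1)       # the Sparse forward pass

# scatter the sparse coefficients into a dense (size_in, dim) matrix
W_dense = np.zeros((size_in, dim))
flat = indices.reshape(con, dim)
for c in range(con):
    for d in range(dim):
        W_dense[flat[c, d], d] += W[c, d]

assert np.allclose(X @ W_dense, Y_sparse)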