Compare commits


No commits in common. "a8871d4a67ffa532bffb805a957a040335617418" and "06bbfc6340f769d27cfe66b31f727717b469524e" have entirely different histories.

4 changed files with 6 additions and 115 deletions

View File

@@ -262,23 +262,6 @@ class HardClip(Activation): # aka HardTanh when at default settings
        return dY * ((self.X >= self.lower) & (self.X <= self.upper))

class ISRLU(Activation):
    # Inverse Square Root Linear Unit, a faster alternative to ELU
    # paper: https://arxiv.org/abs/1710.09967

    def __init__(self, alpha=1.0):
        super().__init__()
        self.alpha = _f(alpha)

    def forward(self, X):
        self.memo = np.reciprocal(np.sqrt(1 + X * X * self.alpha))
        self.cond = X < 0
        return np.where(self.cond, X * self.memo, X)

    def backward(self, dY):
        # gradient is (1 + alpha * x**2)**-1.5 where x < 0, and 1 elsewhere.
        return np.where(self.cond, self.memo * self.memo * self.memo, 1) * dY

class PolyFeat(Layer):
    # i haven't yet decided if this counts as an Activation subclass
    # due to the increased output size, so i'm opting not to inherit it.

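For reference, a minimal standalone sketch of the same activation in plain NumPy (it does not use the library's Activation base class or the _f casting helper; the function names here are illustrative), with a finite-difference check confirming the analytic gradient:

import numpy as np

def isrlu(x, alpha=1.0):
    # x / sqrt(1 + alpha*x^2) for x < 0, identity otherwise
    memo = np.reciprocal(np.sqrt(1 + alpha * x * x))
    return np.where(x < 0, x * memo, x)

def isrlu_grad(x, alpha=1.0):
    # (1 + alpha*x^2)**-1.5 for x < 0, 1 otherwise
    memo = np.reciprocal(np.sqrt(1 + alpha * x * x))
    return np.where(x < 0, memo**3, 1.0)

x = np.linspace(-3, 3, 13)
eps = 1e-6
numeric = (isrlu(x + eps) - isrlu(x - eps)) / (2 * eps)
print(np.max(np.abs(numeric - isrlu_grad(x))))  # expect ~1e-9 or smaller
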
View File

@@ -12,10 +12,6 @@ class Loss:

class NLL(Loss):  # Negative Log Likelihood
    # NOTE: this is a misnomer -- the "log" part is not implemented here.
    # instead, you should use a Log activation at the end of your network,
    # e.g. LogSoftmax.
    # TODO: simplify the math that comes about it.

    def forward(self, p, y):
        correct = p * y
        return np.mean(-correct)

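To make the LogSoftmax note concrete: a small standalone NumPy sketch (not using the library; the log_softmax helper is written inline) showing that this forward, fed log-probabilities and one-hot targets, is the usual mean cross-entropy up to a 1/num_classes factor, because np.mean averages over every element of the (batch, classes) array:

import numpy as np

def log_softmax(z):
    # numerically stable log-softmax along the class axis
    z = z - z.max(axis=1, keepdims=True)
    return z - np.log(np.exp(z).sum(axis=1, keepdims=True))

def nll(p, y):
    # mirrors NLL.forward above: mean of -(p * y) over every element
    return np.mean(-(p * y))

batch, classes = 4, 3
rng = np.random.default_rng(0)
z = rng.normal(size=(batch, classes))                   # raw network outputs
y = np.eye(classes)[rng.integers(0, classes, batch)]    # one-hot targets

p = log_softmax(z)
ce = -np.mean(np.sum(p * y, axis=1))        # ordinary mean cross-entropy
print(np.isclose(nll(p, y), ce / classes))  # True: same quantity, scaled by 1/classes
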
View File

@@ -436,8 +436,8 @@ class Adamlike(Optimizer):
                 debias=True, runmax=False, yogi=False, eps=1e-8):
        self.b1 = _f(b1)  # decay term
        self.b2 = _f(b2)  # decay term
        self.b1_t_default = _f(np.abs(b1))  # decay term power t
        self.b2_t_default = _f(np.abs(b2))  # decay term power t
        self.b1_t_default = _f(b1)  # decay term power t
        self.b2_t_default = _f(b2)  # decay term power t
        self.power = _f(power)
        self.debias = bool(debias)
        self.runmax = bool(runmax)
@@ -487,19 +487,17 @@ class Adamlike(Optimizer):
            delta = mt
        elif self.power == 1:
            delta = mt / (vt + self.eps)
        elif self.power == 1/2:
        elif self.power == 1/2:  # TODO: is this actually faster?
            delta = mt / (np.sqrt(vt) + self.eps)
        elif self.power == 1/3:
        elif self.power == 1/3:  # TODO: is this actually faster?
            delta = mt / (np.cbrt(vt) + self.eps)
        elif self.power == 1/4:
            delta = mt / (np.sqrt(np.sqrt(vt)) + self.eps)
        else:
            delta = mt / (vt**self.power + self.eps)

        if self.debias:
            # decay gain.
            self.b1_t *= np.abs(self.b1)
            self.b2_t *= np.abs(self.b2)
            self.b1_t *= self.b1
            self.b2_t *= self.b2

        return -self.lr * delta

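The power parameter generalizes the Adam denominator: power=1/2 is plain Adam, and the special-cased branches just swap vt**power for cheaper sqrt/cbrt calls. A standalone sketch of one such step, assuming the standard Adam moment and bias-correction bookkeeping (which this hunk does not show; only the power-dependent denominator mirrors the code above, and the helper name is illustrative):

import numpy as np

def adamlike_step(g, state, lr=1e-3, b1=0.9, b2=0.999, power=0.5, eps=1e-8):
    # one parameter update with a generalized denominator exponent;
    # power=0.5 recovers Adam, other powers change how strongly the
    # second-moment estimate normalizes the step.
    state['m'] = b1 * state['m'] + (1 - b1) * g
    state['v'] = b2 * state['v'] + (1 - b2) * g * g
    state['b1_t'] *= b1
    state['b2_t'] *= b2
    mt = state['m'] / (1 - state['b1_t'])   # debiased first moment
    vt = state['v'] / (1 - state['b2_t'])   # debiased second moment
    return -lr * mt / (vt**power + eps)

g = np.array([0.1, -0.2, 0.3])
state = dict(m=np.zeros(3), v=np.zeros(3), b1_t=1.0, b2_t=1.0)
print(adamlike_step(g, state))  # first step is roughly -lr * sign(g) when power=0.5
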
View File

@@ -62,33 +62,6 @@ class Dense(Layer):
        return dY @ self.coeffs.f.T

class DenseUnbiased(Layer):
    serialized = {
        'W': 'coeffs',
    }

    def __init__(self, dim, init=init_he_uniform, reg_w=None):
        super().__init__()
        self.dim = int(dim)
        self.output_shape = (dim,)
        self.coeffs = self._new_weights('coeffs', init=init,
                                        regularizer=reg_w)

    def make_shape(self, parent):
        shape = parent.output_shape
        self.input_shape = shape
        assert len(shape) == 1, shape
        self.coeffs.shape = (shape[0], self.dim)

    def forward(self, X):
        self.X = X
        return X @ self.coeffs.f

    def backward(self, dY):
        self.coeffs.g += self.X.T @ dY
        return dY @ self.coeffs.f.T

# more
class Conv1Dper(Layer):
@@ -292,62 +265,3 @@ class CosineDense(Dense):
        dX = ddot @ self.coeffs.f.T + dX_norm / self.X_norm * self.X
        return dX

class Sparse(Layer):
    # (WIP)
    # roughly implements a structured, sparsely-connected layer.
    # paper: https://arxiv.org/abs/1812.01164
    # TODO: (re)implement serialization.

    def __init__(self, dim, con, init=init_he_uniform, reg=None):
        super().__init__()
        self.dim = int(dim)
        self.con = int(con)
        self.output_shape = (dim,)
        self.coeffs = self._new_weights('coeffs', init=init, regularizer=reg)
        self.indices = None

    def make_shape(self, parent):
        shape = parent.output_shape
        self.input_shape = shape
        assert len(shape) == 1, shape
        self.coeffs.shape = (self.con, self.dim)
        self.size_in = shape[0]
        self.make_indices(self.size_in, self.con, self.dim)

    def make_indices(self, size_in, connectivity, size_out):
        basic = np.arange(size_in)
        indices = []
        inv_ind = []
        count = 0
        desired = size_out * connectivity
        # TODO: replace with a for loop.
        while count < desired:
            np.random.shuffle(basic)
            indices.append(basic.copy())
            inverse = np.zeros_like(basic)
            inverse[basic] = np.arange(len(basic)) + count
            inv_ind.append(inverse)
            count += len(basic)
        self.indices = np.concatenate(indices)[:desired].copy()
        self.inv_ind = np.concatenate(inv_ind)

    def forward(self, X):
        self.X = X
        self.O = X[:, self.indices].reshape(len(X), self.con, self.dim)
        return np.sum(self.O * self.coeffs.f, 1)

    def backward(self, dY):
        dY = np.expand_dims(dY, 1)
        self.coeffs.g += np.sum(dY * self.O, 0)
        dO = dY * self.coeffs.f
        x = dO
        batch_size = len(x)
        x = x.reshape(batch_size, -1)
        if x.shape[1] % self.size_in != 0:
            x = np.pad(x, ((0, 0), (0, self.size_in - x.shape[1] % self.size_in)))
        x = x[:, self.inv_ind].reshape(batch_size, -1, self.size_in)
        return x.sum(1)
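To see what make_indices is doing: it concatenates shuffled permutations of the input indices until it has con entries for each of the dim output units, and inv_ind stores the inverse permutation used to scatter gradients back in backward. A small standalone NumPy sketch of that connectivity and the forward gather/sum (variable names are illustrative; it does not use the library's Layer or weights machinery):

import numpy as np

rng = np.random.default_rng(0)
size_in, con, dim = 8, 3, 5

# build indices the same way: shuffled permutations, truncated to con*dim
desired = con * dim
chunks = []
while sum(len(c) for c in chunks) < desired:
    chunks.append(rng.permutation(size_in))
indices = np.concatenate(chunks)[:desired]

W = rng.normal(size=(con, dim)) * np.sqrt(2.0 / con)  # He-style init, shape (con, dim)
X = rng.normal(size=(4, size_in))                     # batch of 4 inputs

O = X[:, indices].reshape(len(X), con, dim)  # gather each output unit's con inputs
Y = np.sum(O * W, axis=1)                    # weighted sum per output unit
print(Y.shape)                               # (4, 5)
print(np.bincount(indices, minlength=size_in))  # each input feeds roughly desired/size_in units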