parent a8952bebf5
commit 166644023e
2 changed files with 97 additions and 16 deletions

optim_nn.py  (65 changes)

@@ -132,6 +132,38 @@ class DenseOneLess(Dense):
         np.fill_diagonal(self.dcoeffs, 0)
         return dX
 
+class CosineDense(Dense):
+    # paper: https://arxiv.org/abs/1702.05870
+    # another implementation: https://github.com/farizrahman4u/keras-contrib/pull/36
+    # the paper doesn't mention bias,
+    # so we treat bias as an additional weight with a constant input of 1.
+    # this is correct in Dense layers, so i hope it's correct here too.
+
+    eps = 1e-4
+
+    def F(self, X):
+        self.X = X
+        self.X_norm = np.sqrt(np.square(X).sum(-1, keepdims=True) \
+                              + 1 + self.eps)
+        self.W_norm = np.sqrt(np.square(self.coeffs).sum(0, keepdims=True) \
+                              + np.square(self.biases) + self.eps)
+        self.dot = X.dot(self.coeffs) + self.biases
+        Y = self.dot / (self.X_norm * self.W_norm)
+        return Y
+
+    def dF(self, dY):
+        ddot = dY / self.X_norm / self.W_norm
+        dX_norm = -(dY * self.dot / self.W_norm).sum(-1, keepdims=True) / self.X_norm**2
+        dW_norm = -(dY * self.dot / self.X_norm).sum( 0, keepdims=True) / self.W_norm**2
+
+        self.dcoeffs[:] = self.X.T.dot(ddot) \
+                          + dW_norm / self.W_norm * self.coeffs
+        self.dbiases[:] = ddot.sum(0, keepdims=True) \
+                          + dW_norm / self.W_norm * self.biases
+        dX = ddot.dot(self.coeffs.T) + dX_norm / self.X_norm * self.X
+
+        return dX
+
 # Rituals {{{1
 
 def stochastic_multiply(W, gamma=0.5, allow_negation=True):
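
Note on the new CosineDense layer: with the bias treated as an extra weight seen against a constant input of 1, F computes the cosine similarity between each augmented input row and each augmented weight column, which is what the linked paper describes. The + 1 inside X_norm and the np.square(self.biases) inside W_norm are exactly the squared norms of those augmentations. A standalone numpy sketch (not part of the commit; eps dropped for clarity) that checks this reading:

    import numpy as np

    np.random.seed(42)
    X = np.random.randn(5, 3)   # batch of 5 inputs, 3 features
    W = np.random.randn(3, 4)   # 3 inputs -> 4 outputs
    b = np.random.randn(4)      # one bias per output

    # forward pass as written in CosineDense.F, with eps omitted
    X_norm = np.sqrt(np.square(X).sum(-1, keepdims=True) + 1)
    W_norm = np.sqrt(np.square(W).sum(0, keepdims=True) + np.square(b))
    Y = (X.dot(W) + b) / (X_norm * W_norm)

    # the same thing via explicit augmentation: a constant 1 appended to X,
    # and the bias appended as an extra row of W
    Xa = np.hstack([X, np.ones((5, 1))])
    Wa = np.vstack([W, b[None, :]])
    cos = Xa.dot(Wa) / (np.linalg.norm(Xa, axis=1, keepdims=True)
                        * np.linalg.norm(Wa, axis=0, keepdims=True))
    assert np.allclose(Y, cos)
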
@@ -300,7 +332,6 @@ def _mr_onelesssum(y, width, depth, block, multi, activation, style, FC, d):
         y = merger
     else:
         y = z
-    #y = y.feed(LayerNorm())
     return y
 
 _mr_styles = dict(
@@ -312,6 +343,11 @@ _mr_styles = dict(
 def multiresnet(x, width, depth, block=2, multi=1,
                 activation=Relu, style='batchless',
                 init=init_he_normal):
+    if style == 'cossim':
+        style = 'batchless'
+        DenseClass = CosineDense
+    else:
+        DenseClass = Dense
     if style not in _mr_styles:
         raise Exception('unknown resnet style', style)
 
@@ -320,7 +356,7 @@ def multiresnet(x, width, depth, block=2, multi=1,
 
     for d in range(depth):
         size = width
-        FC = lambda: Dense(size, init)
+        FC = lambda: DenseClass(size, init)
 
         if last_size != size:
             y = y.feed(FC())
@@ -433,8 +469,10 @@ def optim_from_config(config):
         mu = np.exp(-1/d2)
         optim = RMSprop(mu=mu)
     elif config.optim == 'sgd':
-        if config.momentum != 0:
-            optim = Momentum(mu=config.momentum, nesterov=config.nesterov)
+        d1 = config.optim_decay1 if 'optim_decay1' in config else 0
+        if d1 > 0:
+            b1 = np.exp(-1/d1)
+            optim = Momentum(mu=b1, nesterov=config.nesterov)
         else:
             optim = Optimizer()
     else:
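
The pattern above (and in the RMSprop branch) turns a decay horizon d, given in epochs, into a decay coefficient exp(-1/d): after d applications of that coefficient the old contribution has shrunk by a factor of e, so d acts as a time constant. A quick standalone check (not part of the commit) using the optim_decay1/optim_decay2 defaults that appear later in run():

    import numpy as np

    d1, d2 = 2, 100        # optim_decay1, optim_decay2 defaults
    b1 = np.exp(-1/d1)     # ~0.6065
    mu = np.exp(-1/d2)     # ~0.9900
    print(b1**d1, mu**d2)  # both ~0.3679, i.e. 1/e
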
@@ -453,6 +491,7 @@ def learner_from_config(config, optim, rscb):
     elif config.learner == 'anneal':
         learner = AnnealingLearner(optim, epochs=config.epochs, rate=config.learn,
                                    halve_every=config.learn_halve_every)
+        log("final learning rate", "{:10.8f}".format(learner.final_rate))
     elif config.learner == 'dumb':
         learner = DumbLearner(optim, epochs=config.epochs, rate=config.learn,
                               halve_every=config.learn_halve_every,
@@ -462,7 +501,6 @@ def learner_from_config(config, optim, rscb):
         log("final learning rate", "{:10.8f}".format(learner.final_rate))
     elif config.learner == 'sgd':
         learner = Learner(optim, epochs=config.epochs, rate=config.learn)
-        log("final learning rate", "{:10.8f}".format(learner.final_rate))
     else:
         raise Exception('unknown learner', config.learner)
 
@@ -559,11 +597,10 @@ def run(program, args=None):
         parallel_style = 'onelesssum',
         activation = 'gelu',
 
-        optim = 'adam',
-        optim_decay1 = 2, # given in epochs (optional)
-        optim_decay2 = 100, # given in epochs (optional)
-        momentum = 0.90, # only used with SGD
-        nesterov = True, # only used with SGD or Adam
+        optim = 'adam', # note: most features only implemented for Adam
+        optim_decay1 = 2, # first momentum given in epochs (optional)
+        optim_decay2 = 100, # second momentum given in epochs (optional)
+        nesterov = True,
         batch_size = 64,
 
         # learning parameters
@@ -571,7 +608,7 @@ def run(program, args=None):
         learn = 1e-2,
         epochs = 24,
         learn_halve_every = 16, # only used with anneal/dumb
-        restarts = 2,
+        restarts = 8,
         restart_decay = 0.25, # only used with SGDR
         expando = lambda i: 24 * i,
 
@@ -585,9 +622,9 @@ def run(program, args=None):
 
         # logging/output
         log10_loss = True, # personally, i'm sick of looking linear loss values!
-        #fancy_logs = True, # unimplemented
+        #fancy_logs = True, # unimplemented (can't turn it off yet)
 
-        problem = 3,
+        problem = 2,
         compare = (
             # best results for ~10,000 parameters
             # training/validation pairs for each problem (starting from problem 0):
@@ -595,7 +632,7 @@ def run(program, args=None):
             (7.577717e-04, 1.255284e-03),
             # 1080 epochs on these...
             (1.790511e-07, 2.785208e-07),
-            (2.233277e-08, 3.580281e-08),
+            ( 10**-7.774, 10**-7.626),
            (5.266719e-07, 5.832677e-06), # overfitting? bad valid set?
         ),
 
second changed file:

@@ -33,6 +33,14 @@ def init_he_uniform(size, ins, outs):
     s = np.sqrt(6 / ins)
     return np.random.uniform(-s, s, size=size)
 
+def init_glorot_normal(size, ins, outs):
+    s = np.sqrt(2 / (ins + outs))
+    return np.random.normal(0, s, size=size)
+
+def init_glorot_uniform(size, ins, outs):
+    s = np.sqrt(6 / (ins + outs))
+    return np.random.uniform(-s, s, size=size)
+
 # Loss functions {{{1
 
 class Loss:
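
For reference, these are the Glorot (Xavier) initializations from Glorot & Bengio (2010). Both variants target the same weight variance, 2 / (ins + outs): the normal one directly, and the uniform one through var(U(-s, s)) = s**2 / 3 with s = sqrt(6 / (ins + outs)). A standalone check (not part of the commit):

    import numpy as np

    ins, outs = 300, 100
    target = 2 / (ins + outs)     # 0.005

    w_normal = np.random.normal(0, np.sqrt(2 / (ins + outs)), size=1000000)
    s = np.sqrt(6 / (ins + outs))
    w_uniform = np.random.uniform(-s, s, size=1000000)

    print(target, w_normal.var(), w_uniform.var())   # all roughly 0.005
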
@@ -162,8 +170,6 @@ class Adam(Optimizer):
     # * Adam == RMSprop when
     # Adam.b1 == 0
     # Adam.b2 == RMSprop.mu
-    # Adam.b1_t == 0
-    # Adam.b2_t == 0
 
     def __init__(self, alpha=0.002, b1=0.9, b2=0.999, eps=1e-8):
         self.b1 = _f(b1) # decay term
@@ -373,6 +379,20 @@ class Input(Layer):
         #self.dY = dY
         return np.zeros_like(dY)
 
+class Reshape(Layer):
+    def __init__(self, new_shape):
+        super().__init__()
+        self.shape = tuple(new_shape)
+        self.output_shape = self.shape
+
+    def F(self, X):
+        self.batch_size = X.shape[0]
+        return X.reshape(self.batch_size, *self.output_shape)
+
+    def dF(self, dY):
+        assert dY.shape[0] == self.batch_size
+        return dY.reshape(self.batch_size, *self.input_shape)
+
 class Affine(Layer):
     def __init__(self, a=1, b=0):
         super().__init__()
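
A minimal round-trip sketch for the new Reshape layer (not part of the commit). It assumes Layer() needs no constructor arguments here and that the framework normally fills in input_shape when the layer is wired into a model, so that attribute is set by hand:

    import numpy as np

    r = Reshape((28, 28))
    r.input_shape = (784,)         # normally set when the layer is connected

    x = np.random.randn(64, 784)
    y = r.F(x)                     # shape (64, 28, 28)
    dx = r.dF(np.ones_like(y))     # gradient flows back as shape (64, 784)
    assert y.shape == (64, 28, 28) and dx.shape == (64, 784)
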
@@ -666,6 +686,30 @@ class Ritual: # i'm just making up names at this point
             return avg_loss, losses
         return avg_loss
 
+    def test_batched(self, inputs, outputs, batch_size, return_losses=False):
+        cumsum_loss = _0
+        batch_count = inputs.shape[0] // batch_size
+        losses = []
+        assert inputs.shape[0] % batch_size == 0, \
+            "inputs is not evenly divisible by batch_size" # TODO: lift this restriction
+        for b in range(batch_count):
+            bi = b * batch_size
+            batch_inputs  = inputs[ bi:bi+batch_size]
+            batch_outputs = outputs[bi:bi+batch_size]
+
+            predicted = self.model.forward(batch_inputs)
+
+            batch_loss = self.measure(predicted, batch_outputs)
+            if np.isnan(batch_loss):
+                raise Exception("nan")
+            cumsum_loss += batch_loss
+            if return_losses:
+                losses.append(batch_loss)
+        avg_loss = cumsum_loss / _f(batch_count)
+        if return_losses:
+            return avg_loss, losses
+        return avg_loss
+
 # Learners {{{1
 
 class Learner:
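
Hypothetical usage of test_batched (the ritual, valid_inputs, and valid_outputs names are assumed from context, not part of the commit). Note the divisibility restriction in the assert: 12800 % 64 == 0 is fine, but 60000 % 64 == 32 would raise.

    avg_loss, losses = ritual.test_batched(valid_inputs, valid_outputs,
                                            batch_size=64, return_losses=True)
    log("validation loss", "{:12.6e}".format(avg_loss))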