diff --git a/optim_nn.py b/optim_nn.py
index 0858d09..6ae5219 100755
--- a/optim_nn.py
+++ b/optim_nn.py
@@ -468,8 +468,10 @@ def multiresnet(x, width, depth, block=2, multi=1,
 
 # Toy Data {{{1
 
-inits = dict(he_normal=init_he_normal, he_uniform=init_he_uniform)
-activations = dict(sigmoid=Sigmoid, tanh=Tanh, relu=Relu, elu=Elu, gelu=GeluApprox)
+inits = dict(he_normal=init_he_normal, he_uniform=init_he_uniform,
+             glorot_normal=init_glorot_normal, glorot_uniform=init_glorot_uniform)
+activations = dict(sigmoid=Sigmoid, tanh=Tanh, lecun=LeCunTanh,
+                   relu=Relu, elu=Elu, gelu=GeluApprox, softplus=Softplus)
 
 def prettyize(data):
     if isinstance(data, np.ndarray):
@@ -692,10 +694,10 @@ def run(program, args=None):
 
         # style of resnet (order of layers, which layers, etc.)
        parallel_style = 'onelesssum',
-        activation = 'gelu',
+        activation = 'lecun',
         optim = 'adam',
         # note: most features only implemented for Adam
-        optim_decay1 = 2, # first momentum given in epochs (optional)
+        optim_decay1 = 24, # first momentum given in epochs (optional)
         optim_decay2 = 100, # second momentum given in epochs (optional)
         nesterov = True,
         batch_size = 64,
@@ -705,13 +707,13 @@
         learn = 1e-2,
         epochs = 24,
         learn_halve_every = 16, # only used with anneal/dumb
-        restarts = 8,
+        restarts = 5,
         restart_decay = 0.25, # only used with SGDR
         expando = lambda i: 24 * i,
 
         # misc
-        init = 'he_normal',
-        loss = 'msee',
+        init = 'glorot_uniform',
+        loss = 'mse',
         mloss = 'mse',
         ritual = 'default',
         restart_optim = False, # restarts also reset internal state of optimizer
diff --git a/optim_nn_core.py b/optim_nn_core.py
index 86c1f69..feba346 100644
--- a/optim_nn_core.py
+++ b/optim_nn_core.py
@@ -570,7 +570,7 @@ class Dropout(Layer):
 
 # Activation Layers {{{2
 
-class Sigmoid(Layer): # aka Logistic
+class Sigmoid(Layer): # aka Logistic, Expit (inverse of Logit)
     def forward(self, X):
         self.sig = sigmoid(X)
         return self.sig
@@ -578,6 +578,16 @@ class Sigmoid(Layer): # aka Logistic
     def backward(self, dY):
         return dY * self.sig * (1 - self.sig)
 
+class Softplus(Layer):
+    # integral of Sigmoid.
+
+    def forward(self, X):
+        self.X = X
+        return np.log(1 + np.exp(X))
+
+    def backward(self, dY):
+        return dY * sigmoid(self.X)
+
 class Tanh(Layer):
     def forward(self, X):
         self.sig = np.tanh(X)
@@ -586,6 +596,20 @@ class Tanh(Layer):
     def backward(self, dY):
         return dY * (1 - self.sig * self.sig)
 
+class LeCunTanh(Layer):
+    # paper: http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf
+    # paper: http://yann.lecun.com/exdb/publis/pdf/lecun-89.pdf
+    # scaled such that f([-1, 1]) = [-1, 1].
+    # helps preserve an input variance of 1.
+    # second derivative peaks around an input of ±1.
+
+    def forward(self, X):
+        self.sig = np.tanh(2 / 3 * X)
+        return 1.7159 * self.sig
+
+    def backward(self, dY):
+        return dY * (2 / 3 * 1.7159) * (1 - self.sig * self.sig)
+
 class Relu(Layer):
     def forward(self, X):
         self.cond = X >= 0
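
Not part of the patch, just a minimal standalone sketch of the math behind the two new activations (NumPy only, plain functions instead of the repo's Layer classes; all names here are illustrative), with a finite-difference check of the analytic gradients used in backward():

import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def softplus(x):
    # integral of the sigmoid: log(1 + exp(x))
    return np.log(1 + np.exp(x))

def softplus_grad(x):
    # derivative of softplus is the sigmoid itself
    return sigmoid(x)

def lecun_tanh(x):
    # LeCun's scaled tanh: 1.7159 * tanh(2/3 * x), so f(+-1) ~= +-1
    return 1.7159 * np.tanh(2 / 3 * x)

def lecun_tanh_grad(x):
    t = np.tanh(2 / 3 * x)
    return (2 / 3 * 1.7159) * (1 - t * t)

# sanity-check both gradients against central finite differences
x = np.linspace(-3, 3, 7)
eps = 1e-6
num_sp = (softplus(x + eps) - softplus(x - eps)) / (2 * eps)
num_lt = (lecun_tanh(x + eps) - lecun_tanh(x - eps)) / (2 * eps)
assert np.allclose(num_sp, softplus_grad(x), atol=1e-5)
assert np.allclose(num_lt, lecun_tanh_grad(x), atol=1e-5)
print(lecun_tanh(np.array([-1.0, 0.0, 1.0])))  # roughly [-1, 0, 1]

One stability note on the softplus form: np.log(1 + np.exp(X)) overflows for large positive X; np.logaddexp(0, X) computes the same value without overflow, should that ever matter here.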