diff --git a/optim_nn.py b/optim_nn.py
old mode 100644
new mode 100755
index e808b79..8f66952
--- a/optim_nn.py
+++ b/optim_nn.py
@@ -51,6 +51,36 @@ class SomethingElse(ResidualLoss):
     def df(self, r):
         return np.sign(r) * np.abs(r)**self.c
 
+class Confidence(Loss):
+    # this isn't "confidence" in any meaningful way; (e.g. Bayesian)
+    # it's just a metric of how large the value is of the predicted class.
+    # when using it for loss, it acts like a crappy regularizer.
+    # it really just measures how much of a hot-shot the network thinks it is.
+
+    def forward(self, p, y=None):
+        categories = p.shape[-1]
+        confidence = (np.max(p, axis=-1) - 1/categories) / (1 - 1/categories)
+        # the exponent in softmax puts a maximum on confidence,
+        # but we don't compensate for that. if necessary,
+        # it'd be better to use an activation that doesn't have this limit.
+        return np.mean(confidence)
+
+    def backward(self, p, y=None):
+        # in order to agree with the forward pass,
+        # using this backwards pass as-is will minimize confidence.
+        categories = p.shape[-1]
+        detc = p / categories / (1 - 1/categories)
+        dmax = p == np.max(p, axis=-1, keepdims=True)
+        return detc * dmax
+
+class NLL(Loss): # Negative Log Likelihood
+    def forward(self, p, y):
+        correct = p * y
+        return np.mean(-correct)
+
+    def backward(self, p, y):
+        return -y / len(p)
+
 # Nonparametric Layers {{{1
 
 # Parametric Layers {{{1
diff --git a/optim_nn_core.py b/optim_nn_core.py
index 0afe9f0..e6b3599
--- a/optim_nn_core.py
+++ b/optim_nn_core.py
@@ -77,28 +77,6 @@ class Accuracy(Loss):
     def backward(self, p, y):
         raise NotImplementedError("cannot take the gradient of Accuracy")
 
-class Confidence(Loss):
-    # this isn't "confidence" in any meaningful way; (e.g. Bayesian)
-    # it's just a metric of how large the value is of the predicted class.
-    # when using it for loss, it acts like a crappy regularizer.
-    # it really just measures how much of a hot-shot the network thinks it is.
-
-    def forward(self, p, y=None):
-        categories = p.shape[-1]
-        confidence = (np.max(p, axis=-1) - 1/categories) / (1 - 1/categories)
-        # the exponent in softmax puts a maximum on confidence,
-        # but we don't compensate for that. if necessary,
-        # it'd be better to use an activation that doesn't have this limit.
-        return np.mean(confidence)
-
-    def backward(self, p, y=None):
-        # in order to agree with the forward pass,
-        # using this backwards pass as-is will minimize confidence.
-        categories = p.shape[-1]
-        detc = p / categories / (1 - 1/categories)
-        dmax = p == np.max(p, axis=-1, keepdims=True)
-        return detc * dmax
-
 class ResidualLoss(Loss):
     def forward(self, p, y):
         return np.mean(self.f(p - y))
@@ -327,12 +305,12 @@ class Layer:
 
     # TODO: better names for these (still)
 
-    def _propogate(self, edges):
+    def _propagate(self, edges):
         if not self.unsafe:
             assert len(edges) == 1, self
         return self.forward(edges[0])
 
-    def _backpropogate(self, edges):
+    def _backpropagate(self, edges):
         if len(edges) == 1:
             return self.backward(edges[0])
         return sum((self.backward(dY) for dY in edges))
@@ -378,7 +356,7 @@ class Layer:
         if not self.unsafe:
             self.validate_input(X)
         edges.append(X)
-        Y = self._propogate(edges)
+        Y = self._propagate(edges)
         if not self.unsafe:
             self.validate_output(Y)
         return Y
@@ -393,7 +371,7 @@ class Layer:
         if not self.unsafe:
             self.validate_output(dY)
         edges.append(dY)
-        dX = self._backpropogate(edges)
+        dX = self._backpropagate(edges)
         if not self.unsafe:
             self.validate_input(dX)
         return dX
@@ -443,7 +421,7 @@ class Flatten(Layer):
         assert dY.shape[0] == self.batch_size
         return dY.reshape(self.batch_size, *self.input_shape)
 
-class Affine(Layer):
+class ConstAffine(Layer):
     def __init__(self, a=1, b=0):
         super().__init__()
         self.a = _f(a)
@@ -456,10 +434,10 @@ class Affine(Layer):
         return dY * self.a
 
 class Sum(Layer):
-    def _propogate(self, edges):
+    def _propagate(self, edges):
         return np.sum(edges, axis=0)
 
-    def _backpropogate(self, edges):
+    def _backpropagate(self, edges):
         #assert len(edges) == 1, "unimplemented"
         return edges[0] # TODO: does this always work?
 
@@ -515,8 +493,6 @@ class GeluApprox(Layer):
         return dY * self.sig * (1 + self.a * (1 - self.sig))
 
 class Softmax(Layer):
-    # lifted from theano
-
     def __init__(self, axis=-1):
         super().__init__()
         self.axis = int(axis)
@@ -529,9 +505,19 @@ class Softmax(Layer):
         return self.sm
 
     def backward(self, dY):
-        dYsm = dY * self.sm
-        dX = dYsm - np.sum(dYsm, axis=-1, keepdims=True) * self.sm
-        return dX
+        return (dY - np.sum(dY * self.sm, axis=-1, keepdims=True)) * self.sm
+
+class LogSoftmax(Softmax):
+    def __init__(self, axis=-1, eps=1e-6):
+        super().__init__()
+        self.axis = int(axis)
+        self.eps = _f(eps)
+
+    def forward(self, X):
+        return np.log(super().forward(X) + self.eps)
+
+    def backward(self, dY):
+        return dY - np.sum(dY, axis=-1, keepdims=True) * self.sm
 
 # Parametric Layers {{{1
 
@@ -626,7 +612,7 @@ class Model:
         values = dict()
         input_node = self.ordered_nodes[0]
         output_node = self.ordered_nodes[-1]
-        values[input_node] = input_node._propogate(np.expand_dims(X, 0))
+        values[input_node] = input_node._propagate(np.expand_dims(X, 0))
         for node in self.ordered_nodes[1:]:
             values[node] = node.propagate(values)
         return values[output_node]
@@ -634,7 +620,7 @@ class Model:
     def backward(self, error):
         values = dict()
         output_node = self.ordered_nodes[-1]
-        values[output_node] = output_node._backpropogate(np.expand_dims(error, 0))
+        values[output_node] = output_node._backpropagate(np.expand_dims(error, 0))
         for node in reversed(self.ordered_nodes[:-1]):
             values[node] = node.backpropagate(values)
         return self.dW
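
Supplementary note (not part of the patch): a minimal, standalone NumPy sanity check that mirrors the formulas from the new LogSoftmax and NLL classes above, showing that chaining NLL.backward into LogSoftmax.backward reproduces the familiar softmax cross-entropy gradient (softmax(x) - y) / batch_size for one-hot targets. The helper names below are made up for this check and do not exist in the repository; the max-subtraction in the softmax is a stability assumption here, since Softmax.forward is not shown in this diff.

import numpy as np

def log_softmax(x, eps=1e-6):
    # softmax followed by log(. + eps), as in LogSoftmax.forward above;
    # max-subtraction added here for numerical stability (an assumption)
    sm = np.exp(x - np.max(x, axis=-1, keepdims=True))
    sm /= np.sum(sm, axis=-1, keepdims=True)
    return sm, np.log(sm + eps)

def nll_backward(logp, y):
    # as in NLL.backward above
    return -y / len(logp)

def log_softmax_backward(dY, sm):
    # as in LogSoftmax.backward above
    return dY - np.sum(dY, axis=-1, keepdims=True) * sm

rng = np.random.default_rng(0)
x = rng.normal(size=(4, 10))
y = np.eye(10)[rng.integers(0, 10, size=4)]  # one-hot targets

sm, logp = log_softmax(x)
dX = log_softmax_backward(nll_backward(logp, y), sm)
# since the one-hot rows of y each sum to 1, the chained gradient
# collapses to (sm - y) / batch_size
assert np.allclose(dX, (sm - y) / len(x))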