diff --git a/optim_nn_core.py b/optim_nn_core.py
index 27df335..0afe9f0 100644
--- a/optim_nn_core.py
+++ b/optim_nn_core.py
@@ -56,7 +56,6 @@ class CategoricalCrossentropy(Loss):
         self.eps = _f(eps)
 
     def forward(self, p, y):
-        # TODO: assert dimensionality and p > 0 (if not self.unsafe?)
        p = np.clip(p, self.eps, 1 - self.eps)
         f = np.sum(-y * np.log(p) - (1 - y) * np.log(1 - p), axis=-1)
         return np.mean(f)
@@ -68,7 +67,7 @@ class CategoricalCrossentropy(Loss):
 
 class Accuracy(Loss):
     # returns percentage of categories correctly predicted.
-    # utilizes max(), so it cannot be used for gradient descent.
+    # utilizes argmax(), so it cannot be used for gradient descent.
     # use CategoricalCrossentropy for that instead.
 
     def forward(self, p, y):
@@ -79,18 +78,26 @@ class Accuracy(Loss):
         raise NotImplementedError("cannot take the gradient of Accuracy")
 
 class Confidence(Loss):
+    # this isn't "confidence" in any meaningful (e.g. Bayesian) sense;
+    # it's just a measure of how large the predicted class's value is.
+    # when used as a loss, it acts like a crappy regularizer.
+    # it really just measures how much of a hot-shot the network thinks it is.
+
     def forward(self, p, y=None):
         categories = p.shape[-1]
-        #confidence = (p - 1/categories) / (1 - categories)
-        #confidence = 1 - np.min(p, axis=-1) * categories
         confidence = (np.max(p, axis=-1) - 1/categories) / (1 - 1/categories)
-        # there's also an upper bound on confidence
-        # due to the exponent in softmax,
-        # but we don't compensate for that. keep it simple.
+        # the exponent in softmax puts an upper bound on confidence,
+        # but we don't compensate for that. if necessary,
+        # it'd be better to use an activation that doesn't have this limit.
         return np.mean(confidence)
 
     def backward(self, p, y=None):
-        raise NotImplementedError("this is probably a bad idea")
+        # this will minimize confidence, agreeing with the forward pass;
+        # the gradient of the mean is constant at each argmax, 0 elsewhere.
+        categories = p.shape[-1]
+        detc = 1 / (1 - 1/categories) / len(p)
+        dmax = p == np.max(p, axis=-1, keepdims=True)
+        return detc * dmax
 
 class ResidualLoss(Loss):
     def forward(self, p, y):
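
For reference, a minimal standalone sketch of the Confidence math from the hunk above, with a finite-difference check of the backward pass. It assumes only plain NumPy; the toy values in `p` are made up for illustration, and the repo's `Loss` base class isn't needed here.

    import numpy as np

    # toy batch of softmax outputs: 2 samples, 3 categories (made-up values)
    p = np.array([[0.7, 0.2, 0.1],
                  [0.4, 0.35, 0.25]])
    categories = p.shape[-1]

    # forward: rescale the winning probability so a uniform
    # distribution (1/categories) maps to 0 and certainty maps to 1.
    confidence = (np.max(p, axis=-1) - 1/categories) / (1 - 1/categories)
    loss = np.mean(confidence)  # 0.325 for this p

    # backward, as in the hunk above: constant at each argmax, 0 elsewhere,
    # divided by the batch size to account for the mean.
    detc = 1 / (1 - 1/categories) / len(p)
    dmax = p == np.max(p, axis=-1, keepdims=True)
    grad = detc * dmax

    # finite-difference check of one argmax element
    eps = 1e-6
    bumped = p.copy()
    bumped[0, 0] += eps
    c = (np.max(bumped, axis=-1) - 1/categories) / (1 - 1/categories)
    assert np.isclose(grad[0, 0], (np.mean(c) - loss) / eps)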