diff --git a/onn/optimizer.py b/onn/optimizer.py
index a37c6b7..9eb9d1a 100644
--- a/onn/optimizer.py
+++ b/onn/optimizer.py
@@ -587,6 +587,54 @@ class Padam(Adamlike):
                          debias=debias, runmax=True, yogi=False, eps=eps)
 
 
+class MSVAG(Optimizer):
+    # paper: https://arxiv.org/abs/1705.07774
+    # this is the variance-adaptation aspect isolated from the rest of Adam.
+
+    def __init__(self, lr=0.1, b=0.99):
+        self.b = _f(b)
+        super().__init__(lr=lr)
+
+    def reset(self):
+        self.mt = None
+        self.vt = None
+        self.bt = self.b
+
+        super().reset()
+
+    def compute(self, dW, W):
+        if self.mt is None:
+            self.mt = np.zeros_like(dW)
+        if self.vt is None:
+            self.vt = np.zeros_like(dW)
+
+        mt = filter_gradients(self.mt, dW, self.b)
+        vt = filter_gradients(self.vt, np.square(dW), self.b)
+
+        # debiasing:
+        if self.bt != 1:
+            mt = mt / (1 - self.bt)
+            vt = vt / (1 - self.bt)
+            num = (1 - self.b) * (1 + self.bt)
+            den = (1 + self.b) * (1 - self.bt)
+            rho = num / den
+        else:
+            # technically, this should be 1 / (t + 1),
+            # but we don't keep track of t directly.
+            rho = 1
+
+        if rho != 1:
+            mt2 = np.square(mt)
+            s = (vt - mt2) / (1 - rho)
+            gamma = div0(mt2, mt2 + rho * s)
+        else:
+            gamma = 1
+
+        self.bt *= self.b
+
+        return -self.lr * (gamma * mt)
+
+
 AMSGrad = AMSgrad
 AdaDelta = Adadelta
 AdaGrad = Adagrad
diff --git a/onn/utility.py b/onn/utility.py
index 274aff1..063f491 100644
--- a/onn/utility.py
+++ b/onn/utility.py
@@ -29,6 +29,17 @@ def lower_priority():
     os.nice(1)
 
 
+def div0(a, b):
+    """elementwise division where division by zero yields zero"""
+    # http://stackoverflow.com/a/35696047
+    a = np.asanyarray(a)
+    b = np.asanyarray(b)
+    with np.errstate(divide='ignore', invalid='ignore'):
+        c = np.true_divide(a, b)
+        c[~np.isfinite(c)] = 0  # -inf, inf, NaN
+    return c
+
+
 def onehot(y):
     unique = np.unique(y)
     Y = np.zeros((y.shape[0], len(unique)), dtype=np.int8)
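
For reference, a minimal, self-contained sketch of the M-SVAG step added above
(Balles & Hennig, https://arxiv.org/abs/1705.07774), written against plain NumPy.
The helper names (div0 aside, ema-style updates, msvag_step) and the dict-based
state are illustrative only and are not part of the onn API; onn's
filter_gradients and Optimizer base class are assumed to behave like the
exponential moving averages used here, and 0 < b < 1 is assumed so the b == 1
branch of the diff is omitted.

import numpy as np


def div0(a, b):
    """elementwise a / b, with divisions by zero mapped to zero"""
    with np.errstate(divide='ignore', invalid='ignore'):
        c = np.true_divide(np.asanyarray(a, dtype=float), b)
    return np.where(np.isfinite(c), c, 0.0)  # replaces -inf, inf, NaN


def msvag_step(dW, state, lr=0.1, b=0.99):
    """one M-SVAG update; returns the weight delta for gradient dW"""
    if not state:
        state['m'] = np.zeros_like(dW)   # EMA of gradients
        state['v'] = np.zeros_like(dW)   # EMA of squared gradients
        state['bt'] = b                  # running b**t, for debiasing

    # exponential moving averages (the role filter_gradients plays in onn)
    state['m'] = b * state['m'] + (1 - b) * dW
    state['v'] = b * state['v'] + (1 - b) * np.square(dW)
    bt = state['bt']

    mt = state['m'] / (1 - bt)           # debiased first moment
    vt = state['v'] / (1 - bt)           # debiased second moment
    rho = (1 - b) * (1 + bt) / ((1 + b) * (1 - bt))

    if rho != 1:                          # rho == 1 only on the very first step
        mt2 = np.square(mt)
        s = (vt - mt2) / (1 - rho)        # estimated per-element gradient variance
        gamma = div0(mt2, mt2 + rho * s)  # variance-adaptation factor in [0, 1]
    else:
        gamma = 1.0                       # no variance estimate yet: plain step

    state['bt'] *= b
    return -lr * gamma * mt


# toy usage: minimize f(W) = 0.5 * ||W||^2, whose gradient is W itself
W, state = np.full(3, 5.0), {}
for _ in range(500):
    W += msvag_step(W, state)
print(W)  # elements shrink toward zero

On a deterministic objective like the toy above the gradient variance estimate s
goes to zero, so gamma approaches 1 and the step reduces to ordinary gradient
descent on the debiased momentum; with noisy minibatch gradients, gamma scales
each element down in proportion to how much of its second moment is noise.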