import numpy as np

from .float import *


class Loss:
    def forward(self, p, y):
        raise NotImplementedError("unimplemented", self)

    def backward(self, p, y):
        raise NotImplementedError("unimplemented", self)


class NLL(Loss):  # Negative Log Likelihood
    def forward(self, p, y):
        correct = p * y
        return np.mean(-correct)

    def backward(self, p, y):
        return -y / len(p)


class CategoricalCrossentropy(Loss):
    # lifted from theano

    def __init__(self, eps=1e-6):
        self.eps = _f(eps)

    def forward(self, p, y):
        p = np.clip(p, self.eps, 1 - self.eps)
        f = np.sum(-y * np.log(p) - (1 - y) * np.log(1 - p), axis=-1)
        return np.mean(f)

    def backward(self, p, y):
        p = np.clip(p, self.eps, 1 - self.eps)
        df = (p - y) / (p * (1 - p))
        return df / len(y)


class Accuracy(Loss):
    # returns the fraction of categories correctly predicted.
    # utilizes argmax(), so it cannot be used for gradient descent.
    # use CategoricalCrossentropy or NLL for that instead.

    def forward(self, p, y):
        correct = np.argmax(p, axis=-1) == np.argmax(y, axis=-1)
        return np.mean(correct)

    def backward(self, p, y):
        raise NotImplementedError("cannot take the gradient of Accuracy")


class ResidualLoss(Loss):
    def forward(self, p, y):
        return np.mean(self.f(p - y))

    def backward(self, p, y):
        ret = self.df(p - y) / len(y)
        return ret


class SquaredHalved(ResidualLoss):
    def f(self, r):
        return np.square(r) / 2

    def df(self, r):
        return r


class Squared(ResidualLoss):
    def f(self, r):
        return np.square(r)

    def df(self, r):
        return 2 * r


class Absolute(ResidualLoss):
    def f(self, r):
        return np.abs(r)

    def df(self, r):
        return np.sign(r)


class Huber(ResidualLoss):
    def __init__(self, delta=1.0):
        self.delta = _f(delta)

    def f(self, r):
        # quadratic near zero, linear in the tails.
        # compare against |r|, not r, so large negative
        # residuals also fall into the linear region.
        return np.where(np.abs(r) <= self.delta,
                        np.square(r) / 2,
                        self.delta * (np.abs(r) - self.delta / 2))

    def df(self, r):
        return np.where(np.abs(r) <= self.delta,
                        r,
                        self.delta * np.sign(r))


# more

class SomethingElse(ResidualLoss):
    # generalizes Absolute and SquaredHalved.
    # plot: https://www.desmos.com/calculator/fagjg9vuz7
    def __init__(self, a=4/3):
        assert 1 <= a <= 2, "parameter out of range"
        self.a = _f(a / 2)
        self.b = _f(2 / a)
        self.c = _f(2 / a - 1)

    def f(self, r):
        return self.a * np.abs(r)**self.b

    def df(self, r):
        return np.sign(r) * np.abs(r)**self.c


class Confidence(Loss):
    # this isn't "confidence" in any meaningful way (e.g. Bayesian);
    # it's just a metric of how large the value of the predicted class is.
    # when using it as a loss, it acts like a crappy regularizer.
    # it really just measures how much of a hot-shot the network thinks it is.

    def forward(self, p, y=None):
        categories = p.shape[-1]
        confidence = (np.max(p, axis=-1) - 1 / categories) / (1 - 1 / categories)
        # the exponent in softmax puts a maximum on confidence,
        # but we don't compensate for that. if necessary,
        # it'd be better to use an activation that doesn't have this limit.
        return np.mean(confidence)

    def backward(self, p, y=None):
        # in order to agree with the forward pass,
        # using this backwards pass as-is will minimize confidence.
        categories = p.shape[-1]
        detc = p / categories / (1 - 1 / categories)
        dmax = p == np.max(p, axis=-1, keepdims=True)
        return detc * dmax
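
if __name__ == "__main__":
    # illustrative sketch, not part of the library: compare the residual
    # losses on the same random predictions and targets to show the
    # forward()/backward() interface. the relative import above means this
    # only runs as a module, e.g. `python -m <package>.losses`, where the
    # package name is whatever this repo actually uses (assumed here).
    rng = np.random.default_rng(42)
    p = rng.normal(size=(8, 3))
    y = rng.normal(size=(8, 3))
    for loss in (SquaredHalved(), Squared(), Absolute(),
                 Huber(delta=1.0), SomethingElse(a=4/3)):
        fwd = loss.forward(p, y)        # scalar loss value
        grad = loss.backward(p, y)      # gradient w.r.t. p, same shape as p
        print(f"{type(loss).__name__:>14}: {float(fwd):.4f}  grad {grad.shape}")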