add rough stratified k-folding utility class

2018-03-08 02:41:45 +01:00 · 2018-03-08 02:41:45 +01:00 · 9a45b26b7f
commit 9a45b26b7f
parent 65bc9b8a6f
1 changed files with 50 additions and 0 deletions
--- a/onn/utility.py
+++ b/onn/utility.py
@ -54,3 +54,53 @@ def log(left, right, update=False):
 class Dummy:
    pass
 class Folding:
    # NOTE: this class assumes classes are *exactly* evenly distributed.
    def __init__(self, inputs, outputs, folds):
        # outputs should be one-hot.
        self.folds = int(folds)
        # this temporarily converts one-hot encoding back to integer indices.
        classes = np.argmax(outputs, axis=-1)
        # we need to do stratified k-folds,
        # so let's put them in an order that's easy to split
        # without breaking class distribution.
        # don't worry, they'll get shuffled again in train_batched.
        classes = np.argmax(outputs, axis=-1)
        class_n = np.max(classes) + 1
        sorted_inputs = np.array([inputs[classes == n]
                                  for n in range(class_n)], inputs.dtype)
        sorted_outputs = np.arange(class_n
            ).repeat(sorted_inputs.shape[1]).reshape(sorted_inputs.shape[:2])
        # now to interleave the classes instead of having them grouped:
        inputs = np.swapaxes(sorted_inputs, 0, 1
            ).reshape(-1, *sorted_inputs.shape[2:])
        outputs = np.swapaxes(sorted_outputs, 0, 1
            ).reshape(-1, *sorted_outputs.shape[2:])
        # one final thing: we need to make our outputs one-hot again.
        self.inputs = inputs
        self.outputs = onehot(outputs)
        # now we can do stratified folds simply by contiguous slices!
        self.foldstep = len(self.inputs) // self.folds
        assert len(self.inputs) % self.foldstep == 0, \
            "bad number of folds; cannot be stratified"
    def fold(self, i):
        roll = i * self.foldstep
        split = (self.folds - 1) * self.foldstep
        train_inputs = np.roll(self.inputs, roll, axis=0)[:split]
        valid_inputs = np.roll(self.inputs, roll, axis=0)[split:]
        train_outputs = np.roll(self.outputs, roll, axis=0)[:split]
        valid_outputs = np.roll(self.outputs, roll, axis=0)[split:]
        return train_inputs, train_outputs, valid_inputs, valid_outputs