-- smbot/ars.lua

-- Augmented Random Search
-- https://arxiv.org/abs/1803.07055

local abs = math.abs
local exp = math.exp
local floor = math.floor
local insert = table.insert
local remove = table.remove
local ipairs = ipairs
local log = math.log
local max = math.max
local print = print
local sqrt = math.sqrt

local Base = require "Base"

local nn = require "nn"
local normal = nn.normal
local prod = nn.prod
local uniform = nn.uniform
local zeros = nn.zeros

local util = require "util"
local argsort = util.argsort
local calc_mean_dev = util.calc_mean_dev
local calc_mean_dev_unbiased = util.calc_mean_dev_unbiased
local normalize_sums = util.normalize_sums
local sign = util.sign

-- Optimizer class for Augmented Random Search, created through Base:extend()
-- (presumably Base's subclassing helper; confirm against the Base module).
local Ars = Base:extend()

-- Precomputed exp() values for the exponents -1, 0 and 1.
-- The disabled powersign momentum code in Ars:tell indexes this table with
-- the product of two sign() results.
local exp_lut = {
    [-1] = exp(-1),
    [0] = exp(0),
    [1] = exp(1),
}

--- Pick the indices of the `top` highest rewards.
-- With `antithetic` set, `scored` is read as consecutive (pos, neg) pairs and
-- every pair is represented by the better of its two rewards, so the returned
-- indices number the pairs. Otherwise they index `scored` directly.
-- @param scored     list of rewards.
-- @param top        maximum number of indices to keep.
-- @param antithetic truthy when `scored` holds pos/neg pairs.
-- @return the table produced by argsort (descending comparator), trimmed so
--         that no entry beyond position `top` remains.
local function collect_best_indices(scored, top, antithetic)
    local candidates = scored
    if antithetic then
        -- walk the pairs two entries at a time; an unpaired trailing
        -- entry is ignored.
        candidates = {}
        local pair = 0
        for k = 2, #scored, 2 do
            pair = pair + 1
            candidates[pair] = max(scored[k - 1], scored[k])
        end
    end

    local function descending(a, b) return a > b end
    local indices = argsort(candidates, descending)

    -- discard everything ranked below the top entries.
    for i = #candidates, top + 1, -1 do indices[i] = nil end
    return indices
end

--- Set up the optimizer state.
-- @param dims       number of parameters being optimized.
-- @param popsize    number of perturbation directions per iteration
--                   (default: 4 + 3 * floor(log(dims))).
-- @param poptop     how many of the best directions contribute to an update
--                   (default: popsize); must not exceed popsize.
-- @param base_rate  learning rate, stored as self.param_rate
--                   (default: 3/5 * (3 + log(dims)) / (dims * sqrt(dims))).
-- @param sigma      scale applied to the sampled noise (default: 1).
-- @param antithetic sample noise in +/- pairs (default: true); when enabled
--                   the stored popsize is doubled.
-- @param momentum   momentum coefficient (default: 0, which disables it).
-- @param beta       extra multiplier applied to the step (default: 1.0).
function Ars:init(dims, popsize, poptop, base_rate, sigma, antithetic,
                  momentum, beta)
    self.dims = dims
    self.popsize = popsize or 4 + (3 * floor(log(dims)))
    self.param_rate = base_rate
        or 3/5 * (3 + log(dims)) / (dims * sqrt(dims))
    self.sigma = sigma or 1

    -- explicit nil check so that a caller-supplied `false` is respected.
    if antithetic == nil then antithetic = true end
    self.antithetic = antithetic

    self.momentum = momentum or 0
    self.beta = beta or 1.0

    -- validate against the resolved self.popsize (still undoubled here):
    -- the raw `popsize` argument is nil whenever the default is in effect,
    -- and comparing against nil would raise an error.
    self.poptop = poptop or self.popsize
    assert(self.poptop <= self.popsize, "poptop must not exceed popsize")
    if self.antithetic then self.popsize = self.popsize * 2 end

    self._params = zeros(self.dims)
    -- the momentum accumulator is only allocated when momentum is in use.
    if self.momentum > 0 then self.accum = zeros(self.dims) end

    self.evals = 0
end

--- Get, and optionally overwrite, the current parameter vector.
-- @param new_params optional list of values; when given it must have the same
--                   length as the current parameters and is copied in place.
-- @return the internal parameter table (not a copy).
function Ars:params(new_params)
    if new_params == nil then return self._params end

    assert(#self._params == #new_params, "new parameters have the wrong size")
    local current = self._params
    for idx, value in ipairs(new_params) do
        current[idx] = value
    end
    return current
end

--- Shrink every parameter toward zero.
-- Each parameter is multiplied by (1 - param_rate * param_decay * sigma).
-- Nothing happens unless param_decay is greater than zero.
-- @param param_decay decay strength for the parameters.
-- @param sigma_decay accepted for interface compatibility; not used here.
function Ars:decay(param_decay, sigma_decay)
    if not (param_decay > 0) then return end

    -- the shrink factor is the same for every parameter.
    local keep = 1 - self.param_rate * param_decay * self.sigma
    local params = self._params
    for idx, value in ipairs(params) do
        params[idx] = value * keep
    end
end

--- Sample a population of candidate parameter vectors.
-- Every candidate is the current parameters plus a noise vector whose entries
-- are sigma * normal(). With antithetic sampling, each even-numbered candidate
-- reuses the negated noise of the candidate just before it.
-- The noise vectors are kept in self.noise for the following tell().
-- @return the list of candidates, and the list of matching noise vectors.
function Ars:ask()
    local dims = self.dims
    local mirrored = self.antithetic
    local candidates, perturbations = {}, {}

    for k = 1, self.popsize do
        local candidate = zeros(dims)
        local delta = zeros(dims)
        candidates[k] = candidate
        perturbations[k] = delta

        local is_mirror = mirrored and k % 2 == 0
        if is_mirror then
            -- second half of a pair: flip the previous perturbation.
            for j, v in ipairs(perturbations[k - 1]) do
                delta[j] = -v
            end
        else
            for j = 1, dims do
                delta[j] = self.sigma * normal()
            end
        end

        for j, base in ipairs(self._params) do
            candidate[j] = base + delta[j]
        end
    end

    self.noise = perturbations
    return candidates, perturbations
end

--- Consume the rewards for the latest ask() and update the parameters.
-- Reads self.noise (stored by ask()) and clears it before returning, so an
-- ask() must precede every tell().
-- @param scored            list of rewards; scored[i] belongs to the candidate
--                          built from self.noise[i]. With antithetic sampling,
--                          entries 2k-1 and 2k form one +/- pair.
-- @param unperturbed_score optional reward of the unperturbed parameters; when
--                          given it counts as one extra evaluation and is
--                          temporarily included when measuring the reward
--                          deviation.
-- @return the accumulated step table. Note that when momentum > 0 the returned
--         step has been divided by sigma in place.
-- Raises an error in non-antithetic mode (that path is marked TODO).
function Ars:tell(scored, unperturbed_score)
    -- bookkeeping: count every evaluation that went into this update.
    self.evals = self.evals + #scored
    if unperturbed_score ~= nil then self.evals = self.evals + 1 end

    -- in antithetic mode these are pair indices, otherwise indices into scored.
    local indices = collect_best_indices(scored, self.poptop, self.antithetic)

    local top_rewards = {}
    if self.antithetic then
        -- keep both rewards of each selected pair, laid out as pos, neg, ...
        for _, ind in ipairs(indices) do
            insert(top_rewards, scored[ind * 2 - 1])
            insert(top_rewards, scored[ind * 2 - 0])
        end
    else
        -- ARS is built around antithetic sampling,
        -- but we can still do something without.
        -- this is getting to be very similar to SNES however.
        for _, ind in ipairs(indices) do insert(top_rewards, scored[ind]) end
        -- note: although this normalizes the scale, it's later
        --       re-normalized differently by reward_dev anyway.
        top_rewards = normalize_sums(top_rewards)
    end

    local step = zeros(self.dims)

    -- only the second return value of the mean/dev helpers is used
    -- (presumably the deviation of the rewards; confirm in util).
    local _, reward_dev
    if unperturbed_score ~= nil then
        -- new stuff:
        -- include the unperturbed score for the deviation only, then pop it
        -- again so top_rewards keeps its pair layout.
        insert(top_rewards, unperturbed_score)
        _, reward_dev = calc_mean_dev_unbiased(top_rewards)
        remove(top_rewards)
    else
        _, reward_dev = calc_mean_dev(top_rewards)
    end
    -- avoid dividing by zero when the deviation is exactly zero.
    if reward_dev == 0 then reward_dev = 1 end

    if self.antithetic then
        for i, ind in ipairs(indices) do
            local pos = top_rewards[i * 2 - 1]
            local neg = top_rewards[i * 2 - 0]
            local reward = pos - neg
            -- pairs with identical rewards contribute nothing; skip them.
            if reward ~= 0 then
                -- noise of the positive half of the pair
                -- (ask() stores the negated copy at the next index).
                local noisy = self.noise[ind * 2 - 1]
                reward = reward / reward_dev

                --[[ new stuff:
                local sum_of_squares = 0
                for _, v in ipairs(noisy) do
                    sum_of_squares = sum_of_squares + v * v
                end
                reward = reward / sqrt(sum_of_squares)
                -]]

                -- weight of this direction: normalized reward difference,
                -- averaged over poptop, scaled by beta and halved.
                local scale = reward / self.poptop * self.beta / 2
                for j, v in ipairs(noisy) do
                    step[j] = step[j] + scale * v
                end
            end
        end

    else
        -- this branch always raises; the loop below it is never reached.
        error("TODO: update with sum of squares stuff")
        for i, ind in ipairs(indices) do
            local reward = top_rewards[i] / reward_dev
            if reward ~= 0 then
                local noisy = self.noise[ind]

                local scale = reward / self.poptop * self.beta
                for j, v in ipairs(noisy) do
                    step[j] = step[j] + scale * v
                end
            end
        end
    end

    --[[ powersign momentum
    if self.momentum > 0 then
        for i, v in ipairs(step) do
            self.accum[i] = self.momentum * self.accum[i] + v
            step[i] = v * exp_lut[sign(v) * sign(self.accum[i])]
        end
    end

    for i, v in ipairs(self._params) do
        self._params[i] = v + self.param_rate * step[i]
    end
    --]]

    -- neumann momentum
    if self.momentum > 0 then
        -- self.count is created lazily here; it counts momentum updates.
        local count = self.count or 0
        local period = 10
        -- mu follows a repeating schedule: 0 when count % period == 0, rising
        -- to self.momentum when count % period == period - 1.
        local mu = 1 - 1 / (1 + count % period)
        mu = self.momentum / (1 - 1 / period) * mu
        self.count = count + 1
        -- mu is intentionally 0 for one iteration.

        -- make learning rate invariant to sigma.
        -- (this rescales step in place, so the returned step is affected too.)
        for i, v in ipairs(step) do
            step[i] = v / self.sigma
        end

        -- update neumann iterate.
        for i, v in ipairs(self.accum) do
            self.accum[i] = mu * v - self.param_rate * step[i]
        end

        -- the parameter update uses the accumulator values written just above.
        for i, v in ipairs(self._params) do
            self._params[i] = v - mu * self.accum[i] + self.param_rate * step[i]
        end
    else
        -- no momentum: plain step scaled by the learning rate.
        for i, v in ipairs(self._params) do
            self._params[i] = v + self.param_rate * step[i]
        end
    end

    -- the stored noise is only valid for one ask()/tell() round.
    self.noise = nil

    return step
end

-- Module exports: the top-k selection helper and the Ars optimizer class.
return {
    collect_best_indices = collect_best_indices,
    Ars = Ars,
}