-- smbot/guided.lua
-- 2019-02-26 21:53:38 +01:00
-- 195 lines, 5.3 KiB, Lua

-- Guided Evolutionary Strategies
-- https://arxiv.org/abs/1806.10230
-- this is just ARS extended to utilize gradients
-- approximated from previous iterations.
-- for simplicity:
-- antithetic is always true
-- momentum is always 0
-- no graycode/lipschitz nonsense
local floor = math.floor
local insert = table.insert
local ipairs = ipairs
local max = math.max
local print = print
local sqrt = math.sqrt
local Base = require "Base"
local nn = require "nn"
local dot_mv = nn.dot_mv
local transpose = nn.transpose
local normal = nn.normal
local prod = nn.prod
local uniform = nn.uniform
local zeros = nn.zeros
local qr = require "qr2"
local util = require "util"
local argsort = util.argsort
local calc_mean_dev = util.calc_mean_dev
-- Guided optimizer object, derived via the project's Base object system.
local Guided = Base:extend()
-- For each antithetic (pos, neg) pair in `scored`, keep the better reward,
-- then return the pair indices of the `top` highest, best first.
local function collect_best_indices(scored, top)
    local pair_best = {}
    for pair = 1, #scored / 2 do
        pair_best[pair] = max(scored[pair * 2 - 1], scored[pair * 2])
    end
    local ranked = argsort(pair_best, function(a, b) return a > b end)
    -- drop everything past the top entries.
    for i = top + 1, #pair_best do ranked[i] = nil end
    return ranked
end
-- Configure the optimizer.
-- dims: number of parameters to optimize.
-- popsize: number of antithetic pairs per iteration (doubled internally).
-- poptop: number of best pairs used per update; defaults to the pair count.
-- base_rate: base learning rate (rescaled by sigma in tell()).
-- sigma: scale of random perturbations.
-- alpha: blend between full parameter space and its gradient subspace.
-- 1.0 is roughly equivalent to ARS.
-- beta: scale of the update step.
function Guided:init(dims, popsize, poptop, base_rate, sigma, alpha, beta)
    -- `log` was never localized at the top of the file, and Lua has no
    -- global `log`; grab it here so the defaults below don't crash.
    local log = math.log
    self.dims = dims
    self.popsize = popsize or 4 + (3 * floor(log(dims)))
    base_rate = base_rate or 3/5 * (3 + log(dims)) / (dims * sqrt(dims))
    self.param_rate = base_rate
    self.sigma = sigma or 1.0
    self.alpha = alpha or 0.5
    self.beta = beta or 1.0
    -- default/validate against self.popsize, not the raw `popsize`
    -- argument, which is nil when the caller relies on the default.
    self.poptop = poptop or self.popsize
    assert(self.poptop <= self.popsize)
    self.popsize = self.popsize * 2 -- antithetic
    self._params = zeros(self.dims)
    --self.accum = zeros(self.dims) -- momentum
    self.evals = 0
end
-- Getter/setter for the parameter vector. When `new_params` is given, its
-- values are copied in (sizes must match). Always returns the live vector.
function Guided:params(new_params)
    if new_params ~= nil then
        assert(#self._params == #new_params, "new parameters have the wrong size")
        for i = 1, #new_params do
            self._params[i] = new_params[i]
        end
    end
    return self._params
end
-- Shrink all parameters toward zero, with the decay scaled to match the
-- effective step size used in tell(). `sigma_decay` is currently unused.
-- FIXME: multiplying by sigma probably isn't correct anymore.
-- is this correct now?
function Guided:decay(param_decay, sigma_decay)
    if param_decay > 0 then
        local scale = self.sigma / sqrt(self.dims)
            * self.beta
            * self.param_rate / (self.sigma * self.sigma)
        local mul = 1 - param_decay * scale
        for i = 1, #self._params do
            self._params[i] = mul * self._params[i]
        end
    end
end
-- Generate a population of candidate parameter vectors to evaluate.
-- grads: optional matrix of surrogate gradients from prior iterations
--        (assumes grads.shape[1] is the number of gradient vectors after
--        transposing -- TODO confirm orientation against nn/qr2).
-- returns (asked, noise): asked[i] = params + noise[i].
-- NOTE: code left byte-identical; the exact order of normal() calls
-- determines the sampled population, so restructuring would change results.
function Guided:ask(grads)
local asked = {}
local noise = {}
local n_grad = 0
local gnoise, U, dummy, left, right
if grads ~= nil and #grads > 0 then
n_grad = grads.shape[1]
gnoise = zeros(n_grad)
-- orthonormalize the gradient subspace via QR decomposition.
U, dummy = qr(transpose(grads))
--print(nn.pp(transpose(U), "%9.4f"))
-- blend weights: alpha of the variance in the full space,
-- (1 - alpha) in the gradient subspace.
left = sqrt(self.alpha / self.dims)
right = sqrt((1 - self.alpha) / n_grad)
--print(left, right)
end
for i = 1, self.popsize do
local asking = zeros(self.dims)
local noisy = zeros(self.dims)
asked[i] = asking
noise[i] = noisy
if i % 2 == 0 then
-- antithetic half: exact negation of the previous member's noise.
local old_noisy = noise[i - 1]
for j, v in ipairs(old_noisy) do
noisy[j] = -v
end
elseif n_grad == 0 then
-- no gradient info: plain isotropic Gaussian noise (ARS-style).
local scale = self.sigma / sqrt(self.dims)
for j = 1, self.dims do
noisy[j] = scale * normal()
end
else
-- guided case: mix full-space noise with gradient-subspace noise.
for j = 1, self.dims do noisy[j] = normal() end
for j = 1, n_grad do gnoise[j] = normal() end
local noisier = dot_mv(U, gnoise)
for j, v in ipairs(noisy) do
noisy[j] = self.sigma * (left * v + right * noisier[j])
end
end
for j, v in ipairs(self._params) do
asking[j] = v + noisy[j]
end
end
-- stash the noise so tell() can reconstruct the update direction.
self.noise = noise
return asked, noise
end
-- Update the parameters from the rewards of the last ask()'d population.
-- scored: flat reward list ordered as (pos, neg) antithetic pairs, matching
--         the population returned by ask(). `unperturbed_score` is unused.
-- returns the raw (pre-learning-rate) update step.
function Guided:tell(scored, unperturbed_score)
    self.evals = self.evals + #scored
    local indices = collect_best_indices(scored, self.poptop)
    -- gather the (pos, neg) reward pairs of the selected elites.
    local top_rewards = {}
    for _, ind in ipairs(indices) do
        insert(top_rewards, scored[ind * 2 - 1])
        insert(top_rewards, scored[ind * 2 - 0])
    end
    local _, reward_dev = calc_mean_dev(top_rewards)
    if reward_dev == 0 then reward_dev = 1 end
    local step = zeros(self.dims)
    for i, ind in ipairs(indices) do
        local diff = top_rewards[i * 2 - 1] - top_rewards[i * 2]
        if diff ~= 0 then
            -- NOTE: technically this reward divide isn't part of guided search.
            local scale = diff / reward_dev / self.poptop * self.beta / 2
            local noisy = self.noise[ind * 2 - 1]
            for j = 1, #noisy do
                step[j] = step[j] + scale * noisy[j]
            end
        end
    end
    local coeff = self.param_rate / (self.sigma * self.sigma)
    for i = 1, #self._params do
        self._params[i] = self._params[i] + coeff * step[i]
    end
    self.noise = nil
    return step
end
-- module exports.
return {
--collect_best_indices = collect_best_indices, -- ars.lua has more features
Guided = Guided,
}