-- smbot/guided.lua
-- 2019-02-26 21:53:38 +01:00
-- 195 lines, 5.3 KiB, Lua

-- Guided Evolutionary Strategies
-- https://arxiv.org/abs/1806.10230
-- this is just ARS extended to utilize gradients
-- approximated from previous iterations.
-- for simplicity:
-- antithetic is always true
-- momentum is always 0
-- no graycode/lipschitz nonsense
local floor = math.floor
local insert = table.insert
local ipairs = ipairs
local max = math.max
local print = print
local sqrt = math.sqrt
local Base = require "Base"
local nn = require "nn"
local dot_mv = nn.dot_mv
local transpose = nn.transpose
local normal = nn.normal
local prod = nn.prod
local uniform = nn.uniform
local zeros = nn.zeros
local qr = require "qr2"
local util = require "util"
local argsort = util.argsort
local calc_mean_dev = util.calc_mean_dev
-- Guided optimizer object, derived via the project's Base object system.
local Guided = Base:extend()
-- For each antithetic (pos, neg) pair in `scored`, keep the better reward,
-- then return the pair indices of the `top` highest, best first.
local function collect_best_indices(scored, top)
    local pair_best = {}
    for pair = 1, #scored / 2 do
        pair_best[pair] = max(scored[pair * 2 - 1], scored[pair * 2])
    end
    local ranked = argsort(pair_best, function(a, b) return a > b end)
    -- drop everything past the top entries.
    for i = top + 1, #pair_best do ranked[i] = nil end
    return ranked
end
-- Configure the optimizer.
-- dims: number of parameters to optimize.
-- popsize: number of antithetic pairs per iteration (doubled internally).
-- poptop: number of best pairs used per update; defaults to the pair count.
-- base_rate: base learning rate (rescaled by sigma in tell()).
-- sigma: scale of random perturbations.
-- alpha: blend between full parameter space and its gradient subspace.
-- 1.0 is roughly equivalent to ARS.
-- beta: scale of the update step.
function Guided:init(dims, popsize, poptop, base_rate, sigma, alpha, beta)
    -- `log` was never localized at the top of the file, and Lua has no
    -- global `log`; grab it here so the defaults below don't crash.
    local log = math.log
    self.dims = dims
    self.popsize = popsize or 4 + (3 * floor(log(dims)))
    base_rate = base_rate or 3/5 * (3 + log(dims)) / (dims * sqrt(dims))
    self.param_rate = base_rate
    self.sigma = sigma or 1.0
    self.alpha = alpha or 0.5
    self.beta = beta or 1.0
    -- default/validate against self.popsize, not the raw `popsize`
    -- argument, which is nil when the caller relies on the default.
    self.poptop = poptop or self.popsize
    assert(self.poptop <= self.popsize)
    self.popsize = self.popsize * 2 -- antithetic
    self._params = zeros(self.dims)
    --self.accum = zeros(self.dims) -- momentum
    self.evals = 0
end
-- Getter/setter for the parameter vector. When `new_params` is given, its
-- values are copied in (sizes must match). Always returns the live vector.
function Guided:params(new_params)
    if new_params ~= nil then
        assert(#self._params == #new_params, "new parameters have the wrong size")
        for i = 1, #new_params do
            self._params[i] = new_params[i]
        end
    end
    return self._params
end
-- Shrink all parameters toward zero, with the decay scaled to match the
-- effective step size used in tell(). `sigma_decay` is currently unused.
-- FIXME: multiplying by sigma probably isn't correct anymore.
-- is this correct now?
function Guided:decay(param_decay, sigma_decay)
    if param_decay > 0 then
        local scale = self.sigma / sqrt(self.dims)
            * self.beta
            * self.param_rate / (self.sigma * self.sigma)
        local mul = 1 - param_decay * scale
        for i = 1, #self._params do
            self._params[i] = mul * self._params[i]
        end
    end
end
-- Generate a population of candidate parameter vectors to evaluate.
-- grads: optional matrix of surrogate gradients from prior iterations
--        (assumes grads.shape[1] is the number of gradient vectors after
--        transposing -- TODO confirm orientation against nn/qr2).
-- returns (asked, noise): asked[i] = params + noise[i].
-- NOTE: code left byte-identical; the exact order of normal() calls
-- determines the sampled population, so restructuring would change results.
function Guided:ask(grads)
local asked = {}
local noise = {}
local n_grad = 0
local gnoise, U, dummy, left, right
if grads ~= nil and #grads > 0 then
n_grad = grads.shape[1]
gnoise = zeros(n_grad)
-- orthonormalize the gradient subspace via QR decomposition.
U, dummy = qr(transpose(grads))
--print(nn.pp(transpose(U), "%9.4f"))
-- blend weights: alpha of the variance in the full space,
-- (1 - alpha) in the gradient subspace.
left = sqrt(self.alpha / self.dims)
right = sqrt((1 - self.alpha) / n_grad)
--print(left, right)
end
for i = 1, self.popsize do
local asking = zeros(self.dims)
local noisy = zeros(self.dims)
asked[i] = asking
noise[i] = noisy
if i % 2 == 0 then
-- antithetic half: exact negation of the previous member's noise.
local old_noisy = noise[i - 1]
for j, v in ipairs(old_noisy) do
noisy[j] = -v
end
elseif n_grad == 0 then
-- no gradient info: plain isotropic Gaussian noise (ARS-style).
local scale = self.sigma / sqrt(self.dims)
for j = 1, self.dims do
noisy[j] = scale * normal()
end
else
-- guided case: mix full-space noise with gradient-subspace noise.
for j = 1, self.dims do noisy[j] = normal() end
for j = 1, n_grad do gnoise[j] = normal() end
local noisier = dot_mv(U, gnoise)
for j, v in ipairs(noisy) do
noisy[j] = self.sigma * (left * v + right * noisier[j])
end
end
for j, v in ipairs(self._params) do
asking[j] = v + noisy[j]
end
end
-- stash the noise so tell() can reconstruct the update direction.
self.noise = noise
return asked, noise
end
-- Update the parameters from the rewards of the last ask()'d population.
-- scored: flat reward list ordered as (pos, neg) antithetic pairs, matching
--         the population returned by ask(). `unperturbed_score` is unused.
-- returns the raw (pre-learning-rate) update step.
function Guided:tell(scored, unperturbed_score)
    self.evals = self.evals + #scored
    local indices = collect_best_indices(scored, self.poptop)
    -- gather the (pos, neg) reward pairs of the selected elites.
    local top_rewards = {}
    for _, ind in ipairs(indices) do
        insert(top_rewards, scored[ind * 2 - 1])
        insert(top_rewards, scored[ind * 2 - 0])
    end
    local _, reward_dev = calc_mean_dev(top_rewards)
    if reward_dev == 0 then reward_dev = 1 end
    local step = zeros(self.dims)
    for i, ind in ipairs(indices) do
        local diff = top_rewards[i * 2 - 1] - top_rewards[i * 2]
        if diff ~= 0 then
            -- NOTE: technically this reward divide isn't part of guided search.
            local scale = diff / reward_dev / self.poptop * self.beta / 2
            local noisy = self.noise[ind * 2 - 1]
            for j = 1, #noisy do
                step[j] = step[j] + scale * noisy[j]
            end
        end
    end
    local coeff = self.param_rate / (self.sigma * self.sigma)
    for i = 1, #self._params do
        self._params[i] = self._params[i] + coeff * step[i]
    end
    self.noise = nil
    return step
end
-- module exports.
return {
--collect_best_indices = collect_best_indices, -- ars.lua has more features
Guided = Guided,
}