-- Guided Evolutionary Strategies
-- https://arxiv.org/abs/1806.10230

-- this is just ARS extended to utilize gradients
-- approximated from previous iterations.

-- for simplicity:
-- antithetic is always true
-- momentum is always 0
-- no graycode/lipschitz nonsense
local floor = math.floor
local insert = table.insert
local ipairs = ipairs
local log = math.log
local max = math.max
local print = print
local sqrt = math.sqrt

local Base = require "Base"

local nn = require "nn"
local dot_mv = nn.dot_mv
local normal = nn.normal
local prod = nn.prod
local transpose = nn.transpose
local uniform = nn.uniform
local zeros = nn.zeros

local qr = require "qr2"

local util = require "util"
local argsort = util.argsort
local calc_mean_dev = util.calc_mean_dev
local Guided = Base:extend()
-- Reduce each antithetic (positive, negative) pair of rewards to the
-- better of the two, then return the indices of the best `top` pairs,
-- ordered from highest to lowest.
local function collect_best_indices(scored, top)
  -- keep only the better reward out of each pos/neg pair.
  local paired = {}
  for i = 1, #scored / 2 do
    paired[i] = max(scored[2 * i - 1], scored[2 * i])
  end

  -- rank the pairs from best to worst, then truncate to `top` entries.
  local order = argsort(paired, function(x, y) return x > y end)
  for i = #paired, top + 1, -1 do order[i] = nil end
  return order
end
-- Construct a Guided ES optimizer state.
-- dims: number of parameters being optimized.
-- popsize: number of antithetic pairs per iteration (defaulted from dims).
-- poptop: how many of the best pairs contribute to each update.
-- base_rate: learning rate for the parameter update (defaulted from dims).
-- sigma: scale of random perturbations.
-- alpha: blend between full parameter space and its gradient subspace.
--        1.0 is roughly equivalent to ARS.
-- beta: scale applied to the estimated gradient.
function Guided:init(dims, popsize, poptop, base_rate, sigma, alpha, beta)
  self.dims = dims
  self.popsize = popsize or 4 + (3 * floor(log(dims)))
  base_rate = base_rate or 3/5 * (3 + log(dims)) / (dims * sqrt(dims))
  self.param_rate = base_rate
  self.sigma = sigma or 1.0
  self.alpha = alpha or 0.5
  self.beta = beta or 1.0

  -- FIX: default poptop from the computed population size, not from the
  -- raw `popsize` argument, which is nil whenever popsize was defaulted
  -- above (that made the assert below error on a nil comparison).
  self.poptop = poptop or self.popsize
  assert(self.poptop <= self.popsize)
  self.popsize = self.popsize * 2 -- antithetic

  self._params = zeros(self.dims)
  --self.accum = zeros(self.dims) -- momentum

  -- running count of reward evaluations consumed by tell().
  self.evals = 0
end
-- Getter/setter for the current parameter vector.
-- When `new_params` is given, its values are copied element-wise into the
-- internal buffer (sizes must match). Always returns the internal vector.
function Guided:params(new_params)
  if new_params == nil then return self._params end
  assert(#self._params == #new_params, "new parameters have the wrong size")
  for i = 1, #new_params do
    self._params[i] = new_params[i]
  end
  return self._params
end
-- Apply weight decay to the parameters, scaled by the effective step size.
-- `sigma_decay` is accepted but currently unused.
-- FIXME: multiplying by sigma probably isn't correct anymore.
-- is this correct now?
function Guided:decay(param_decay, sigma_decay)
  if param_decay <= 0 then return end

  -- shrink factor: 1 - decay * (sigma/sqrt(dims)) * beta * rate / sigma^2,
  -- evaluated in the same order as the effective update coefficient.
  local scale = self.sigma / sqrt(self.dims)
  scale = scale * self.beta * self.param_rate / (self.sigma * self.sigma)
  scale = 1 - param_decay * scale

  for i = 1, #self._params do
    self._params[i] = scale * self._params[i]
  end
end
-- Sample a new population of perturbed parameter vectors.
-- grads: optional matrix of recent surrogate gradients — presumably one
--        gradient per row (n_grad is read from grads.shape[1]); must work
--        with transpose()/qr() from the nn/qr2 modules.
--        NOTE(review): confirm the row/column convention against callers.
-- With gradients, perturbations follow the guided distribution of the
-- paper: a blend of the full parameter space (weight alpha) and the
-- orthonormalized gradient subspace (weight 1 - alpha). Without them,
-- sampling reduces to plain isotropic ARS noise.
-- Returns (asked, noise): the candidate parameter vectors and the raw
-- perturbations that produced them. The noise is also stashed in
-- self.noise for the matching tell().
function Guided:ask(grads)
  local asked = {}
  local noise = {}

  local n_grad = 0
  local gnoise, U, dummy, left, right
  if grads ~= nil and #grads > 0 then
    n_grad = grads.shape[1]
    gnoise = zeros(n_grad)

    -- U: orthonormal basis of the gradient subspace
    -- (Q factor of a QR decomposition of the transposed gradients).
    U, dummy = qr(transpose(grads))
    --print(nn.pp(transpose(U), "%9.4f"))

    -- scales for the full-space and subspace components of the noise.
    left = sqrt(self.alpha / self.dims)
    right = sqrt((1 - self.alpha) / n_grad)
    --print(left, right)
  end

  for i = 1, self.popsize do
    local asking = zeros(self.dims)
    local noisy = zeros(self.dims)
    asked[i] = asking
    noise[i] = noisy

    if i % 2 == 0 then
      -- antithetic sampling: every even member is the exact negation
      -- of the previous (odd) member's perturbation.
      local old_noisy = noise[i - 1]
      for j, v in ipairs(old_noisy) do
        noisy[j] = -v
      end
    elseif n_grad == 0 then
      -- no gradient info: isotropic Gaussian noise, as in ARS.
      local scale = self.sigma / sqrt(self.dims)
      for j = 1, self.dims do
        noisy[j] = scale * normal()
      end
    else
      -- guided noise: full-dimensional Gaussian plus a Gaussian drawn
      -- inside the gradient subspace, blended via left/right weights.
      for j = 1, self.dims do noisy[j] = normal() end
      for j = 1, n_grad do gnoise[j] = normal() end
      local noisier = dot_mv(U, gnoise)
      for j, v in ipairs(noisy) do
        noisy[j] = self.sigma * (left * v + right * noisier[j])
      end
    end

    -- candidate = current parameters + perturbation.
    for j, v in ipairs(self._params) do
      asking[j] = v + noisy[j]
    end
  end

  self.noise = noise
  return asked, noise
end
-- Incorporate the rewards of the last population and step the parameters.
-- scored: flat list of rewards ordered like ask()'s output —
--         scored[2i-1] is the +noise member of pair i, scored[2i] the
--         mirrored -noise member.
-- unperturbed_score: accepted but unused here (presumably kept for
--                    interface compatibility with sibling optimizers —
--                    NOTE(review): confirm).
-- Returns the raw (pre-learning-rate) update step.
function Guided:tell(scored, unperturbed_score)
  self.evals = self.evals + #scored

  -- rank pairs by their better half; keep the top `poptop` pair indices.
  local indices = collect_best_indices(scored, self.poptop)

  -- gather both rewards of every surviving pair, in pair order.
  local top_rewards = {}
  for _, ind in ipairs(indices) do
    insert(top_rewards, scored[ind * 2 - 1])
    insert(top_rewards, scored[ind * 2 - 0])
  end

  local step = zeros(self.dims)
  -- normalize by the reward deviation of the surviving pairs;
  -- guard against a zero deviation (all equal rewards).
  local _, reward_dev = calc_mean_dev(top_rewards)
  if reward_dev == 0 then reward_dev = 1 end

  for i, ind in ipairs(indices) do
    -- i indexes top_rewards (compacted), ind indexes scored/self.noise.
    local pos = top_rewards[i * 2 - 1]
    local neg = top_rewards[i * 2 - 0]
    local reward = pos - neg
    if reward ~= 0 then
      -- the +noise perturbation of the pair; the -noise member was its
      -- exact negation, so one vector represents the pair.
      local noisy = self.noise[ind * 2 - 1]
      -- NOTE: technically this reward divide isn't part of guided search.
      reward = reward / reward_dev

      -- antithetic finite-difference weight (the /2 averages the pair).
      local scale = reward / self.poptop * self.beta / 2
      for j, v in ipairs(noisy) do
        step[j] = step[j] + scale * v
      end
    end
  end

  -- apply the step, scaled by the learning rate over sigma^2.
  local coeff = self.param_rate / (self.sigma * self.sigma)
  for i, v in ipairs(self._params) do
    self._params[i] = v + coeff * step[i]
  end

  -- noise is single-use: a fresh ask() is required before the next tell().
  self.noise = nil

  return step
end
-- module exports.
return {
  --collect_best_indices = collect_best_indices, -- ars.lua has more features
  Guided = Guided,
}