add Lipschitz heuristic/approximation

2018-05-07 05:57:52 +02:00 · 2018-05-07 05:57:52 +02:00 · 5201b75509
commit 5201b75509
parent ee066154b2
2 changed files with 28 additions and 8 deletions
--- a/config.lua
+++ b/config.lua
@ -33,6 +33,7 @@ local common_cfg = {
    -- ^ note that this now doubles the effective trials.
    time_inputs = true, -- binary inputs of global frame count
    ars_lips = false,
    adamant = false, -- run steps through AMSgrad.
    cap_time = 300,
@ -50,6 +51,7 @@ local cfg = {
    epoch_top_trials = 10,
    learning_rate = 1.0,
    ars_lips = true,
    deviation = 0.1,
    weight_decay = 0.004,
--- a/main.lua
+++ b/main.lua
@ -656,11 +656,12 @@ local function learn_from_epoch()
    end
    print("best deltas:", top_delta_rewards)
    if not cfg.ars_lips then
        local _, reward_dev = calc_mean_dev(top_rewards)
        --print("mean, dev:", _, reward_dev)
        if reward_dev == 0 then reward_dev = 1 end
        for i, v in ipairs(top_rewards) do top_rewards[i] = v / reward_dev end
    end
    -- NOTE: step no longer directly incorporates learning_rate.
    for i = 1, cfg.epoch_trials do
@ -668,11 +669,28 @@ local function learn_from_epoch()
        local pos = top_rewards[ind + 0]
        local neg = top_rewards[ind + 1]
        local reward = pos - neg
        if reward ~= 0 then
            local noise = trial_noise[i]
            if cfg.ars_lips then
                local _, dev = calc_mean_dev(noise)
                local c0 = neg - current_cost
                local c1 = pos - current_cost
                local l0 = abs(3 * c1 + c0)
                local l1 = abs(c1 + 3 * c0)
                local lips = max(l0, l1) / (2 * dev)
                --reward = pos / lips - neg / lips
                local old_reward = reward
                reward = reward / lips
                reward = reward / cfg.deviation -- FIXME: hack?
                --print(("trial %i reward: %.0f -> %.5f"):format(i, old_reward, reward))
            end
            for j, v in ipairs(noise) do
                step[j] = step[j] + reward * v / cfg.epoch_top_trials
            end
        end
    end
    local step_mean, step_dev = calc_mean_dev(step)
    print("step mean:", step_mean)