add unperturbed trial with ranking

2017-09-07 19:00:09 +00:00 · 2017-09-07 19:00:09 +00:00 · ddc8cae4c6
commit ddc8cae4c6
parent 9ce1f87ade
1 changed files with 31 additions and 11 deletions
--- a/main.lua
+++ b/main.lua
@ -41,6 +41,7 @@ local consider_past_rewards = false
 local learn_start_select = false
 --
 local epoch_trials = 40
+local unperturbed_trial = true -- do a trial without any noise.
 local learning_rate = 0.3 -- bigger now that i'm shaping trials etc.
 local deviation = 0.05
 --
@ -75,7 +76,7 @@ local bad_states = {

 local epoch_i = 0
 local base_params
-local trial_i = 0
+local trial_i = -1 -- NOTE: trial 0 is an unperturbed trial, if enabled.
 local trial_noise = {}
 local trial_rewards = {}
 local trials_remaining = 0
@ -535,17 +536,25 @@ local function prepare_epoch()
        for j = 1, #base_params do noise[j] = nn.normal() end
        trial_noise[i] = noise
    end
-    trial_i = 0
+    trial_i = -1
 end

 local function load_next_trial()
    trial_i = trial_i + 1
-    print('loading trial', trial_i)
    local W = nn.copy(base_params)
    local noise = trial_noise[trial_i]
    local devsqrt = sqrt(deviation)
+    if trial_i == 0 and not unperturbed_trial then
+        trial_i = 1
+    end
+    if trial_i > 0 then
+        print('loading trial', trial_i)
+        local noise = trial_noise[trial_i]
        for i, v in ipairs(base_params) do
-        W[i] = v + devsqrt * noise[i]
+            W[i] = v + deviation * noise[i]
+        end
+    else
+        print("test trial")
    end
    network:distribute(W)
 end
@ -574,6 +583,17 @@ local function fitness_shaping(rewards)
    return shaped_returns
 end

+local function unperturbed_rank(rewards, unperturbed_reward)
+    -- lifted from: https://github.com/atgambardella/pytorch-es/blob/master/train.py
+    local nth_place = 1
+    for i, v in ipairs(rewards) do
+        if v > unperturbed_reward then
+            nth_place = nth_place +  1
+        end
+    end
+    return nth_place
+end
+
 local function learn_from_epoch()
    print()
    print('rewards:', trial_rewards)
@ -582,12 +602,12 @@ local function learn_from_epoch()
        insert(all_rewards, v)
    end

-    if consider_past_rewards then
-        normalize_wrt(trial_rewards, all_rewards)
-    else
-        normalize(trial_rewards)
+    if unperturbed_trial then
+        local nth_place = unperturbed_rank(trial_rewards, trial_rewards[0])
+
+        -- a rank of 1 means our gradient is uninformative.
+        print(("test trial: %d out of %d"):format(nth_place, #trial_rewards))
    end
-    --print('normalized:', trial_rewards)

    local step = nn.zeros(#base_params)
    local shaped_rewards = fitness_shaping(trial_rewards)
@ -625,7 +645,7 @@ local function do_reset()
    if state == 'dead' and get_timer() == 0 then state = 'timeup' end
    print("resetting in state: "..state..". reward:", reward)

-    if trial_i > 0 then trial_rewards[trial_i] = reward end
+    if trial_i >= 0 then trial_rewards[trial_i] = reward end

    if epoch_i == 0 or trial_i == epoch_trials then
        if epoch_i > 0 then learn_from_epoch() end