diff --git a/main.lua b/main.lua
index 69b3231..994ac2f 100644
--- a/main.lua
+++ b/main.lua
@@ -40,9 +40,9 @@ local eps_frames = 1000000
 local consider_past_rewards = false
 local learn_start_select = false
 --
-local epoch_trials = 40 -- 24
-local learning_rate = 1e-3
-local deviation = 1e-2 -- 4e-3
+local epoch_trials = 40
+local learning_rate = 0.3 -- bigger now that i'm shaping trials etc.
+local deviation = 0.05
 --
 local cap_time = 400
 local timer_loser = 1/3
@@ -127,6 +127,7 @@ local randomseed = math.randomseed
 local insert = table.insert
 local remove = table.remove
 local unpack = table.unpack or unpack
+local sort = table.sort
 local R = memory.readbyteunsigned
 local S = memory.readbyte --signed
 local W = memory.writebyte
@@ -143,12 +144,22 @@ local ror = bit.ror
 
 -- utilities.
 
+local function ifind(haystack, needle)
+    for i, v in ipairs(haystack) do
+        if v == needle then return i end
+    end
+    return nil
+end
+
 local function boolean_xor(a, b)
     if a and b then return false end
     if not a and not b then return false end
     return true
 end
 
+local _invlog2 = 1 / log(2)
+local function log2(x) return log(x) * _invlog2 end
+
 local function clamp(x, l, u) return min(max(x, l), u) end
 
 local function lerp(a, b, t) return a + (b - a) * clamp(t, 0, 1) end
@@ -517,6 +528,8 @@ local function prepare_epoch()
     base_params = network:collect()
     empty(trial_noise)
     empty(trial_rewards)
+    -- TODO: save memory. generate noise as needed by saving the seed
+    -- (the os.time() as of here) and calling nn.normal() each trial.
     for i = 1, epoch_trials do
         local noise = nn.zeros(#base_params)
         for j = 1, #base_params do noise[j] = nn.normal() end
@@ -537,6 +550,30 @@ local function load_next_trial()
     network:distribute(W)
 end
 
+local function fitness_shaping(rewards)
+    -- lifted from: https://github.com/atgambardella/pytorch-es/blob/master/train.py
+    local decreasing = nn.copy(rewards)
+    sort(decreasing, function(a, b) return a > b end)
+    local shaped_returns = {}
+    local lamb = #rewards
+
+    local denom = 0
+    for i, v in ipairs(rewards) do
+        local l = log2(lamb / 2 + 1)
+        local r = log2(ifind(decreasing, v))
+        denom = denom + max(0, l - r)
+    end
+
+    for i, v in ipairs(rewards) do
+        local l = log2(lamb / 2 + 1)
+        local r = log2(ifind(decreasing, v))
+        local numer = max(0, l - r)
+        insert(shaped_returns, numer / denom + 1 / lamb)
+    end
+
+    return shaped_returns
+end
+
 local function learn_from_epoch()
     print()
     print('rewards:', trial_rewards)
@@ -552,37 +589,21 @@ local function learn_from_epoch()
     end
     --print('normalized:', trial_rewards)
 
-    local reward_mean, reward_dev = calc_mean_dev(trial_rewards)
+    local shaped_rewards = fitness_shaping(trial_rewards)
+
+    local altogether = learning_rate / (epoch_trials * deviation)
     local step = nn.zeros(#base_params)
     for i = 1, epoch_trials do
-        local reward = trial_rewards[i]
+        local reward = shaped_rewards[i]
         local noise = trial_noise[i]
         for j, v in ipairs(noise) do
-            step[j] = step[j] + reward * v
+            step[j] = step[j] + altogether * (reward * v)
        end
     end
-    local magnitude = learning_rate / deviation
-    --print('stepping with magnitude', magnitude)
-    -- throw the division from the averaging in there too.
-    local altogether = magnitude / epoch_trials
-    for i, v in ipairs(step) do
-        step[i] = altogether * v
-    end
     local step_mean, step_dev = calc_mean_dev(step)
-    if step_dev < 1e-8 then
-        -- we didn't get anywhere. step in a random direction.
- print("stepping randomly.") - local noise = trial_noise[1] - local devsqrt = sqrt(deviation) - for i, v in ipairs(step) do - step[i] = devsqrt * noise[i] - end - - step_mean, step_dev = calc_mean_dev(step) - end if abs(step_mean) > 1e-3 then print("step mean:", step_mean) end print("step stddev:", step_dev)