diff --git a/config.lua b/config.lua index b2aca37..c7ce9b3 100644 --- a/config.lua +++ b/config.lua @@ -12,22 +12,22 @@ local cfg = { init_zeros = true, -- instead of he_normal noise or whatever. frameskip = 4, -- true greedy epsilon has both deterministic and det_epsilon set. - deterministic = true, -- use argmax on outputs instead of random sampling. + deterministic = false, -- use argmax on outputs instead of random sampling. det_epsilon = false, -- take random actions with probability eps. graycode = false, - epoch_trials = 50, - epoch_top_trials = 25, -- new with ARS. - unperturbed_trial = true, -- do a trial without any noise. + epoch_trials = 5, + epoch_top_trials = 2, -- new with ARS. + unperturbed_trial = false, -- do a trial without any noise. negate_trials = true, -- try pairs of normal and negated noise directions. time_inputs = true, -- binary inputs of global frame count -- ^ note that this now doubles the effective trials. - deviation = 0.05, --0.075 --0.1 + deviation = 0.32, --learning_rate = 0.01 / approx_cossim(7051) - learning_rate = 1.0, + learning_rate = 0.32, --learning_rate = 0.0032 / approx_cossim(66573) --learning_rate = 0.0056 / approx_cossim(66573) - weight_decay = 0.00032, --0.001 --0.0023 + weight_decay = 0.0032, cap_time = 200, --400 timer_loser = 1/2, @@ -36,6 +36,9 @@ local cfg = { playback_mode = false, } +-- TODO: so, uhh.. +-- what happens when playback_mode is true but unperturbed_trial is false? + cfg.epoch_top_trials = math.min(cfg.epoch_trials, cfg.epoch_top_trials) cfg.eps_start = 1.0 * cfg.frameskip / 64 diff --git a/main.lua b/main.lua index f52b5c7..0c71453 100644 --- a/main.lua +++ b/main.lua @@ -436,14 +436,16 @@ local function prepare_epoch() -- but that's a fair tradeoff for dividing memory used by noise by `epoch_trials`. local precision = (pow(cfg.deviation, 1/-0.51175585) - 8.68297257) / 1.66484392 - print(("chosen precision: %.2f"):format(precision)) + if cfg.graycode then + print(("chosen precision: %.2f"):format(precision)) + end for i = 1, cfg.epoch_trials do local noise = nn.zeros(#base_params) -- NOTE: change in implementation: deviation is multiplied here -- and ONLY here now. - if i % 2 == 0 then -- FIXME: just messing around. - --if cfg.graycode then + --if i % 2 == 0 then -- FIXME: just messing around. + if cfg.graycode then --local precision = 1 / cfg.deviation --print(cfg.deviation, precision) for j = 1, #base_params do @@ -598,7 +600,14 @@ local function learn_from_epoch() top_rewards[sind + 0] = trial_rewards[sind + 0] top_rewards[sind + 1] = trial_rewards[sind + 1] end - print("top:", top_rewards) + --print("top:", top_rewards) + + local delta_rewards = {} -- only used for printing. + for i, ind in ipairs(indices) do + local sind = (ind - 1) * 2 + 1 + delta_rewards[i] = abs(top_rewards[sind + 0] - top_rewards[sind + 1]) + end + print("best deltas:", delta_rewards) local _, reward_dev = calc_mean_dev(top_rewards) --print("mean, dev:", _, reward_dev)