From b4210c105a34a2b7f83f5e6a29095f8017318cda Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Thu, 5 Jan 2023 01:49:50 +0530 Subject: [PATCH 01/47] Merge py file changes from benchmark-algs --- src/imitation/algorithms/dagger.py | 62 +++ src/imitation/scripts/analyze.py | 24 +- src/imitation/scripts/config/parallel.py | 406 ++++++++++++++++-- .../scripts/config/train_adversarial.py | 175 +++++++- .../scripts/config/train_imitation.py | 26 ++ .../config/train_preference_comparisons.py | 128 +++++- src/imitation/scripts/config/train_rl.py | 203 ++++++++- src/imitation/scripts/ingredients/reward.py | 5 + src/imitation/scripts/parallel.py | 166 ++++++- src/imitation/scripts/train_adversarial.py | 1 + src/imitation/scripts/train_imitation.py | 4 +- .../scripts/train_preference_comparisons.py | 1 + src/imitation/scripts/train_rl.py | 4 +- tests/algorithms/test_dagger.py | 25 +- tests/scripts/test_scripts.py | 31 +- 15 files changed, 1173 insertions(+), 88 deletions(-) diff --git a/src/imitation/algorithms/dagger.py b/src/imitation/algorithms/dagger.py index a7194a5bf..0034fc4ba 100644 --- a/src/imitation/algorithms/dagger.py +++ b/src/imitation/algorithms/dagger.py @@ -65,6 +65,68 @@ def __call__(self, round_num: int) -> float: assert round_num >= 0 return min(1, max(0, (self.rampdown_rounds - round_num) / self.rampdown_rounds)) + def __repr__(self): + return f"{type(self).__name__}({self.rampdown_rounds!r})" + + +class IndicatorBetaSchedule(BetaSchedule): + """Beta schedule that switches off after a number of rounds.""" + + def __init__(self, rampdown_rounds: int): + """Builds IndicatorBetaSchedule. + + Args: + rampdown_rounds: number of rounds after which beta switches off. + """ + self.rampdown_rounds = rampdown_rounds + + def __call__(self, round_num: int) -> float: + """Computes beta value. + + Args: + round_num: the current round number. + + Returns: + beta as `1` until `self.rampdown_rounds` and then beta as `0`. 
+ """ + assert round_num >= 0 + return 1 if round_num < self.rampdown_rounds else 0 + + def __repr__(self): + return f"{type(self).__name__}({self.rampdown_rounds!r})" + + +class ExponentialBetaSchedule(BetaSchedule): + """Exponentially decaying schedule for beta.""" + + def __init__(self, decay_probability: float): + """Builds ExponentialBetaSchedule. + + Args: + decay_probability: the decay factor for beta. + + Raises: + ValueError: if `decay_probability` not within (0, 1]. + """ + if not (0 < decay_probability <= 1): + raise ValueError("decay_probability lies outside the range (0, 1].") + self.decay_probability = decay_probability + + def __call__(self, round_num: int) -> float: + """Computes beta value. + + Args: + round_num: the current round number. + + Returns: + beta as `self.decay_probability ^ round_num` + """ + assert round_num >= 0 + return self.decay_probability**round_num + + def __repr__(self): + return f"{type(self).__name__}({self.decay_probability!r})" + def reconstruct_trainer( scratch_dir: types.AnyPath, diff --git a/src/imitation/scripts/analyze.py b/src/imitation/scripts/analyze.py index 0586f86d6..54fed52f9 100644 --- a/src/imitation/scripts/analyze.py +++ b/src/imitation/scripts/analyze.py @@ -166,6 +166,8 @@ def _get_algo_name(sd: sacred_util.SacredDicts) -> str: def _return_summaries(sd: sacred_util.SacredDicts) -> dict: imit_stats = get(sd.run, "result.imit_stats") + if imit_stats is None: + imit_stats = get(sd.run, "result.rollout") expert_stats = get(sd.run, "result.expert_stats") expert_return_summary = None @@ -232,7 +234,7 @@ def _return_summaries(sd: sacred_util.SacredDicts) -> dict: # verbosity 2 table_verbosity_mapping.append( table_verbosity_mapping[-1] - | {"status", "imit_expert_ratio", "exp_command", "run_name"}, + | {"status", "imit_expert_ratio", "exp_command", "run_name", "seed", ""}, ) @@ -268,20 +270,26 @@ def analyze_imitation( Returns: The DataFrame generated from the Sacred logs. 
""" - table_entry_fns_subset = _get_table_entry_fns_subset(table_verbosity) + if table_verbosity == -1: + table_entry_fns_subset = _get_table_entry_fns_subset(0) + else: + table_entry_fns_subset = _get_table_entry_fns_subset(table_verbosity) - rows = [] + df = pd.DataFrame() for sd in _gather_sacred_dicts(): - row = {} + new_df = pd.DataFrame() + if table_verbosity == -1: + new_df = pd.json_normalize(sd.config) + for col_name, make_entry_fn in table_entry_fns_subset.items(): - row[col_name] = make_entry_fn(sd) - rows.append(row) + new_df[col_name] = make_entry_fn(sd) + + df = pd.concat([df, new_df]) - df = pd.DataFrame(rows) if len(df) > 0: df.sort_values(by=["algo", "env_name"], inplace=True) - display_options = dict(index=False) + display_options: Mapping[str, Any] = dict(index=False) if csv_output_path is not None: df.to_csv(csv_output_path, **display_options) print(f"Wrote CSV file to {csv_output_path}") diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py index eb206893f..59295d3d3 100644 --- a/src/imitation/scripts/config/parallel.py +++ b/src/imitation/scripts/config/parallel.py @@ -5,13 +5,15 @@ `@parallel_ex.named_config` to define a new parallel experiment. Adding custom named configs is necessary because the CLI interface can't add -search spaces to the config like `"seed": tune.grid_search([0, 1, 2, 3])`. +search spaces to the config like `"seed": tune.choice([0, 1, 2, 3])`. 
""" import numpy as np import ray.tune as tune import sacred +from torch import nn +from imitation.algorithms.dagger import ExponentialBetaSchedule, LinearBetaSchedule from imitation.util.util import make_unique_timestamp parallel_ex = sacred.Experiment("parallel") @@ -33,12 +35,39 @@ def config(): local_dir = None # `local_dir` arg for `ray.tune.run` upload_dir = None # `upload_dir` arg for `ray.tune.run` - n_seeds = 3 # Number of seeds to search over by default + # n_seeds_start = 0 + # n_seeds = 1 # Number of seeds to search over by default + experiment_checkpoint_path = "" + eval_best_trial = False + eval_trial_seeds = 5 # Number of seeds to search over by default + num_samples = 1 # Number of samples per grid search configuration + repeat = 3 + env = "seals_half_cheetah" + wandb_name_prefix = "" + + +# @parallel_ex.config +# def seeds(n_seeds_start, n_seeds): +# search_space = { +# "config_updates": { +# "seed": tune.choice( +# list(range(n_seeds_start, n_seeds_start + n_seeds)), +# ) +# } +# } @parallel_ex.config -def seeds(n_seeds): - search_space = {"config_updates": {"seed": tune.grid_search(list(range(n_seeds)))}} +def wandb(run_name): + base_config_updates = { + "common": { + "wandb": { + "wandb_name_prefix": run_name, + "wandb_kwargs": {"project": "algorithm-benchmark"}, + }, + }, + } + # base_named_configs = ["common.wandb_logging"] @parallel_ex.named_config @@ -63,7 +92,7 @@ def generate_test_data(): "config_updates": { "rl": { "rl_kwargs": { - "learning_rate": tune.grid_search( + "learning_rate": tune.choice( [3e-4 * x for x in (1 / 3, 1 / 2)], ), }, @@ -91,8 +120,8 @@ def example_cartpole_rl(): "config_updates": { "rl": { "rl_kwargs": { - "learning_rate": tune.grid_search(np.logspace(3e-6, 1e-1, num=3)), - "nminibatches": tune.grid_search([16, 32, 64]), + "learning_rate": tune.choice(np.logspace(3e-6, 1e-1, num=3)), + "nminibatches": tune.choice([16, 32, 64]), }, }, }, @@ -105,44 +134,367 @@ def example_cartpole_rl(): @parallel_ex.named_config -def 
example_rl_easy(): +def example_rl(): sacred_ex_name = "train_rl" - run_name = "example-rl-easy" - n_seeds = 2 + run_name = "rl_tuning" + # n_seeds = 2 + base_named_configs = ["common.wandb_logging", "seals_half_cheetah"] + base_config_updates = { + "common": { + "wandb": {"wandb_kwargs": {"project": "algorithm-benchmark"}}, + "num_vec": 1, + }, + } search_space = { - "named_configs": tune.grid_search([[env] for env in EASY_ENVS]), + # "named_configs": tune.choice([[env] for env in EASY_ENVS]), "config_updates": { "rl": { + "batch_size": tune.choice([512, 1024, 2048, 4096, 8192]), "rl_kwargs": { - "learning_rate": tune.grid_search(np.logspace(3e-6, 1e-1, num=3)), - "nminibatches": tune.grid_search([16, 32, 64]), + "learning_rate": tune.loguniform(1e-5, 1e-2), + "batch_size": tune.choice([64, 128, 256, 512]), + "n_epochs": tune.choice([5, 10, 20]), }, }, }, } - resources_per_trial = dict(cpu=4) + num_samples = 100 + eval_best_trial = True + eval_trial_seeds = 5 + repeat = 1 + resources_per_trial = dict(cpu=1) @parallel_ex.named_config -def example_gail_easy(): +def example_bc(): + sacred_ex_name = "train_imitation" + run_name = "bc_tuning_hc" + base_named_configs = ["common.wandb_logging", "seals_half_cheetah"] + base_config_updates = { + # "common": {"wandb": {"wandb_kwargs": {"project": "algorithm-benchmark"}}}, + "common": {"num_vec": 1}, + } + search_space = { + "config_updates": { + "bc_kwargs": dict( + batch_size=tune.choice([8, 16, 32, 64]), + l2_weight=tune.loguniform(1e-6, 1e-2), # L2 regularization weight + optimizer_kwargs=dict( + lr=tune.loguniform(1e-5, 1e-2), + ), + ), + "bc_train_kwargs": dict( + n_epochs=tune.choice([1, 5, 10, 20]), + ), + }, + "command_name": "bc", + } + num_samples = 64 + eval_best_trial = True + eval_trial_seeds = 5 + repeat = 3 + resources_per_trial = dict(cpu=1) + + +@parallel_ex.named_config +def example_dagger(): + sacred_ex_name = "train_imitation" + run_name = "dagger_tuning_hc" + base_named_configs = 
["common.wandb_logging", "seals_half_cheetah"] + base_config_updates = { + # "common": {"wandb": {"wandb_kwargs": {"project": "algorithm-benchmark"}}}, + "common": {"num_vec": 1}, + "dagger": {"total_timesteps": 1e5}, + "bc_kwargs": { + "batch_size": 16, + "l2_weight": 1e-4, + "optimizer_kwargs": {"lr": 1e-3}, + }, + } + search_space = { + "config_updates": { + "bc_train_kwargs": dict( + n_epochs=tune.choice([1, 5, 10]), + ), + "dagger": dict( + beta_schedule=tune.choice( + [LinearBetaSchedule(i) for i in [1, 5, 15]] + + [ExponentialBetaSchedule(i) for i in [0.3, 0.5, 0.7]], + ), + rollout_round_min_episodes=tune.choice([3, 5, 10]), + ), + }, + "command_name": "dagger", + } + num_samples = 50 + repeat = 3 + eval_best_trial = True + eval_trial_seeds = 5 + resources_per_trial = dict(cpu=1) + + +@parallel_ex.named_config +def example_gail(): sacred_ex_name = "train_adversarial" - run_name = "example-gail-easy" - n_seeds = 1 + run_name = "gail_tuning_hc" + base_named_configs = ["common.wandb_logging"] + base_config_updates = { + "common": {"num_vec": 1}, + "total_timesteps": 1e7, + } search_space = { - "named_configs": tune.grid_search([[env] for env in EASY_ENVS]), + # "named_configs": tune.choice([[env] for env in MY_ENVS]), "config_updates": { - "init_trainer_kwargs": { - "rl": { - "rl_kwargs": { - "learning_rate": tune.grid_search( - np.logspace(3e-6, 1e-1, num=3), - ), - "nminibatches": tune.grid_search([16, 32, 64]), - }, + "algorithm_kwargs": dict( + demo_batch_size=tune.choice([32, 128, 512, 2048, 8192]), + n_disc_updates_per_round=tune.choice([8, 16]), + # both are same as rl.batch_size + # gen_replay_buffer_capacity=tune.choice([512, 1024]), + # gen_train_timesteps=0, + ), + "rl": { + "batch_size": tune.choice([4096, 8192, 16384]), + "rl_kwargs": { + "ent_coef": tune.loguniform(1e-7, 1e-3), + "learning_rate": tune.loguniform(1e-5, 1e-2), }, }, + "algorithm_specific": {}, }, + "command_name": "gail", + } + num_samples = 100 + eval_best_trial = True + 
eval_trial_seeds = 5 + repeat = 3 + # experiment_checkpoint_path = f"/home/taufeeque/ray_results/{run_name}" + resources_per_trial = dict(cpu=1) + + +@parallel_ex.named_config +def example_airl(): + sacred_ex_name = "train_adversarial" + run_name = "airl_tuning_hc" + # n_seeds = 1 + base_named_configs = ["common.wandb_logging"] + base_config_updates = { + "common": {"num_vec": 1}, + "total_timesteps": 1e7, } search_space = { - "command_name": "gail", + # "named_configs": tune.choice([[env] for env in MY_ENVS]), + "config_updates": { + "algorithm_kwargs": dict( + demo_batch_size=tune.choice([32, 128, 512, 2048, 8192]), + n_disc_updates_per_round=tune.choice([8, 16]), + # both are same as rl.batch_size + # gen_replay_buffer_capacity=tune.choice([512, 1024]), + # gen_train_timesteps=0, + ), + "rl": { + "batch_size": tune.choice([4096, 8192, 16384]), + "rl_kwargs": { + "ent_coef": tune.loguniform(1e-7, 1e-3), + "learning_rate": tune.loguniform(1e-5, 1e-2), + }, + }, + "algorithm_specific": {}, + }, + "command_name": "airl", + } + num_samples = 100 + eval_best_trial = True + eval_trial_seeds = 5 + repeat = 3 + # experiment_checkpoint_path = f"/home/taufeeque/ray_results/{run_name}" + resources_per_trial = dict(cpu=1) + + +@parallel_ex.named_config +def example_pc(): + sacred_ex_name = "train_preference_comparisons" + run_name = "pc_tuning" + base_named_configs = ["common.wandb_logging", "seals_half_cheetah"] + base_config_updates = { + "common": {"num_vec": 1}, + "total_timesteps": 2e7, + "total_comparisons": 5000, + "query_schedule": "hyperbolic", + "gatherer_kwargs": {"sample": True}, + } + search_space = { + "named_configs": tune.choice( + [ + ["reward.normalize_output_disable"], + # ["reward.normalize_output_running"], + # ["reward.normalize_output_ema"], + ], + ), + "config_updates": { + "train": { + "policy_kwargs": { + "activation_fn": tune.choice( + [ + nn.ReLU, + # nn.Tanh, + ], + ), + }, + }, + "num_iterations": tune.choice([25, 50]), + # 
"initial_comparison_frac": tune.choice([0.1, 0.25]), + # "reward_trainer_kwargs": { + # "epochs": tune.choice([1, 3, 6]), + # }, + # "query_schedule": tune.choice( + # ["constant", "hyperbolic", "inverse_quadratic"], + # ), + "rl": { + "batch_size": tune.choice([512, 2048, 8192]), + "rl_kwargs": { + "learning_rate": tune.loguniform(1e-5, 1e-2), + "ent_coef": tune.loguniform(1e-7, 1e-3), + }, + }, + }, } + num_samples = 24 + eval_best_trial = True + eval_trial_seeds = 5 + repeat = 3 + resources_per_trial = dict(cpu=1) + + +@parallel_ex.named_config +def debug_eval(): + sacred_ex_name = "train_preference_comparisons" + run_name = "debug_eval" + eval_trial_seeds = 2 + eval_best_trial = True + # base_named_configs = ["seals_half_cheetah"] + base_config_updates = { + "common": {"wandb": {"wandb_kwargs": {"project": "algorithm-benchmark"}}}, + "total_timesteps": 30, + "total_comparisons": 10, + # "query_schedule": "hyperbolic", + "num_iterations": 1, + "fragment_length": 2, + } + search_space = { + # "named_configs": tune.choice([[env] for env in MY_ENVS]), + "config_updates": { + # "num_iterations": tune.choice([5, 20, 50]), + "initial_comparison_frac": tune.choice([0.1, 0.2]), + # "reward_trainer_kwargs": { + # "epochs": tune.choice([1, 2, 3]), + # }, + # "query_schedule": tune.choice( + # ["constant", "hyperbolic", "inverse_quadratic"], + # ), + }, + } + resources_per_trial = dict(cpu=1) + + +@parallel_ex.named_config +def debug_eval_adv(): + sacred_ex_name = "train_adversarial" + run_name = "airl_tuning_debug" + # n_seeds = 5 + base_named_configs = [] + eval_best_trial = True + eval_trial_seeds = 2 + base_config_updates = { + "common": { + "wandb": {"wandb_kwargs": {"project": "algorithm-benchmark"}}, + # "num_env": 1, + }, + "total_timesteps": 2048, + } + search_space = { + # "named_configs": tune.choice([[env] for env in MY_ENVS]), + "config_updates": { + "algorithm_kwargs": dict( + # demo_batch_size=tune.choice([32, 128, 512, 2048, 8192]), + 
n_disc_updates_per_round=tune.choice([1, 2]), + # both are same as rl.batch_size + # gen_replay_buffer_capacity=tune.choice([512, 1024]), + # gen_train_timesteps=0, + ), + "rl": { + "batch_size": 8, + # "rl_kwargs": { + # "ent_coef": tune.choice([0, 1e-3, 1e-1]), + # "learning_rate": tune.loguniform(1e-5, 5e-3), + # }, + }, + "algorithm_specific": dict(demo_batch_size=1), + }, + "command_name": "airl", + } + num_samples = 2 + repeat = 2 + resources_per_trial = dict(cpu=8) + + +@parallel_ex.named_config +def debug_airl(): + sacred_ex_name = "train_adversarial" + run_name = "airl_debug" + # n_seeds = 1 + base_named_configs = ["common.wandb_logging", "seals_walker"] + base_config_updates = { + "common": {"num_vec": 8}, + "total_timesteps": 1e7, + } + search_space = { + # "named_configs": tune.choice([[env] for env in MY_ENVS]), + "config_updates": { + "train": { + "policy_kwargs": { + "activation_fn": tune.choice( + [ + nn.ReLU, + # nn.Tanh, + ], + ), + }, + }, + "algorithm_kwargs": dict( + demo_batch_size=tune.choice([32]), + n_disc_updates_per_round=tune.choice([10]), + # both are same as rl.batch_size + # gen_replay_buffer_capacity=tune.choice([512, 1024]), + # gen_train_timesteps=0, + ), + "rl": { + "batch_size": tune.choice([10000]), + "rl_kwargs": { + "ent_coef": tune.choice([0.1]), + "learning_rate": tune.choice([1e-4]), + }, + }, + "algorithm_specific": {}, + }, + "command_name": "airl", + } + num_samples = 1 + eval_best_trial = False + # eval_trial_seeds = 5 + repeat = 5 + # experiment_checkpoint_path = f"/home/taufeeque/ray_results/{run_name}" + resources_per_trial = dict(cpu=8) + + +# @parallel_ex.config_hook +# def config_hook(config, command_name, logger): +# """Sets env.""" +# del command_name, logger +# res = {} +# print(config) +# if config["env"]: +# res["base_named_configs"] = tuple( +# config["base_named_configs"] + [config["env"]] +# ) +# print(res) +# return res diff --git a/src/imitation/scripts/config/train_adversarial.py 
b/src/imitation/scripts/config/train_adversarial.py index aae3baeb0..bd9df6287 100644 --- a/src/imitation/scripts/config/train_adversarial.py +++ b/src/imitation/scripts/config/train_adversarial.py @@ -1,6 +1,7 @@ """Configuration for imitation.scripts.train_adversarial.""" import sacred +from torch import nn from imitation.rewards import reward_nets from imitation.scripts.ingredients import demonstrations, environment, expert @@ -98,9 +99,25 @@ def pendulum(): @train_adversarial_ex.named_config def seals_ant(): - locals().update(**MUJOCO_SHARED_LOCALS) - locals().update(**ANT_SHARED_LOCALS) + # locals().update(**MUJOCO_SHARED_LOCALS) + # locals().update(**ANT_SHARED_LOCALS) environment = dict(gym_id="seals/Ant-v0") + demonstrations = dict(rollout_type="ppo-huggingface") + rl = dict( + batch_size=2048, + rl_kwargs=dict( + batch_size=16, + clip_range=0.3, + ent_coef=3.1441389214159857e-06, + gae_lambda=0.8, + gamma=0.995, + learning_rate=0.00017959211641976886, + max_grad_norm=0.9, + n_epochs=10, + # policy_kwargs are same as the defaults + vf_coef=0.4351450387648799, + ), + ) CHEETAH_SHARED_LOCALS = dict( @@ -139,40 +156,145 @@ def half_cheetah(): @train_adversarial_ex.named_config def seals_half_cheetah(): - locals().update(**CHEETAH_SHARED_LOCALS) + # locals().update(**MUJOCO_SHARED_LOCALS) environment = dict(gym_id="seals/HalfCheetah-v0") + demonstrations = dict(rollout_type="ppo-huggingface") + rl = dict( + batch_size=512, + rl_kwargs=dict( + batch_size=64, + clip_range=0.1, + ent_coef=3.794797423594763e-06, + gae_lambda=0.95, + gamma=0.95, + learning_rate=0.0003286871805949382, + max_grad_norm=0.8, + n_epochs=5, + vf_coef=0.11483689492120866, + ), + ) + # algorithm_specific = dict( + # airl=dict(total_timesteps=int(5e6)), + # gail=dict(total_timesteps=int(8e6)), + # ) + # reward = dict( + # algorithm_specific=dict( + # airl=dict( + # net_cls=reward_nets.BasicShapedRewardNet, + # net_kwargs=dict( + # reward_hid_sizes=(32,), + # potential_hid_sizes=(32,), + # ), 
+ # ), + # ), + # ) + algorithm_kwargs = dict( + # Number of discriminator updates after each round of generator updates + n_disc_updates_per_round=16, + # Equivalent to no replay buffer if batch size is the same + gen_replay_buffer_capacity=512, + demo_batch_size=8192, + ) @train_adversarial_ex.named_config def seals_hopper(): - locals().update(**MUJOCO_SHARED_LOCALS) + # locals().update(**MUJOCO_SHARED_LOCALS) environment = dict(gym_id="seals/Hopper-v0") + demonstrations = dict(rollout_type="ppo-huggingface") + train = dict( + policy_cls="MlpPolicy", + policy_kwargs=dict( + activation_fn=nn.ReLU, + net_arch=[dict(pi=[64, 64], vf=[64, 64])], + ), + ) + rl = dict( + batch_size=2048, + rl_kwargs=dict( + batch_size=512, + clip_range=0.1, + ent_coef=0.0010159833764878474, + gae_lambda=0.98, + gamma=0.995, + learning_rate=0.0003904770450788824, + max_grad_norm=0.9, + n_epochs=20, + vf_coef=0.20315938606555833, + ), + ) @train_adversarial_ex.named_config -def seals_humanoid(): - locals().update(**MUJOCO_SHARED_LOCALS) - environment = dict(gym_id="seals/Humanoid-v0") - total_timesteps = int(4e6) +def seals_swimmer(): + # locals().update(**MUJOCO_SHARED_LOCALS) + environment = dict(gym_id="seals/Swimmer-v0") + total_timesteps = int(2e6) + demonstrations = dict(rollout_type="ppo-huggingface") + train = dict( + policy_cls="MlpPolicy", + policy_kwargs=dict( + activation_fn=nn.ReLU, + net_arch=[dict(pi=[64, 64], vf=[64, 64])], + ), + ) + rl = dict( + batch_size=2048, + rl_kwargs=dict( + batch_size=64, + clip_range=0.1, + ent_coef=5.167107294612664e-08, + gae_lambda=0.95, + gamma=0.999, + learning_rate=0.000414936134792374, + max_grad_norm=2, + n_epochs=5, + # policy_kwargs are same as the defaults + vf_coef=0.6162112311062333, + ), + ) @train_adversarial_ex.named_config -def reacher(): - environment = dict(gym_id="Reacher-v2") - algorithm_kwargs = {"allow_variable_horizon": True} +def seals_walker(): + # locals().update(**MUJOCO_SHARED_LOCALS) + environment = 
dict(gym_id="seals/Walker2d-v0") + demonstrations = dict(rollout_type="ppo-huggingface") + train = dict( + policy_cls="MlpPolicy", + policy_kwargs=dict( + activation_fn=nn.ReLU, + net_arch=[dict(pi=[64, 64], vf=[64, 64])], + ), + ) + rl = dict( + batch_size=8192, + rl_kwargs=dict( + batch_size=128, + clip_range=0.4, + ent_coef=0.00013057334805552262, + gae_lambda=0.92, + gamma=0.98, + learning_rate=0.000138575372312869, + max_grad_norm=0.6, + n_epochs=20, + # policy_kwargs are same as the defaults + vf_coef=0.6167177795726859, + ), + ) @train_adversarial_ex.named_config -def seals_swimmer(): +def seals_humanoid(): locals().update(**MUJOCO_SHARED_LOCALS) - environment = dict(gym_id="seals/Swimmer-v0") - total_timesteps = int(2e6) + environment = dict(gym_id="seals/Humanoid-v0") + total_timesteps = int(4e6) @train_adversarial_ex.named_config -def seals_walker(): - locals().update(**MUJOCO_SHARED_LOCALS) - environment = dict(gym_id="seals/Walker2d-v0") +def reacher(): + environment = dict(gym_id="Reacher-v2") + algorithm_kwargs = {"allow_variable_horizon": True} # Debug configs @@ -189,3 +311,22 @@ def fast(): demo_batch_size=1, n_disc_updates_per_round=4, ) + + +@train_adversarial_ex.named_config +def debug_nans(): + environment = {"wandb": {"wandb_kwargs": {"project": "algorithm-benchmark"}}} + total_timesteps = 1e7 + algorithm_kwargs = dict( + demo_batch_size=128, + n_disc_updates_per_round=8, + # both are same as rl.batch_size + # gen_replay_buffer_capacity=tune.choice([512, 1024]), + # gen_train_timesteps=0, + ) + rl = { + "batch_size": 4096, + "rl_kwargs": {"ent_coef": 0.1, "learning_rate": 7.316377404994506e-05}, + } + seed = 0 + checkpoint_interval = 1 diff --git a/src/imitation/scripts/config/train_imitation.py b/src/imitation/scripts/config/train_imitation.py index 16da9c694..23e24ec0b 100644 --- a/src/imitation/scripts/config/train_imitation.py +++ b/src/imitation/scripts/config/train_imitation.py @@ -38,6 +38,7 @@ def config(): dagger = dict( 
use_offline_rollouts=False, # warm-start policy with BC from offline demos total_timesteps=1e5, + rollout_round_min_episodes=None, # use default value ) agent_path = None # Path to load agent from, optional. @@ -81,6 +82,8 @@ def ant(): @train_imitation_ex.named_config def seals_ant(): environment = dict(gym_id="seals/Ant-v0") + demonstrations = dict(rollout_type="ppo-huggingface") + expert = {"policy_type": "ppo-huggingface"} @train_imitation_ex.named_config @@ -95,6 +98,29 @@ def seals_half_cheetah(): environment = dict(gym_id="seals/HalfCheetah-v0") bc_kwargs = dict(l2_weight=0.0) dagger = dict(total_timesteps=60000) + demonstrations = dict(rollout_type="ppo-huggingface") + expert = {"policy_type": "ppo-huggingface"} + + +@train_imitation_ex.named_config +def seals_hopper(): + environment = dict(gym_id="seals/Hopper-v0") + demonstrations = dict(rollout_type="ppo-huggingface") + expert = {"policy_type": "ppo-huggingface"} + + +@train_imitation_ex.named_config +def seals_swimmer(): + environment = dict(gym_id="seals/Swimmer-v0") + demonstrations = dict(rollout_type="ppo-huggingface") + expert = {"policy_type": "ppo-huggingface"} + + +@train_imitation_ex.named_config +def seals_walker(): + environment = dict(gym_id="seals/Walker2d-v0") + demonstrations = dict(rollout_type="ppo-huggingface") + expert = {"policy_type": "ppo-huggingface"} @train_imitation_ex.named_config diff --git a/src/imitation/scripts/config/train_preference_comparisons.py b/src/imitation/scripts/config/train_preference_comparisons.py index cf25f4783..d12869bf0 100644 --- a/src/imitation/scripts/config/train_preference_comparisons.py +++ b/src/imitation/scripts/config/train_preference_comparisons.py @@ -1,6 +1,7 @@ """Configuration for imitation.scripts.train_preference_comparisons.""" import sacred +from torch import nn from imitation.algorithms import preference_comparisons from imitation.scripts.ingredients import environment @@ -72,9 +73,24 @@ def cartpole(): 
@train_preference_comparisons_ex.named_config def seals_ant(): - locals().update(**MUJOCO_SHARED_LOCALS) - locals().update(**ANT_SHARED_LOCALS) + # locals().update(**MUJOCO_SHARED_LOCALS) + # locals().update(**ANT_SHARED_LOCALS) environment = dict(gym_id="seals/Ant-v0") + rl = dict( + batch_size=2048, + rl_kwargs=dict( + batch_size=16, + clip_range=0.3, + ent_coef=3.1441389214159857e-06, + gae_lambda=0.8, + gamma=0.995, + learning_rate=0.00017959211641976886, + max_grad_norm=0.9, + n_epochs=10, + # policy_kwargs are same as the defaults + vf_coef=0.4351450387648799, + ), + ) @train_preference_comparisons_ex.named_config @@ -84,10 +100,116 @@ def half_cheetah(): rl = dict(batch_size=16384, rl_kwargs=dict(batch_size=1024)) +@train_preference_comparisons_ex.named_config +def seals_half_cheetah(): + # locals().update(**MUJOCO_SHARED_LOCALS) + environment = dict(gym_id="seals/HalfCheetah-v0") + rl = dict( + batch_size=512, + rl_kwargs=dict( + batch_size=64, + clip_range=0.1, + ent_coef=3.794797423594763e-06, + gae_lambda=0.95, + gamma=0.95, + learning_rate=0.0003286871805949382, + max_grad_norm=0.8, + n_epochs=5, + vf_coef=0.11483689492120866, + ), + ) + num_iterations = 50 + total_timesteps = 20000000 + # train = dict( + # policy_cls="MlpPolicy", + # policy_kwargs=dict( + # activation_fn=nn.ReLU, + # # net_arch=[dict(pi=[64, 64], vf=[64, 64])], + # ), + # ) + + @train_preference_comparisons_ex.named_config def seals_hopper(): - locals().update(**MUJOCO_SHARED_LOCALS) + # locals().update(**MUJOCO_SHARED_LOCALS) environment = dict(gym_id="seals/Hopper-v0") + train = dict( + policy_cls="MlpPolicy", + policy_kwargs=dict( + activation_fn=nn.ReLU, + net_arch=[dict(pi=[64, 64], vf=[64, 64])], + ), + ) + rl = dict( + batch_size=2048, + rl_kwargs=dict( + batch_size=512, + clip_range=0.1, + ent_coef=0.0010159833764878474, + gae_lambda=0.98, + gamma=0.995, + learning_rate=0.0003904770450788824, + max_grad_norm=0.9, + n_epochs=20, + vf_coef=0.20315938606555833, + ), + ) + + 
+@train_preference_comparisons_ex.named_config +def seals_swimmer(): + # locals().update(**MUJOCO_SHARED_LOCALS) + environment = dict(gym_id="seals/Swimmer-v0") + train = dict( + policy_cls="MlpPolicy", + policy_kwargs=dict( + activation_fn=nn.ReLU, + net_arch=[dict(pi=[64, 64], vf=[64, 64])], + ), + ) + rl = dict( + batch_size=2048, + rl_kwargs=dict( + batch_size=64, + clip_range=0.1, + ent_coef=5.167107294612664e-08, + gae_lambda=0.95, + gamma=0.999, + learning_rate=0.000414936134792374, + max_grad_norm=2, + n_epochs=5, + # policy_kwargs are same as the defaults + vf_coef=0.6162112311062333, + ), + ) + + +@train_preference_comparisons_ex.named_config +def seals_walker(): + # locals().update(**MUJOCO_SHARED_LOCALS) + environment = dict(gym_id="seals/Walker2d-v0") + train = dict( + policy_cls="MlpPolicy", + policy_kwargs=dict( + activation_fn=nn.ReLU, + net_arch=[dict(pi=[64, 64], vf=[64, 64])], + ), + ) + rl = dict( + batch_size=8192, + rl_kwargs=dict( + batch_size=128, + clip_range=0.4, + ent_coef=0.00013057334805552262, + gae_lambda=0.92, + gamma=0.98, + learning_rate=0.000138575372312869, + max_grad_norm=0.6, + n_epochs=20, + # policy_kwargs are same as the defaults + vf_coef=0.6167177795726859, + ), + ) @train_preference_comparisons_ex.named_config diff --git a/src/imitation/scripts/config/train_rl.py b/src/imitation/scripts/config/train_rl.py index 6d48f8695..9df2581a6 100644 --- a/src/imitation/scripts/config/train_rl.py +++ b/src/imitation/scripts/config/train_rl.py @@ -1,6 +1,8 @@ """Configuration settings for train_rl, training a policy with RL.""" + import sacred +from torch import nn from imitation.scripts.ingredients import environment from imitation.scripts.ingredients import logging as logging_ingredient @@ -70,8 +72,30 @@ def cartpole(): @train_rl_ex.named_config def seals_cartpole(): - environment = dict(gym_id="seals/CartPole-v0") - total_timesteps = int(1e6) + environment = dict(gym_id="seals/CartPole-v0", num_vec=8) + total_timesteps = int(1e5) 
+ train = dict( + policy_cls="MlpPolicy", + policy_kwargs=dict( + activation_fn=nn.ReLU, + net_arch=[dict(pi=[64, 64], vf=[64, 64])], + ), + ) + normalize_reward = False + rl = dict( + batch_size=4096, + rl_kwargs=dict( + batch_size=256, + clip_range=0.4, + ent_coef=0.008508727919228772, + gae_lambda=0.9, + gamma=0.9999, + learning_rate=0.0012403278189645594, + max_grad_norm=0.8, + n_epochs=10, + vf_coef=0.489343896591493, + ), + ) @train_rl_ex.named_config @@ -80,9 +104,69 @@ def half_cheetah(): total_timesteps = int(5e6) # does OK after 1e6, but continues improving +@train_rl_ex.named_config +def seals_half_cheetah(): + environment = dict( + gym_id="seals/HalfCheetah-v0", + num_vec=1, + ) + + train = dict( + policy_cls="MlpPolicy", + policy_kwargs=dict( + activation_fn=nn.Tanh, + net_arch=[dict(pi=[64, 64], vf=[64, 64])], + ), + ) + # total_timesteps = int(5e6) # does OK after 1e6, but continues improving + total_timesteps = 1e6 + normalize_reward = False + + rl = dict( + batch_size=512, + rl_kwargs=dict( + batch_size=64, + clip_range=0.1, + ent_coef=3.794797423594763e-06, + gae_lambda=0.95, + gamma=0.95, + learning_rate=0.0003286871805949382, + max_grad_norm=0.8, + n_epochs=5, + vf_coef=0.11483689492120866, + ), + ) + + @train_rl_ex.named_config def seals_hopper(): - environment = dict(gym_id="seals/Hopper-v0") + environment = dict(gym_id="seals/Hopper-v0", num_vec=1) + train = dict( + policy_cls="MlpPolicy", + policy_kwargs=dict( + activation_fn=nn.ReLU, + net_arch=[dict(pi=[64, 64], vf=[64, 64])], + ), + ) + + total_timesteps = 1e6 + normalize_reward = False + + rl = dict( + batch_size=2048, + rl_kwargs=dict( + batch_size=512, + clip_range=0.1, + ent_coef=0.0010159833764878474, + gae_lambda=0.98, + gamma=0.995, + learning_rate=0.0003904770450788824, + max_grad_norm=0.9, + n_epochs=20, + # policy_kwargs are same as the defaults + vf_coef=0.20315938606555833, + ), + ) @train_rl_ex.named_config @@ -104,15 +188,34 @@ def seals_mountain_car(): 
@train_rl_ex.named_config def pendulum(): - environment = dict(gym_id="Pendulum-v1") + environment = dict(gym_id="Pendulum-v1", num_vec=4) + total_timesteps = int(1e5) + + train = dict( + policy_cls="MlpPolicy", + # policy_kwargs=dict( + # activation_fn=nn.Tanh, + # net_arch=[dict(pi=[64, 64], vf=[64, 64])], + # ), + ) + normalize_reward = False + rl = dict( - batch_size=4096, + batch_size=1024 * 4, rl_kwargs=dict( + gae_lambda=0.95, gamma=0.9, + n_epochs=10, + ent_coef=0.0, learning_rate=1e-3, + clip_range=0.2, + use_sde=True, + sde_sample_freq=4, + # batch_size=64, + # max_grad_norm=0.8, + # vf_coef=0.11483689492120866, ), ) - total_timesteps = int(2e5) @train_rl_ex.named_config @@ -122,17 +225,99 @@ def reacher(): @train_rl_ex.named_config def seals_ant(): - environment = dict(gym_id="seals/Ant-v0") + environment = dict( + gym_id="seals/Ant-v0", + num_vec=1, + ) + + train = dict( + policy_cls="MlpPolicy", + policy_kwargs=dict( + activation_fn=nn.Tanh, + net_arch=[dict(pi=[64, 64], vf=[64, 64])], + ), + ) + + total_timesteps = 1e6 + normalize_reward = False + + rl = dict( + batch_size=2048, + rl_kwargs=dict( + batch_size=16, + clip_range=0.3, + ent_coef=3.1441389214159857e-06, + gae_lambda=0.8, + gamma=0.995, + learning_rate=0.00017959211641976886, + max_grad_norm=0.9, + n_epochs=10, + # policy_kwargs are same as the defaults + vf_coef=0.4351450387648799, + ), + ) @train_rl_ex.named_config def seals_swimmer(): - environment = dict(gym_id="seals/Swimmer-v0") + environment = dict(gym_id="seals/Swimmer-v0", num_vec=1) + train = dict( + policy_cls="MlpPolicy", + policy_kwargs=dict( + activation_fn=nn.ReLU, + net_arch=[dict(pi=[64, 64], vf=[64, 64])], + ), + ) + + total_timesteps = 1e6 + normalize_reward = False + + rl = dict( + batch_size=2048, + rl_kwargs=dict( + batch_size=64, + clip_range=0.1, + ent_coef=5.167107294612664e-08, + gae_lambda=0.95, + gamma=0.999, + learning_rate=0.000414936134792374, + max_grad_norm=2, + n_epochs=5, + # policy_kwargs are same as the 
defaults + vf_coef=0.6162112311062333, + ), + ) @train_rl_ex.named_config def seals_walker(): - environment = dict(gym_id="seals/Walker2d-v0") + environment = dict(gym_id="seals/Walker2d-v0", num_vec=1) + train = dict( + policy_cls="MlpPolicy", + policy_kwargs=dict( + activation_fn=nn.ReLU, + net_arch=[dict(pi=[64, 64], vf=[64, 64])], + ), + ) + + total_timesteps = 1e6 + normalize_reward = False + + rl = dict( + batch_size=8192, + rl_kwargs=dict( + batch_size=128, + clip_range=0.4, + ent_coef=0.00013057334805552262, + gae_lambda=0.92, + gamma=0.98, + learning_rate=0.000138575372312869, + max_grad_norm=0.6, + n_epochs=20, + # policy_kwargs are same as the defaults + vf_coef=0.6167177795726859, + ), + ) # Debug configs diff --git a/src/imitation/scripts/ingredients/reward.py b/src/imitation/scripts/ingredients/reward.py index c40d3751f..a4bd98d1f 100644 --- a/src/imitation/scripts/ingredients/reward.py +++ b/src/imitation/scripts/ingredients/reward.py @@ -46,6 +46,11 @@ def normalize_output_running(): normalize_output_layer = networks.RunningNorm # noqa: F841 +@reward_ingredient.named_config +def normalize_output_ema(): + normalize_output_layer = networks.EMANorm # noqa: F841 + + @reward_ingredient.named_config def reward_ensemble(): net_cls = reward_nets.RewardEnsemble diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index 6014a08b6..c196954d1 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -2,12 +2,18 @@ import collections.abc import copy +import glob import pathlib -from typing import Any, Callable, Dict, Mapping, Optional, Sequence +from typing import Any, Callable, Dict, Mapping, Optional, Sequence, Union +import numpy as np import ray import ray.tune import sacred +from pandas.api.types import is_object_dtype +from ray.tune import search +from ray.tune.registry import register_trainable +from ray.tune.search import optuna from sacred.observers import FileStorageObserver from 
imitation.scripts.config.parallel import parallel_ex @@ -17,6 +23,7 @@ def parallel( sacred_ex_name: str, run_name: str, + num_samples: int, search_space: Mapping[str, Any], base_named_configs: Sequence[str], base_config_updates: Mapping[str, Any], @@ -24,6 +31,12 @@ def parallel( init_kwargs: Mapping[str, Any], local_dir: Optional[str], upload_dir: Optional[str], + repeat: int = 3, + eval_best_trial: bool = False, + eval_trial_seeds: int = 5, + experiment_checkpoint_path: str = "", + syncer=None, + resume: Union[str, bool] = False, ) -> None: """Parallelize multiple runs of another Sacred Experiment using Ray Tune. @@ -40,6 +53,7 @@ def parallel( under the 'experiment.name' key. This is equivalent to using the Sacred CLI '--name' option on the inner experiment. Offline analysis jobs can use this argument to group similar data. + num_samples: Number of times to sample from the hyperparameter space. search_space: A dictionary which can contain Ray Tune search objects like `ray.tune.grid_search` and `ray.tune.sample_from`, and is passed as the `config` argument to `ray.tune.run()`. After the @@ -62,6 +76,19 @@ def parallel( init_kwargs: Arguments to pass to `ray.init`. local_dir: `local_dir` argument to `ray.tune.run()`. upload_dir: `upload_dir` argument to `ray.tune.run()`. + repeat: Number of runs to repeat each trial for. + eval_best_trial: Whether to evaluate the trial with the best mean return + at the end of tuning on a different set of seeds. + eval_trial_seeds: Number of distinct seeds to evaluate the best trial on. + experiment_checkpoint_path: Path containing the checkpoints of a previous + experiment. ran using this script. Useful for resuming cancelled trials + of the experiments (using `resume`) or evaluating the best trial of the + experiment (using `eval_best_trial`). + resume: If true and `experiment_checkpoint_path` is given, then resumes the + experiment by restarting the trials that did not finish in the experiment + checkpoint path. 
+ syncer: `syncer` argument to `ray.tune.syncer.SyncConfig`. + Raises: TypeError: Named configs not string sequences or config updates not mappings. @@ -73,8 +100,8 @@ def parallel( if not isinstance(base_config_updates, collections.abc.Mapping): raise TypeError("base_config_updates must be a Mapping") - if not isinstance(search_space["named_configs"], collections.abc.Sequence): - raise TypeError('search_space["named_configs"] must be a Sequence') + # if not isinstance(search_space["named_configs"], collections.abc.Sequence): + # raise TypeError('search_space["named_configs"] must be a Sequence') if not isinstance(search_space["config_updates"], collections.abc.Mapping): raise TypeError('search_space["config_updates"] must be a Mapping') @@ -95,15 +122,104 @@ def parallel( ) ray.init(**init_kwargs) + search_alg = optuna.OptunaSearch() + search_alg = search.Repeater(search_alg, repeat=repeat) try: - ray.tune.run( - trainable, - config=search_space, - name=run_name, - local_dir=local_dir, - resources_per_trial=resources_per_trial, - sync_config=ray.tune.syncer.SyncConfig(upload_dir=upload_dir), + if experiment_checkpoint_path: + if resume: + register_trainable("inner", trainable) + runner = ray.tune.execution.trial_runner.TrialRunner( + local_checkpoint_dir=experiment_checkpoint_path, + sync_config=ray.tune.syncer.SyncConfig( + upload_dir=upload_dir, + syncer=syncer, + ), + metric="mean_return", + resume=resume, + ) + print( + "Live trials:", len(runner._live_trials), "/", len(runner._trials) + ) + while not runner.is_finished(): + runner.step() + print("Debug:", runner.debug_string()) + + result = ray.tune.ExperimentAnalysis(experiment_checkpoint_path) + result._load_checkpoints_from_latest( + glob.glob(experiment_checkpoint_path + "/experiment_state*.json"), + ) + result.trials = None + result.fetch_trial_dataframes() + else: + result = ray.tune.run( + trainable, + config=search_space, + num_samples=num_samples * repeat, + name=run_name, + local_dir=local_dir, + 
resources_per_trial=resources_per_trial, + sync_config=ray.tune.syncer.SyncConfig( + upload_dir=upload_dir, + syncer=syncer, + ), + search_alg=search_alg, + metric="mean_return", + mode="max", + ) + + key = ( + "rollout/" + if sacred_ex_name == "train_preference_comparisons" + else "" + if sacred_ex_name == "train_rl" + else "imit_stats/" ) + key += "monitor_return_mean" + if eval_best_trial: + df = result.results_df + df = df[df["config/named_configs"].notna()] + for col in df.columns: + if is_object_dtype(df[col]): + df[col] = df[col].astype("str") + + grp_keys = [ + c for c in df.columns if c.startswith("config") and "seed" not in c + ] + grps = df.groupby(grp_keys) + print(grps[key]) + df["mean_return"] = grps[key].transform(lambda x: x.mean()) + best_config_df = df[df["mean_return"] == df["mean_return"].max()] + envs_processed = set() + for i, row in best_config_df.iterrows(): + tag = row["experiment_tag"] + trial = [t for t in result.trials if tag in t.experiment_tag][0] + best_config = trial.config + env = tuple(best_config["named_configs"]) + if env in envs_processed: + continue + envs_processed.add(env) + print("Named configs:", env) + print("Mean return:", row["mean_return"]) + print("All returns:", df[df["mean_return"] == row["mean_return"]][key]) + print("Total seeds:", (df["mean_return"] == row["mean_return"]).sum()) + best_config["config_updates"].update( + seed=ray.tune.grid_search(list(range(100, 100 + eval_trial_seeds))), + ) + resources_per_trial = {k: 2 * v for k, v in resources_per_trial.items()} + eval_result = ray.tune.run( + trainable, + config={ + "named_configs": best_config["named_configs"], + "config_updates": best_config["config_updates"], + "command_name": best_config.get("command_name", None), + }, + name=run_name + "_best_hp_eval", + resources_per_trial=resources_per_trial, + ) + returns = eval_result.results_df["mean_return"].to_numpy() + print("Returns:", returns) + print(np.mean(returns), np.std(returns)) + finally: ray.shutdown() 
@@ -148,7 +264,7 @@ def _ray_tune_sacred_wrapper( `ex.run`) and `reporter`. The function returns the run result. """ - def inner(config: Mapping[str, Any], reporter) -> Mapping[str, Any]: + def inner(config: Dict[str, Any], reporter) -> Mapping[str, Any]: """Trainable function with the correct signature for `ray.tune`. Args: @@ -169,11 +285,17 @@ def inner(config: Mapping[str, Any], reporter) -> Mapping[str, Any]: # Import inside function rather than in module because Sacred experiments # are not picklable, and Ray requires this function to be picklable. from imitation.scripts.train_adversarial import train_adversarial_ex + from imitation.scripts.train_imitation import train_imitation_ex + from imitation.scripts.train_preference_comparisons import ( + train_preference_comparisons_ex, + ) from imitation.scripts.train_rl import train_rl_ex experiments = { "train_rl": train_rl_ex, "train_adversarial": train_adversarial_ex, + "train_imitation": train_imitation_ex, + "train_preference_comparisons": train_preference_comparisons_ex, } ex = experiments[sacred_ex_name] @@ -181,22 +303,28 @@ def inner(config: Mapping[str, Any], reporter) -> Mapping[str, Any]: named_configs = base_named_configs + run_kwargs["named_configs"] updated_run_kwargs["named_configs"] = named_configs - config_updates = {**base_config_updates, **run_kwargs["config_updates"]} + config_updates: Mapping[str, Any] = {} + config_updates.update(base_config_updates) + config_updates.update(run_kwargs["config_updates"]) + if "__trial_index__" in run_kwargs: + config_updates.update(seed=run_kwargs.pop("__trial_index__")) updated_run_kwargs["config_updates"] = config_updates # Add other run_kwargs items to updated_run_kwargs. 
for k, v in run_kwargs.items(): if k not in updated_run_kwargs: updated_run_kwargs[k] = v - - run = ex.run( - **updated_run_kwargs, - options={"--run": run_name, "--file_storage": "sacred"}, - ) - + run = ex.run(**updated_run_kwargs, options={"--run": run_name}) # Ray Tune has a string formatting error if raylet completes without # any calls to `reporter`. - reporter(done=True) + # reporter(done=True) + # if sacred_ex_name == "train_preference_comparisons": + # #reporter(mean_return=run.result["rollout"]["monitor_return_mean"]) + # #ray.tune.report(mean_return=run.result["rollout"]["monitor_return_mean"]) + # ray.tune.report(mean_return=234) + # else: + # # reporter(mean_return=run.result["imit_stats"]["monitor_return_mean"]) + # ray.tune.report(mean_return=run.result["imit_stats"]["monitor_return_mean"]) assert run.status == "COMPLETED" return run.result diff --git a/src/imitation/scripts/train_adversarial.py b/src/imitation/scripts/train_adversarial.py index 71fc0c2c9..58f7fb4c4 100644 --- a/src/imitation/scripts/train_adversarial.py +++ b/src/imitation/scripts/train_adversarial.py @@ -162,6 +162,7 @@ def callback(round_num: int, /) -> None: return { "imit_stats": imit_stats, "expert_stats": rollout.rollout_stats(expert_trajs), + "mean_return": imit_stats["monitor_return_mean"], } diff --git a/src/imitation/scripts/train_imitation.py b/src/imitation/scripts/train_imitation.py index 2b4946668..c5673fa3e 100644 --- a/src/imitation/scripts/train_imitation.py +++ b/src/imitation/scripts/train_imitation.py @@ -125,10 +125,12 @@ def train_imitation( expert_policy=expert_policy, custom_logger=custom_logger, bc_trainer=bc_trainer, + beta_schedule=dagger["beta_schedule"], rng=_rnd, ) model.train( total_timesteps=int(dagger["total_timesteps"]), + rollout_round_min_episodes=dagger["rollout_round_min_episodes"], bc_train_kwargs=bc_train_kwargs, ) # TODO(adam): add checkpointing to DAgger? 
@@ -141,7 +143,7 @@ def train_imitation( imit_stats = train.eval_policy(imit_policy, venv) - stats = {"imit_stats": imit_stats} + stats = {"imit_stats": imit_stats, "mean_return": imit_stats["monitor_return_mean"]} trajectories = model._all_demos if use_dagger else expert_trajs assert trajectories is not None if all(isinstance(t, types.TrajectoryWithRew) for t in trajectories): diff --git a/src/imitation/scripts/train_preference_comparisons.py b/src/imitation/scripts/train_preference_comparisons.py index e1aab27ff..1daa306af 100644 --- a/src/imitation/scripts/train_preference_comparisons.py +++ b/src/imitation/scripts/train_preference_comparisons.py @@ -268,6 +268,7 @@ def save_callback(iteration_num): if bool(trajectory_path is None): results = dict(results) results["rollout"] = train.eval_policy(agent, venv) + results["mean_return"] = results["rollout"]["monitor_return_mean"] if save_preferences: main_trainer.dataset.save(log_dir / "preferences.pkl") diff --git a/src/imitation/scripts/train_rl.py b/src/imitation/scripts/train_rl.py index fd345ca62..a88e6096a 100644 --- a/src/imitation/scripts/train_rl.py +++ b/src/imitation/scripts/train_rl.py @@ -157,7 +157,9 @@ def train_rl( serialize.save_stable_model(output_dir, rl_algo) # Final evaluation of expert policy. 
- return train.eval_policy(rl_algo, venv) + eval_stats = train.eval_policy(rl_algo, venv) + eval_stats["mean_return"] = eval_stats["monitor_return_mean"] + return eval_stats def main_console(): diff --git a/tests/algorithms/test_dagger.py b/tests/algorithms/test_dagger.py index 549e38fd2..6cc42bc78 100644 --- a/tests/algorithms/test_dagger.py +++ b/tests/algorithms/test_dagger.py @@ -33,7 +33,7 @@ def maybe_pendulum_expert_trajectories( return None -def test_beta_schedule(): +def test_linear_beta_schedule(): one_step_sched = dagger.LinearBetaSchedule(1) three_step_sched = dagger.LinearBetaSchedule(3) for i in range(10): @@ -41,6 +41,29 @@ def test_beta_schedule(): assert np.allclose(three_step_sched(i), (3 - i) / 3 if i <= 2 else 0) +def test_indicator_beta_schedule(): + one_step_sched = dagger.IndicatorBetaSchedule(1) + three_step_sched = dagger.IndicatorBetaSchedule(3) + for i in range(10): + assert np.allclose(one_step_sched(i), 1 if i == 0 else 0) + assert np.allclose(three_step_sched(i), 1 if i <= 2 else 0) + + +def test_exponential_beta_schedule(): + constant_sched = dagger.ExponentialBetaSchedule(1) + decay = 0.5 + decaying_sched = dagger.ExponentialBetaSchedule(decay) + for i in range(10): + assert np.allclose(constant_sched(i), 1) + assert np.allclose(decaying_sched(i), decay**i) + + with pytest.raises( + ValueError, + match=r"decay_probability lies outside the range \(0, 1\]\.", + ): + decaying_sched = dagger.ExponentialBetaSchedule(1.1) + + def test_traj_collector_seed(tmpdir, pendulum_venv, rng): collector = dagger.InteractiveTrajectoryCollector( venv=pendulum_venv, diff --git a/tests/scripts/test_scripts.py b/tests/scripts/test_scripts.py index 2196b4af1..0a2766dbb 100644 --- a/tests/scripts/test_scripts.py +++ b/tests/scripts/test_scripts.py @@ -375,7 +375,10 @@ def bc_config(tmpdir, request): policy_type="ppo", loader_kwargs=dict(path=CARTPOLE_TEST_POLICY_PATH / "model.zip"), ), - expert_from_huggingface=dict(policy_type="ppo-huggingface"), + 
expert_from_huggingface=dict( + policy_type="ppo-huggingface", + loader_kwargs=dict(env_id="seals/CartPole-v0"), + ), random_expert=dict(policy_type="random"), zero_expert=dict(policy_type="zero"), )[request.param] @@ -403,7 +406,10 @@ def test_train_bc_warmstart(tmpdir): config_updates=dict( logging=dict(log_root=tmpdir), demonstrations=dict(rollout_path=CARTPOLE_TEST_ROLLOUT_PATH), - expert=dict(policy_type="ppo-huggingface"), + expert=dict( + policy_type="ppo-huggingface", + loader_kwargs=dict(env_id="seals/CartPole-v0"), + ), ), ) assert run.status == "COMPLETED" @@ -559,6 +565,27 @@ def test_train_adversarial(tmpdir, named_configs, command): _check_train_ex_result(run.result) +def test_train_adversarial_debug(): + """Smoke test for imitation.scripts.train_adversarial.""" + named_configs = ["seals_ant", "debug_nans"] + config_updates = { + "common": dict(log_root="/home/tf/imitation/debug", parallel=False), + "demonstrations": dict( + rollout_path="/home/tf/imitation/download/final.pkl", + ), + # TensorBoard logs to get extra coverage + # "algorithm_kwargs": dict(init_tensorboard=True), + "agent_path": "/home/tf/imitation/download/01124/gen_policy", + } + run = train_adversarial.train_adversarial_ex.run( + command_name="airl", + named_configs=named_configs, + config_updates=config_updates, + ) + assert run.status == "COMPLETED" + _check_train_ex_result(run.result) + + @pytest.mark.parametrize("command", ("airl", "gail")) def test_train_adversarial_warmstart(tmpdir, command): named_configs = ["cartpole"] + ALGO_FAST_CONFIGS["adversarial"] From 97bc063e72e6fc769222351d954f68be28cf761f Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Tue, 10 Jan 2023 15:56:14 +0530 Subject: [PATCH 02/47] Clean parallel script --- src/imitation/scripts/parallel.py | 54 +++++++++++++++++++------------ 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index c196954d1..da492804e 
100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -27,12 +27,13 @@ def parallel( search_space: Mapping[str, Any], base_named_configs: Sequence[str], base_config_updates: Mapping[str, Any], - resources_per_trial: Mapping[str, Any], + resources_per_trial: Dict[str, Any], init_kwargs: Mapping[str, Any], local_dir: Optional[str], upload_dir: Optional[str], repeat: int = 3, eval_best_trial: bool = False, + eval_best_trial_resource_multiplier: int = 2, eval_trial_seeds: int = 5, experiment_checkpoint_path: str = "", syncer=None, @@ -79,6 +80,8 @@ def parallel( repeat: Number of runs to repeat each trial for. eval_best_trial: Whether to evaluate the trial with the best mean return at the end of tuning on a different set of seeds. + eval_best_trial_resource_multiplier: factor by which to multiply the + number of cpus per trial in `resources_per_trial`. eval_trial_seeds: Number of distinct seeds to evaluate the best trial on. experiment_checkpoint_path: Path containing the checkpoints of a previous experiment. ran using this script. 
Useful for resuming cancelled trials @@ -122,11 +125,11 @@ def parallel( ) ray.init(**init_kwargs) - search_alg = optuna.OptunaSearch() - search_alg = search.Repeater(search_alg, repeat=repeat) + search_alg = search.Repeater(optuna.OptunaSearch(), repeat=repeat) try: if experiment_checkpoint_path: if resume: + # restart failed runs from experiment_checkpoint_path register_trainable("inner", trainable) runner = ray.tune.execution.trial_runner.TrialRunner( local_checkpoint_dir=experiment_checkpoint_path, @@ -138,16 +141,21 @@ def parallel( resume=resume, ) print( - "Live trials:", len(runner._live_trials), "/", len(runner._trials) + "Live trials:", + len(runner._live_trials), + "/", + len(runner._trials), ) while not runner.is_finished(): runner.step() print("Debug:", runner.debug_string()) + # load experiment analysis results result = ray.tune.ExperimentAnalysis(experiment_checkpoint_path) result._load_checkpoints_from_latest( glob.glob(experiment_checkpoint_path + "/experiment_state*.json"), ) + # update result.trials using all the experiment_state json files result.trials = None result.fetch_trial_dataframes() else: @@ -167,45 +175,50 @@ def parallel( mode="max", ) - key = ( + key_prefix = ( "rollout/" if sacred_ex_name == "train_preference_comparisons" else "" if sacred_ex_name == "train_rl" else "imit_stats/" ) - key += "monitor_return_mean" + key = key_prefix + "monitor_return_mean" if eval_best_trial: df = result.results_df df = df[df["config/named_configs"].notna()] + # convert object dtype to str required by df.groupby for col in df.columns: if is_object_dtype(df[col]): df[col] = df[col].astype("str") - + # group into separate HP configs grp_keys = [ c for c in df.columns if c.startswith("config") and "seed" not in c ] grps = df.groupby(grp_keys) - print(grps[key]) + # store mean return of runs across all seeds in a group df["mean_return"] = grps[key].transform(lambda x: x.mean()) best_config_df = df[df["mean_return"] == df["mean_return"].max()] - 
envs_processed = set() - for i, row in best_config_df.iterrows(): - tag = row["experiment_tag"] - trial = [t for t in result.trials if tag in t.experiment_tag][0] + row = best_config_df.loc[0] + best_config_tag = row["experiment_tag"] + if result.trials is not None: + trial = [ + t for t in result.trials if best_config_tag in t.experiment_tag + ][0] best_config = trial.config - env = tuple(best_config["named_configs"]) - if env in envs_processed: - continue - envs_processed.add(env) - print("Named configs:", env) print("Mean return:", row["mean_return"]) print("All returns:", df[df["mean_return"] == row["mean_return"]][key]) print("Total seeds:", (df["mean_return"] == row["mean_return"]).sum()) best_config["config_updates"].update( seed=ray.tune.grid_search(list(range(100, 100 + eval_trial_seeds))), ) - resources_per_trial = {k: 2 * v for k, v in resources_per_trial.items()} + # update cpus per trial only if it is provided in `resources_per_trial` + # Uses the default values (cpu=1) if it is not provided + if "cpu" in resources_per_trial: + resources_per_trial["cpu"] *= eval_best_trial_resource_multiplier + best_config["config_updates"].update( + environment=dict(num_vec=resources_per_trial["cpu"]), + ) + eval_result = ray.tune.run( trainable, config={ @@ -219,7 +232,6 @@ def parallel( returns = eval_result.results_df["mean_return"].to_numpy() print("Returns:", returns) print(np.mean(returns), np.std(returns)) - finally: ray.shutdown() @@ -229,7 +241,7 @@ def _ray_tune_sacred_wrapper( run_name: str, base_named_configs: list, base_config_updates: Mapping[str, Any], -) -> Callable[[Mapping[str, Any], Any], Mapping[str, Any]]: +) -> Callable[[Dict[str, Any], Any], Mapping[str, Any]]: """From an Experiment build a wrapped run function suitable for Ray Tune. 
`ray.tune.run(...)` expects a trainable function that takes a dict @@ -303,7 +315,7 @@ def inner(config: Dict[str, Any], reporter) -> Mapping[str, Any]: named_configs = base_named_configs + run_kwargs["named_configs"] updated_run_kwargs["named_configs"] = named_configs - config_updates: Mapping[str, Any] = {} + config_updates: Dict[str, Any] = {} config_updates.update(base_config_updates) config_updates.update(run_kwargs["config_updates"]) if "__trial_index__" in run_kwargs: From 92912256816e51ce6e4266ac80ed990c6416493d Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Thu, 26 Jan 2023 15:18:04 +0100 Subject: [PATCH 03/47] Undo the changes from #653 to the dagger benchmark config files. This change just made some error messages go away indicating the missing imitation.algorithms.dagger.ExponentialBetaSchedule but it did not fix the root cause. --- benchmarking/example_dagger_seals_ant_best_hp_eval.json | 2 +- .../example_dagger_seals_half_cheetah_best_hp_eval.json | 2 +- benchmarking/example_dagger_seals_hopper_best_hp_eval.json | 2 +- benchmarking/example_dagger_seals_swimmer_best_hp_eval.json | 2 +- benchmarking/example_dagger_seals_walker_best_hp_eval.json | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/benchmarking/example_dagger_seals_ant_best_hp_eval.json b/benchmarking/example_dagger_seals_ant_best_hp_eval.json index 035beab83..38f3f504a 100644 --- a/benchmarking/example_dagger_seals_ant_best_hp_eval.json +++ b/benchmarking/example_dagger_seals_ant_best_hp_eval.json @@ -16,7 +16,7 @@ }, "dagger": { "beta_schedule": { - "py/type": "imitation.algorithms.dagger.LinearBetaSchedule", + "py/object": "imitation.algorithms.dagger.LinearBetaSchedule", "rampdown_rounds": 15 }, "rollout_round_min_episodes": 5, diff --git a/benchmarking/example_dagger_seals_half_cheetah_best_hp_eval.json b/benchmarking/example_dagger_seals_half_cheetah_best_hp_eval.json index 8961f8c26..708c92547 100644 --- 
a/benchmarking/example_dagger_seals_half_cheetah_best_hp_eval.json +++ b/benchmarking/example_dagger_seals_half_cheetah_best_hp_eval.json @@ -17,7 +17,7 @@ "dagger": { "beta_schedule": { "decay_probability": 0.7, - "py/type": "imitation.algorithms.dagger.ExponentialBetaSchedule" + "py/object": "imitation.algorithms.dagger.ExponentialBetaSchedule" }, "rollout_round_min_episodes": 5, "total_timesteps": 60000, diff --git a/benchmarking/example_dagger_seals_hopper_best_hp_eval.json b/benchmarking/example_dagger_seals_hopper_best_hp_eval.json index fe47291e0..001479ec3 100644 --- a/benchmarking/example_dagger_seals_hopper_best_hp_eval.json +++ b/benchmarking/example_dagger_seals_hopper_best_hp_eval.json @@ -17,7 +17,7 @@ "dagger": { "beta_schedule": { "decay_probability": 0.7, - "py/type": "imitation.algorithms.dagger.ExponentialBetaSchedule" + "py/object": "imitation.algorithms.dagger.ExponentialBetaSchedule" }, "rollout_round_min_episodes": 10, "total_timesteps": 100000, diff --git a/benchmarking/example_dagger_seals_swimmer_best_hp_eval.json b/benchmarking/example_dagger_seals_swimmer_best_hp_eval.json index 2e6cba2c0..df1606fca 100644 --- a/benchmarking/example_dagger_seals_swimmer_best_hp_eval.json +++ b/benchmarking/example_dagger_seals_swimmer_best_hp_eval.json @@ -16,7 +16,7 @@ }, "dagger": { "beta_schedule": { - "py/type": "imitation.algorithms.dagger.LinearBetaSchedule", + "py/object": "imitation.algorithms.dagger.LinearBetaSchedule", "rampdown_rounds": 15 }, "rollout_round_min_episodes": 3, diff --git a/benchmarking/example_dagger_seals_walker_best_hp_eval.json b/benchmarking/example_dagger_seals_walker_best_hp_eval.json index e4569321f..ce6baff1c 100644 --- a/benchmarking/example_dagger_seals_walker_best_hp_eval.json +++ b/benchmarking/example_dagger_seals_walker_best_hp_eval.json @@ -17,7 +17,7 @@ "dagger": { "beta_schedule": { "decay_probability": 0.7, - "py/type": "imitation.algorithms.dagger.ExponentialBetaSchedule" + "py/object": 
"imitation.algorithms.dagger.ExponentialBetaSchedule" }, "rollout_round_min_episodes": 5, "total_timesteps": 100000, From 276d863f488512067c38408ecf1386e8199abf50 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Wed, 25 Jan 2023 17:08:27 +0100 Subject: [PATCH 04/47] Improve readability and interpretability of benchmarking tests. --- tests/test_benchmarking.py | 51 ++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 19 deletions(-) diff --git a/tests/test_benchmarking.py b/tests/test_benchmarking.py index 5c42063c6..67b9eb489 100644 --- a/tests/test_benchmarking.py +++ b/tests/test_benchmarking.py @@ -1,6 +1,4 @@ """Tests for config files in benchmarking/ folder.""" -import glob -import os import pathlib import pytest @@ -10,24 +8,39 @@ THIS_DIR = pathlib.Path(__file__).absolute().parent BENCHMARKING_DIR = THIS_DIR.parent / "benchmarking" +ALGORITHMS = ["bc", "dagger", "airl", "gail"] +ENVIRONMENTS = [ + "seals_walker", + "seals_ant", + "seals_half_cheetah", + "seals_hopper", + "seals_swimmer", +] -@pytest.mark.parametrize( - "command_name", - ["bc", "dagger", "airl", "gail"], -) -def test_benchmarking_configs(tmpdir, command_name): + +@pytest.mark.parametrize("environment", ENVIRONMENTS) +@pytest.mark.parametrize("algorithm", ALGORITHMS) +def test_benchmarks_print_config_succeeds(algorithm: str, environment: str): # We test the configs using the print_config command, # because running the configs requires MuJoCo. # Requiring MuJoCo to run the tests adds too much complexity. - if command_name in ("bc", "dagger"): - ex = train_imitation.train_imitation_ex - elif command_name in ("airl", "gail"): - ex = train_adversarial.train_adversarial_ex - cfg_pattern = os.path.join(BENCHMARKING_DIR, f"example_{command_name}_*.json") - cfg_files = glob.glob(cfg_pattern) - assert len(cfg_files) == 5, "There should be 1 config file for each of environment." 
- for i, cfg_file in enumerate(cfg_files): - cfg_name = f"{tmpdir.basename}_{i}" - ex.add_named_config(cfg_name, cfg_file) - run = ex.run(command_name="print_config", named_configs=[cfg_name]) - assert run.status == "COMPLETED" + + # GIVEN + if algorithm in ("bc", "dagger"): + experiment = train_imitation.train_imitation_ex + elif algorithm in ("airl", "gail"): + experiment = train_adversarial.train_adversarial_ex + else: + raise ValueError(f"Unknown algorithm: {algorithm}") + + config_name = f"{algorithm}_{environment}" + config_file = str( + BENCHMARKING_DIR / f"example_{algorithm}_{environment}_best_hp_eval.json", + ) + + # WHEN + experiment.add_named_config(config_name, config_file) + run = experiment.run(command_name="print_config", named_configs=[config_name]) + + # THEN + assert run.status == "COMPLETED" From 37eb914cba0aaa416543b763b6f2246eae8f9fa7 Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Wed, 1 Mar 2023 21:48:13 +0530 Subject: [PATCH 05/47] Add exponential beta scheduler for dagger --- src/imitation/algorithms/dagger.py | 29 +++++++++++++++++++ .../scripts/config/train_imitation.py | 1 + src/imitation/scripts/train_imitation.py | 1 + 3 files changed, 31 insertions(+) diff --git a/src/imitation/algorithms/dagger.py b/src/imitation/algorithms/dagger.py index d43ca5eec..34d8cef7e 100644 --- a/src/imitation/algorithms/dagger.py +++ b/src/imitation/algorithms/dagger.py @@ -66,6 +66,35 @@ def __call__(self, round_num: int) -> float: return min(1, max(0, (self.rampdown_rounds - round_num) / self.rampdown_rounds)) +class ExponentialBetaSchedule(BetaSchedule): + """Exponentially decaying schedule for beta.""" + + def __init__(self, decay_probability: float): + """Builds ExponentialBetaSchedule. + + Args: + decay_probability: the decay factor for beta. + + Raises: + ValueError: if `decay_probability` not within (0, 1]. 
+ """ + if not (0 < decay_probability <= 1): + raise ValueError("decay_probability lies outside the range (0, 1].") + self.decay_probability = decay_probability + + def __call__(self, round_num: int) -> float: + """Computes beta value. + + Args: + round_num: the current round number. + + Returns: + beta as `self.decay_probability ^ round_num` + """ + assert round_num >= 0 + return self.decay_probability**round_num + + def reconstruct_trainer( scratch_dir: types.AnyPath, venv: vec_env.VecEnv, diff --git a/src/imitation/scripts/config/train_imitation.py b/src/imitation/scripts/config/train_imitation.py index 16da9c694..2ef2eed44 100644 --- a/src/imitation/scripts/config/train_imitation.py +++ b/src/imitation/scripts/config/train_imitation.py @@ -38,6 +38,7 @@ def config(): dagger = dict( use_offline_rollouts=False, # warm-start policy with BC from offline demos total_timesteps=1e5, + beta_schedule=None, ) agent_path = None # Path to load agent from, optional. diff --git a/src/imitation/scripts/train_imitation.py b/src/imitation/scripts/train_imitation.py index 2b4946668..f8cc992fd 100644 --- a/src/imitation/scripts/train_imitation.py +++ b/src/imitation/scripts/train_imitation.py @@ -125,6 +125,7 @@ def train_imitation( expert_policy=expert_policy, custom_logger=custom_logger, bc_trainer=bc_trainer, + beta_schedule=dagger["beta_schedule"], rng=_rnd, ) model.train( From 877383b03d7d3260746997f3cab7b5272125b07b Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Thu, 2 Feb 2023 13:00:06 +0100 Subject: [PATCH 06/47] Ignore coverage for unknown algorithms. 
--- tests/test_benchmarking.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_benchmarking.py b/tests/test_benchmarking.py index 67b9eb489..ba01b38a2 100644 --- a/tests/test_benchmarking.py +++ b/tests/test_benchmarking.py @@ -31,7 +31,7 @@ def test_benchmarks_print_config_succeeds(algorithm: str, environment: str): elif algorithm in ("airl", "gail"): experiment = train_adversarial.train_adversarial_ex else: - raise ValueError(f"Unknown algorithm: {algorithm}") + raise ValueError(f"Unknown algorithm: {algorithm}") # pragma: no cover config_name = f"{algorithm}_{environment}" config_file = str( From c8e55cb1efee3913bf306c23f6a5c361674d7380 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Thu, 2 Feb 2023 13:04:02 +0100 Subject: [PATCH 07/47] Cleanup and extend tests for beta schedules in dagger. --- tests/algorithms/test_dagger.py | 39 ++++++++++++++++++++++++++++----- 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/tests/algorithms/test_dagger.py b/tests/algorithms/test_dagger.py index 525fc449a..6e5582810 100644 --- a/tests/algorithms/test_dagger.py +++ b/tests/algorithms/test_dagger.py @@ -33,12 +33,39 @@ def maybe_pendulum_expert_trajectories( return None -def test_beta_schedule(): - one_step_sched = dagger.LinearBetaSchedule(1) - three_step_sched = dagger.LinearBetaSchedule(3) - for i in range(10): - assert np.allclose(one_step_sched(i), 1 if i == 0 else 0) - assert np.allclose(three_step_sched(i), (3 - i) / 3 if i <= 2 else 0) +@pytest.mark.parametrize("num_rampdown_rounds", [1, 2, 3, 10]) +def test_linear_beta_schedule(num_rampdown_rounds): + # GIVEN + sched = dagger.LinearBetaSchedule(num_rampdown_rounds) + idx_after_rampdown = num_rampdown_rounds + 1 + + # WHEN + betas = [sched(i) for i in range(num_rampdown_rounds + 10)] + + # THEN + assert np.allclose( + betas[:idx_after_rampdown], + np.linspace(1, 0, idx_after_rampdown), + ) + assert np.allclose(betas[idx_after_rampdown:], 0) + + 
+@pytest.mark.parametrize("decay_probability", [0.1, 0.5, 0.9, 1]) +def test_exponential_beta_schedule(decay_probability): + # GIVEN + sched = dagger.ExponentialBetaSchedule(decay_probability) + + # WHEN + betas = [sched(i) for i in range(10)] + + # THEN + assert np.allclose(betas, decay_probability ** np.arange(10)) + + +@pytest.mark.parametrize("decay_probability", [-0.1, 0, 1.1, 2]) +def test_forbidden_decay_probability_on_exp_beta_schedule(decay_probability): + with pytest.raises(ValueError): + dagger.ExponentialBetaSchedule(decay_probability) def test_traj_collector_seed(tmpdir, pendulum_venv, rng): From d81eb68d2359ebb1927f6ebb2ba573f0c7e5745a Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Thu, 9 Feb 2023 02:02:21 +0530 Subject: [PATCH 08/47] Add optuna to dependencies --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 557015d91..867c1b775 100644 --- a/setup.py +++ b/setup.py @@ -210,6 +210,7 @@ def get_local_version(version: "ScmVersion", time_format="%Y%m%d") -> str: "chai-sacred>=0.8.3", "tensorboard>=1.14", "huggingface_sb3>=2.2.1", + "optuna>=3.0.1", ], tests_require=TESTS_REQUIRE, extras_require={ From 27467d38268a2217731f019dc0202ce3a520cf2a Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Thu, 9 Feb 2023 02:22:24 +0530 Subject: [PATCH 09/47] Fix test case --- tests/scripts/test_scripts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/scripts/test_scripts.py b/tests/scripts/test_scripts.py index 78bbca9bd..ad559d2d9 100644 --- a/tests/scripts/test_scripts.py +++ b/tests/scripts/test_scripts.py @@ -910,7 +910,7 @@ def test_parallel_train_adversarial_custom_env(tmpdir): logging=dict(log_root=tmpdir), demonstrations=dict(rollout_path=rollout_path), ), - search_space=dict(command_name="gail"), + search_space=dict(command_name=tune.choice(["gail"])), ) config_updates.update(PARALLEL_CONFIG_LOW_RESOURCE) run = 
parallel.parallel_ex.run(config_updates=config_updates) From 1a3b6b81f70cdfc515dc41a264ae1e81347ac588 Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Thu, 9 Feb 2023 12:04:03 +0530 Subject: [PATCH 10/47] Clean up the scripts --- src/imitation/scripts/analyze.py | 12 +- src/imitation/scripts/config/parallel.py | 219 ++---------------- .../scripts/config/train_adversarial.py | 40 +--- src/imitation/scripts/parallel.py | 39 ++-- 4 files changed, 48 insertions(+), 262 deletions(-) diff --git a/src/imitation/scripts/analyze.py b/src/imitation/scripts/analyze.py index a7b52af36..b7b990800 100644 --- a/src/imitation/scripts/analyze.py +++ b/src/imitation/scripts/analyze.py @@ -167,6 +167,7 @@ def _get_algo_name(sd: sacred_util.SacredDicts) -> str: def _return_summaries(sd: sacred_util.SacredDicts) -> dict: imit_stats = get(sd.run, "result.imit_stats") if imit_stats is None: + # stored in rollout key for preference comparison imit_stats = get(sd.run, "result.rollout") expert_stats = get(sd.run, "result.expert_stats") @@ -234,7 +235,7 @@ def _return_summaries(sd: sacred_util.SacredDicts) -> dict: # verbosity 2 table_verbosity_mapping.append( table_verbosity_mapping[-1] - | {"status", "imit_expert_ratio", "exp_command", "run_name", "seed", ""}, + | {"status", "imit_expert_ratio", "exp_command", "run_name"}, ) @@ -264,14 +265,14 @@ def analyze_imitation( csv_output_path: If provided, then save a CSV output file to this path. tex_output_path: If provided, then save a LaTeX-format table to this path. print_table: If True, then print the dataframe to stdout. - table_verbosity: Increasing levels of verbosity, from 0 to 2, increase the - number of columns in the table. + table_verbosity: Increasing levels of verbosity, from 0 to 3, increase the + number of columns in the table. Level 3 prints all of the columns available. Returns: The DataFrame generated from the Sacred logs. 
""" - if table_verbosity == -1: - table_entry_fns_subset = _get_table_entry_fns_subset(0) + if table_verbosity == 3: + table_entry_fns_subset = _get_table_entry_fns_subset(2) else: table_entry_fns_subset = _get_table_entry_fns_subset(table_verbosity) @@ -279,6 +280,7 @@ def analyze_imitation( for sd in _gather_sacred_dicts(): new_df = pd.DataFrame() if table_verbosity == -1: + # gets all config columns new_df = pd.json_normalize(sd.config) else: new_df = new_df.append({}, ignore_index=True) diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py index 0525641e3..697c5d862 100644 --- a/src/imitation/scripts/config/parallel.py +++ b/src/imitation/scripts/config/parallel.py @@ -6,6 +6,11 @@ Adding custom named configs is necessary because the CLI interface can't add search spaces to the config like `"seed": tune.choice([0, 1, 2, 3])`. + +For tuning hyperparameters of an algorithm on a given environment, override +the `base_named_configs` argument with the named config of the environment. 
+Ex: python -m imitation.scripts.parallel with example_gail \ + 'base_named_configs=["logging.wandb_logging", "seals_half_cheetah"]' """ import numpy as np @@ -13,7 +18,7 @@ import sacred from torch import nn -from imitation.algorithms.dagger import ExponentialBetaSchedule, LinearBetaSchedule +from imitation.algorithms import dagger from imitation.util.util import make_unique_timestamp parallel_ex = sacred.Experiment("parallel") @@ -35,44 +40,11 @@ def config(): local_dir = None # `local_dir` arg for `ray.tune.run` upload_dir = None # `upload_dir` arg for `ray.tune.run` - # n_seeds_start = 0 - # n_seeds = 1 # Number of seeds to search over by default experiment_checkpoint_path = "" eval_best_trial = False eval_trial_seeds = 5 # Number of seeds to search over by default num_samples = 1 # Number of samples per grid search configuration - repeat = 3 - env = "seals_half_cheetah" - wandb_name_prefix = "" - - -# @parallel_ex.config -# def seeds(n_seeds_start, n_seeds): -# search_space = { -# "config_updates": { -# "seed": tune.choice( -# list(range(n_seeds_start, n_seeds_start + n_seeds)), -# ) -# } -# } - - -# @parallel_ex.config -# def wandb(run_name): -# base_config_updates = { -# "logging": { -# "wandb": { -# "wandb_name_prefix": run_name, -# "wandb_kwargs": {"project": "algorithm-benchmark"}, -# }, -# }, -# } -# base_named_configs = ["logging.wandb_logging"] - - -@parallel_ex.named_config -def s3(): - upload_dir = "s3://shwang-chai/private" + repeat = 1 # Debug named configs @@ -137,11 +109,9 @@ def example_cartpole_rl(): def example_rl(): sacred_ex_name = "train_rl" run_name = "rl_tuning" - # n_seeds = 2 - base_named_configs = ["logging.wandb_logging", "seals_half_cheetah"] + base_named_configs = ["logging.wandb_logging"] base_config_updates = {"environment": {"num_vec": 1}} search_space = { - # "named_configs": tune.choice([[env] for env in EASY_ENVS]), "config_updates": { "rl": { "batch_size": tune.choice([512, 1024, 2048, 4096, 8192]), @@ -163,8 +133,8 @@ def 
example_rl(): @parallel_ex.named_config def example_bc(): sacred_ex_name = "train_imitation" - run_name = "bc_tuning_hc" - base_named_configs = ["logging.wandb_logging", "seals_half_cheetah"] + run_name = "bc_tuning" + base_named_configs = ["logging.wandb_logging"] base_config_updates = {"environment": {"num_vec": 1}} search_space = { "config_updates": { @@ -191,8 +161,8 @@ def example_bc(): @parallel_ex.named_config def example_dagger(): sacred_ex_name = "train_imitation" - run_name = "dagger_tuning_hc" - base_named_configs = ["logging.wandb_logging", "seals_half_cheetah"] + run_name = "dagger_tuning" + base_named_configs = ["logging.wandb_logging"] base_config_updates = { "environment": {"num_vec": 1}, "dagger": {"total_timesteps": 1e5}, @@ -209,8 +179,8 @@ def example_dagger(): ), "dagger": dict( beta_schedule=tune.choice( - [LinearBetaSchedule(i) for i in [1, 5, 15]] - + [ExponentialBetaSchedule(i) for i in [0.3, 0.5, 0.7]], + [dagger.LinearBetaSchedule(i) for i in [1, 5, 15]] + + [dagger.ExponentialBetaSchedule(i) for i in [0.3, 0.5, 0.7]], ), rollout_round_min_episodes=tune.choice([3, 5, 10]), ), @@ -234,14 +204,10 @@ def example_gail(): "total_timesteps": 1e7, } search_space = { - # "named_configs": tune.choice([[env] for env in MY_ENVS]), "config_updates": { "algorithm_kwargs": dict( demo_batch_size=tune.choice([32, 128, 512, 2048, 8192]), n_disc_updates_per_round=tune.choice([8, 16]), - # both are same as rl.batch_size - # gen_replay_buffer_capacity=tune.choice([512, 1024]), - # gen_train_timesteps=0, ), "rl": { "batch_size": tune.choice([4096, 8192, 16384]), @@ -258,29 +224,23 @@ def example_gail(): eval_best_trial = True eval_trial_seeds = 5 repeat = 3 - # experiment_checkpoint_path = f"/home/taufeeque/ray_results/{run_name}" resources_per_trial = dict(cpu=1) @parallel_ex.named_config def example_airl(): sacred_ex_name = "train_adversarial" - run_name = "airl_tuning_hc" - # n_seeds = 1 + run_name = "airl_tuning" base_named_configs = 
["logging.wandb_logging"] base_config_updates = { "environment": {"num_vec": 1}, "total_timesteps": 1e7, } search_space = { - # "named_configs": tune.choice([[env] for env in MY_ENVS]), "config_updates": { "algorithm_kwargs": dict( demo_batch_size=tune.choice([32, 128, 512, 2048, 8192]), n_disc_updates_per_round=tune.choice([8, 16]), - # both are same as rl.batch_size - # gen_replay_buffer_capacity=tune.choice([512, 1024]), - # gen_train_timesteps=0, ), "rl": { "batch_size": tune.choice([4096, 8192, 16384]), @@ -297,7 +257,6 @@ def example_airl(): eval_best_trial = True eval_trial_seeds = 5 repeat = 3 - # experiment_checkpoint_path = f"/home/taufeeque/ray_results/{run_name}" resources_per_trial = dict(cpu=1) @@ -305,7 +264,7 @@ def example_airl(): def example_pc(): sacred_ex_name = "train_preference_comparisons" run_name = "pc_tuning" - base_named_configs = ["logging.wandb_logging", "seals_half_cheetah"] + base_named_configs = ["logging.wandb_logging"] base_config_updates = { "environment": {"num_vec": 1}, "total_timesteps": 2e7, @@ -317,8 +276,6 @@ def example_pc(): "named_configs": tune.choice( [ ["reward.normalize_output_disable"], - # ["reward.normalize_output_running"], - # ["reward.normalize_output_ema"], ], ), "config_updates": { @@ -327,19 +284,15 @@ def example_pc(): "activation_fn": tune.choice( [ nn.ReLU, - # nn.Tanh, ], ), }, }, "num_iterations": tune.choice([25, 50]), - # "initial_comparison_frac": tune.choice([0.1, 0.25]), - # "reward_trainer_kwargs": { - # "epochs": tune.choice([1, 3, 6]), - # }, - # "query_schedule": tune.choice( - # ["constant", "hyperbolic", "inverse_quadratic"], - # ), + "initial_comparison_frac": tune.choice([0.1, 0.25]), + "reward_trainer_kwargs": { + "epochs": tune.choice([1, 3, 6]), + }, "rl": { "batch_size": tune.choice([512, 2048, 8192]), "rl_kwargs": { @@ -349,138 +302,8 @@ def example_pc(): }, }, } - num_samples = 24 + num_samples = 100 eval_best_trial = True eval_trial_seeds = 5 repeat = 3 resources_per_trial = 
dict(cpu=1) - - -@parallel_ex.named_config -def debug_eval(): - sacred_ex_name = "train_preference_comparisons" - run_name = "debug_eval" - eval_trial_seeds = 2 - eval_best_trial = True - # base_named_configs = ["seals_half_cheetah"] - base_config_updates = { - "total_timesteps": 30, - "total_comparisons": 10, - # "query_schedule": "hyperbolic", - "num_iterations": 1, - "fragment_length": 2, - } - search_space = { - # "named_configs": tune.choice([[env] for env in MY_ENVS]), - "config_updates": { - # "num_iterations": tune.choice([5, 20, 50]), - "initial_comparison_frac": tune.choice([0.1, 0.2]), - # "reward_trainer_kwargs": { - # "epochs": tune.choice([1, 2, 3]), - # }, - # "query_schedule": tune.choice( - # ["constant", "hyperbolic", "inverse_quadratic"], - # ), - }, - } - resources_per_trial = dict(cpu=1) - - -@parallel_ex.named_config -def debug_eval_adv(): - sacred_ex_name = "train_adversarial" - run_name = "airl_tuning_debug" - # n_seeds = 5 - base_named_configs = [] - eval_best_trial = True - eval_trial_seeds = 2 - base_config_updates = { - "total_timesteps": 2048, - } - search_space = { - # "named_configs": tune.choice([[env] for env in MY_ENVS]), - "config_updates": { - "algorithm_kwargs": dict( - # demo_batch_size=tune.choice([32, 128, 512, 2048, 8192]), - n_disc_updates_per_round=tune.choice([1, 2]), - # both are same as rl.batch_size - # gen_replay_buffer_capacity=tune.choice([512, 1024]), - # gen_train_timesteps=0, - ), - "rl": { - "batch_size": 8, - # "rl_kwargs": { - # "ent_coef": tune.choice([0, 1e-3, 1e-1]), - # "learning_rate": tune.loguniform(1e-5, 5e-3), - # }, - }, - "algorithm_specific": dict(demo_batch_size=1), - }, - "command_name": "airl", - } - num_samples = 2 - repeat = 2 - resources_per_trial = dict(cpu=8) - - -@parallel_ex.named_config -def debug_airl(): - sacred_ex_name = "train_adversarial" - run_name = "airl_debug" - # n_seeds = 1 - base_named_configs = ["logging.wandb_logging", "seals_walker"] - base_config_updates = { - 
"environment": {"num_vec": 8}, - "total_timesteps": 1e7, - } - search_space = { - # "named_configs": tune.choice([[env] for env in MY_ENVS]), - "config_updates": { - "train": { - "policy_kwargs": { - "activation_fn": tune.choice( - [ - nn.ReLU, - # nn.Tanh, - ], - ), - }, - }, - "algorithm_kwargs": dict( - demo_batch_size=tune.choice([32]), - n_disc_updates_per_round=tune.choice([10]), - # both are same as rl.batch_size - # gen_replay_buffer_capacity=tune.choice([512, 1024]), - # gen_train_timesteps=0, - ), - "rl": { - "batch_size": tune.choice([10000]), - "rl_kwargs": { - "ent_coef": tune.choice([0.1]), - "learning_rate": tune.choice([1e-4]), - }, - }, - "algorithm_specific": {}, - }, - "command_name": "airl", - } - num_samples = 1 - eval_best_trial = False - # eval_trial_seeds = 5 - repeat = 5 - # experiment_checkpoint_path = f"/home/taufeeque/ray_results/{run_name}" - resources_per_trial = dict(cpu=8) - - -# @parallel_ex.config_hook -# def config_hook(config, command_name, logger): -# """Sets env.""" -# del command_name, logger -# res = {} -# print(config) -# if config["env"]: -# res["base_named_configs"] = tuple( -# config["base_named_configs"] + [config["env"]] -# ) -# print(res) -# return res diff --git a/src/imitation/scripts/config/train_adversarial.py b/src/imitation/scripts/config/train_adversarial.py index bd9df6287..fb26c99c6 100644 --- a/src/imitation/scripts/config/train_adversarial.py +++ b/src/imitation/scripts/config/train_adversarial.py @@ -99,8 +99,8 @@ def pendulum(): @train_adversarial_ex.named_config def seals_ant(): - # locals().update(**MUJOCO_SHARED_LOCALS) - # locals().update(**ANT_SHARED_LOCALS) + locals().update(**MUJOCO_SHARED_LOCALS) + locals().update(**ANT_SHARED_LOCALS) environment = dict(gym_id="seals/Ant-v0") demonstrations = dict(rollout_type="ppo-huggingface") rl = dict( @@ -173,21 +173,6 @@ def seals_half_cheetah(): vf_coef=0.11483689492120866, ), ) - # algorithm_specific = dict( - # airl=dict(total_timesteps=int(5e6)), - # 
gail=dict(total_timesteps=int(8e6)), - # ) - # reward = dict( - # algorithm_specific=dict( - # airl=dict( - # net_cls=reward_nets.BasicShapedRewardNet, - # net_kwargs=dict( - # reward_hid_sizes=(32,), - # potential_hid_sizes=(32,), - # ), - # ), - # ), - # ) algorithm_kwargs = dict( # Number of discriminator updates after each round of generator updates n_disc_updates_per_round=16, @@ -257,7 +242,7 @@ def seals_swimmer(): @train_adversarial_ex.named_config def seals_walker(): - # locals().update(**MUJOCO_SHARED_LOCALS) + locals().update(**MUJOCO_SHARED_LOCALS) environment = dict(gym_id="seals/Walker2d-v0") demonstrations = dict(rollout_type="ppo-huggingface") train = dict( @@ -311,22 +296,3 @@ def fast(): demo_batch_size=1, n_disc_updates_per_round=4, ) - - -@train_adversarial_ex.named_config -def debug_nans(): - environment = {"wandb": {"wandb_kwargs": {"project": "algorithm-benchmark"}}} - total_timesteps = 1e7 - algorithm_kwargs = dict( - demo_batch_size=128, - n_disc_updates_per_round=8, - # both are same as rl.batch_size - # gen_replay_buffer_capacity=tune.choice([512, 1024]), - # gen_train_timesteps=0, - ) - rl = { - "batch_size": 4096, - "rl_kwargs": {"ent_coef": 0.1, "learning_rate": 7.316377404994506e-05}, - } - seed = 0 - checkpoint_interval = 1 diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index 3e713777e..9ee8e6ee9 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -33,7 +33,7 @@ def parallel( upload_dir: Optional[str], repeat: int = 1, eval_best_trial: bool = False, - eval_best_trial_resource_multiplier: int = 2, + eval_best_trial_resource_multiplier: int = 1, eval_trial_seeds: int = 5, experiment_checkpoint_path: str = "", syncer=None, @@ -54,7 +54,8 @@ def parallel( under the 'experiment.name' key. This is equivalent to using the Sacred CLI '--name' option on the inner experiment. Offline analysis jobs can use this argument to group similar data. 
- num_samples: Number of times to sample from the hyperparameter space. + num_samples: Number of times to sample from the hyperparameter space without + considering repetition using `repeat`. search_space: A dictionary which can contain Ray Tune search objects like `ray.tune.grid_search` and `ray.tune.sample_from`, and is passed as the `config` argument to `ray.tune.run()`. After the @@ -79,12 +80,12 @@ def parallel( upload_dir: `upload_dir` argument to `ray.tune.run()`. repeat: Number of runs to repeat each trial for. eval_best_trial: Whether to evaluate the trial with the best mean return - at the end of tuning on a different set of seeds. + at the end of tuning on a separate set of seeds. eval_best_trial_resource_multiplier: factor by which to multiply the number of cpus per trial in `resources_per_trial`. eval_trial_seeds: Number of distinct seeds to evaluate the best trial on. experiment_checkpoint_path: Path containing the checkpoints of a previous - experiment. ran using this script. Useful for resuming cancelled trials + experiment ran using this script. Useful for resuming cancelled trials of the experiments (using `resume`) or evaluating the best trial of the experiment (using `eval_best_trial`). 
resume: If true and `experiment_checkpoint_path` is given, then resumes the @@ -159,6 +160,7 @@ def parallel( result.trials = None result.fetch_trial_dataframes() else: + # run hyperparameter tuning result = ray.tune.run( trainable, config=search_space, @@ -174,15 +176,14 @@ def parallel( metric="mean_return", mode="max", ) - - key_prefix = ( - "rollout/" - if sacred_ex_name == "train_preference_comparisons" - else "" - if sacred_ex_name == "train_rl" - else "imit_stats/" - ) + if sacred_ex_name == "train_rl": + key_prefix = "" + elif sacred_ex_name == "train_preference_comparisons": + key_prefix = "rollout/" + else: + key_prefix = "imit_stats/" key = key_prefix + "monitor_return_mean" + if eval_best_trial: df = result.results_df df = df[df["config/named_configs"].notna()] @@ -230,8 +231,9 @@ def parallel( resources_per_trial=resources_per_trial, ) returns = eval_result.results_df["mean_return"].to_numpy() - print("Returns:", returns) - print(np.mean(returns), np.std(returns)) + print("All returns:", returns) + print("Mean:", np.mean(returns)) + print("Std:", np.std(returns)) finally: ray.shutdown() @@ -333,14 +335,7 @@ def inner(config: Dict[str, Any], reporter) -> Mapping[str, Any]: ) # Ray Tune has a string formatting error if raylet completes without # any calls to `reporter`. 
- # reporter(done=True) - # if sacred_ex_name == "train_preference_comparisons": - # #reporter(mean_return=run.result["rollout"]["monitor_return_mean"]) - # #ray.tune.report(mean_return=run.result["rollout"]["monitor_return_mean"]) - # ray.tune.report(mean_return=234) - # else: - # # reporter(mean_return=run.result["imit_stats"]["monitor_return_mean"]) - # ray.tune.report(mean_return=run.result["imit_stats"]["monitor_return_mean"]) + reporter(done=True) assert run.status == "COMPLETED" return run.result From 7a438da0f5421f0d98fdb4db9747a8af10d26297 Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Thu, 9 Feb 2023 19:53:14 +0530 Subject: [PATCH 11/47] Remove reporter(done) since mean_return is reported by the runs --- src/imitation/scripts/parallel.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index 9ee8e6ee9..2dd2254bf 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -333,9 +333,6 @@ def inner(config: Dict[str, Any], reporter) -> Mapping[str, Any]: **updated_run_kwargs, options={"--run": run_name, "--file_storage": "sacred"}, ) - # Ray Tune has a string formatting error if raylet completes without - # any calls to `reporter`. 
- reporter(done=True) assert run.status == "COMPLETED" return run.result From 2e56de8eb97713b88ada09564369214f5e4fa661 Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Thu, 23 Feb 2023 23:53:12 +0530 Subject: [PATCH 12/47] Add beta_schedule parameter to dagger script --- src/imitation/scripts/train_imitation.py | 1 + src/imitation/scripts/train_preference_comparisons.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/src/imitation/scripts/train_imitation.py b/src/imitation/scripts/train_imitation.py index e607339b4..56633e33a 100644 --- a/src/imitation/scripts/train_imitation.py +++ b/src/imitation/scripts/train_imitation.py @@ -119,6 +119,7 @@ def dagger( expert_policy=expert_policy, custom_logger=custom_logger, bc_trainer=bc_trainer, + beta_schedule=dagger["beta_schedule"], rng=_rnd, ) diff --git a/src/imitation/scripts/train_preference_comparisons.py b/src/imitation/scripts/train_preference_comparisons.py index 3d4fb4e33..4030317c4 100644 --- a/src/imitation/scripts/train_preference_comparisons.py +++ b/src/imitation/scripts/train_preference_comparisons.py @@ -280,7 +280,6 @@ def save_callback(iteration_num): results = dict(results) results["rollout"] = policy_evaluation.eval_policy(agent, venv) results["mean_return"] = results["rollout"]["monitor_return_mean"] - if save_preferences: main_trainer.dataset.save(log_dir / "preferences.pkl") From 73d8576fc893868c68442b657bd25aaffb7df9bf Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Fri, 17 Mar 2023 03:37:15 +0530 Subject: [PATCH 13/47] Update config policy kwargs --- src/imitation/scripts/config/train_adversarial.py | 6 +++--- .../scripts/config/train_preference_comparisons.py | 13 +++---------- src/imitation/scripts/config/train_rl.py | 12 ++++++------ 3 files changed, 12 insertions(+), 19 deletions(-) diff --git a/src/imitation/scripts/config/train_adversarial.py b/src/imitation/scripts/config/train_adversarial.py index 08b92fe9c..7989f3eab 100644 
--- a/src/imitation/scripts/config/train_adversarial.py +++ b/src/imitation/scripts/config/train_adversarial.py @@ -187,7 +187,7 @@ def seals_hopper(): # locals().update(**MUJOCO_SHARED_LOCALS) environment = dict(gym_id="seals/Hopper-v0") demonstrations = dict(rollout_type="ppo-huggingface") - train = dict( + policy = dict( policy_cls="MlpPolicy", policy_kwargs=dict( activation_fn=nn.ReLU, @@ -216,7 +216,7 @@ def seals_swimmer(): environment = dict(gym_id="seals/Swimmer-v0") total_timesteps = int(2e6) demonstrations = dict(rollout_type="ppo-huggingface") - train = dict( + policy = dict( policy_cls="MlpPolicy", policy_kwargs=dict( activation_fn=nn.ReLU, @@ -245,7 +245,7 @@ def seals_walker(): locals().update(**MUJOCO_SHARED_LOCALS) environment = dict(gym_id="seals/Walker2d-v0") demonstrations = dict(rollout_type="ppo-huggingface") - train = dict( + policy = dict( policy_cls="MlpPolicy", policy_kwargs=dict( activation_fn=nn.ReLU, diff --git a/src/imitation/scripts/config/train_preference_comparisons.py b/src/imitation/scripts/config/train_preference_comparisons.py index 236edad47..1a039c762 100644 --- a/src/imitation/scripts/config/train_preference_comparisons.py +++ b/src/imitation/scripts/config/train_preference_comparisons.py @@ -120,20 +120,13 @@ def seals_half_cheetah(): ) num_iterations = 50 total_timesteps = 20000000 - # train = dict( - # policy_cls="MlpPolicy", - # policy_kwargs=dict( - # activation_fn=nn.ReLU, - # # net_arch=[dict(pi=[64, 64], vf=[64, 64])], - # ), - # ) @train_preference_comparisons_ex.named_config def seals_hopper(): # locals().update(**MUJOCO_SHARED_LOCALS) environment = dict(gym_id="seals/Hopper-v0") - train = dict( + policy = dict( policy_cls="MlpPolicy", policy_kwargs=dict( activation_fn=nn.ReLU, @@ -160,7 +153,7 @@ def seals_hopper(): def seals_swimmer(): # locals().update(**MUJOCO_SHARED_LOCALS) environment = dict(gym_id="seals/Swimmer-v0") - train = dict( + policy = dict( policy_cls="MlpPolicy", policy_kwargs=dict( 
activation_fn=nn.ReLU, @@ -188,7 +181,7 @@ def seals_swimmer(): def seals_walker(): # locals().update(**MUJOCO_SHARED_LOCALS) environment = dict(gym_id="seals/Walker2d-v0") - train = dict( + policy = dict( policy_cls="MlpPolicy", policy_kwargs=dict( activation_fn=nn.ReLU, diff --git a/src/imitation/scripts/config/train_rl.py b/src/imitation/scripts/config/train_rl.py index 34b45250c..a5475540d 100644 --- a/src/imitation/scripts/config/train_rl.py +++ b/src/imitation/scripts/config/train_rl.py @@ -74,7 +74,7 @@ def cartpole(): def seals_cartpole(): environment = dict(gym_id="seals/CartPole-v0", num_vec=8) total_timesteps = int(1e5) - train = dict( + policy = dict( policy_cls="MlpPolicy", policy_kwargs=dict( activation_fn=nn.ReLU, @@ -111,7 +111,7 @@ def seals_half_cheetah(): num_vec=1, ) - train = dict( + policy = dict( policy_cls="MlpPolicy", policy_kwargs=dict( activation_fn=nn.Tanh, @@ -141,7 +141,7 @@ def seals_half_cheetah(): @train_rl_ex.named_config def seals_hopper(): environment = dict(gym_id="seals/Hopper-v0", num_vec=1) - train = dict( + policy = dict( policy_cls="MlpPolicy", policy_kwargs=dict( activation_fn=nn.ReLU, @@ -211,7 +211,7 @@ def seals_ant(): num_vec=1, ) - train = dict( + policy = dict( policy_cls="MlpPolicy", policy_kwargs=dict( activation_fn=nn.Tanh, @@ -242,7 +242,7 @@ def seals_ant(): @train_rl_ex.named_config def seals_swimmer(): environment = dict(gym_id="seals/Swimmer-v0", num_vec=1) - train = dict( + policy = dict( policy_cls="MlpPolicy", policy_kwargs=dict( activation_fn=nn.ReLU, @@ -273,7 +273,7 @@ def seals_swimmer(): @train_rl_ex.named_config def seals_walker(): environment = dict(gym_id="seals/Walker2d-v0", num_vec=1) - train = dict( + policy = dict( policy_cls="MlpPolicy", policy_kwargs=dict( activation_fn=nn.ReLU, From 9fdf8786663473334f94b24a841a832b29da435f Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Tue, 16 May 2023 19:00:32 +0530 Subject: [PATCH 14/47] Changes from review --- 
src/imitation/scripts/config/parallel.py | 16 ++++++++-------- .../scripts/config/train_adversarial.py | 4 ---- .../config/train_preference_comparisons.py | 6 ------ src/imitation/scripts/parallel.py | 18 +++++++----------- src/imitation/scripts/train_imitation.py | 1 + 5 files changed, 16 insertions(+), 29 deletions(-) diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py index ea90f11b8..b52446154 100644 --- a/src/imitation/scripts/config/parallel.py +++ b/src/imitation/scripts/config/parallel.py @@ -102,9 +102,6 @@ def example_cartpole_rl(): resources_per_trial = dict(cpu=4) -EASY_ENVS = ["cartpole", "pendulum", "mountain_car"] - - @parallel_ex.named_config def example_rl(): sacred_ex_name = "train_rl" @@ -135,18 +132,21 @@ def example_bc(): sacred_ex_name = "train_imitation" run_name = "bc_tuning" base_named_configs = ["logging.wandb_logging"] - base_config_updates = {"environment": {"num_vec": 1}} + base_config_updates = { + "environment": {"num_vec": 1}, + "demonstrations": {"rollout_type": "ppo-huggingface"}, + } search_space = { "config_updates": { - "bc_kwargs": dict( + "bc": dict( batch_size=tune.choice([8, 16, 32, 64]), l2_weight=tune.loguniform(1e-6, 1e-2), # L2 regularization weight optimizer_kwargs=dict( lr=tune.loguniform(1e-5, 1e-2), ), - ), - "bc_train_kwargs": dict( - n_epochs=tune.choice([1, 5, 10, 20]), + train_kwargs=dict( + n_epochs=tune.choice([1, 5, 10, 20]), + ), ), }, "command_name": "bc", diff --git a/src/imitation/scripts/config/train_adversarial.py b/src/imitation/scripts/config/train_adversarial.py index 7989f3eab..ef675eab6 100644 --- a/src/imitation/scripts/config/train_adversarial.py +++ b/src/imitation/scripts/config/train_adversarial.py @@ -156,7 +156,6 @@ def half_cheetah(): @train_adversarial_ex.named_config def seals_half_cheetah(): - # locals().update(**MUJOCO_SHARED_LOCALS) environment = dict(gym_id="seals/HalfCheetah-v0") demonstrations = dict(rollout_type="ppo-huggingface") rl = 
dict( @@ -184,7 +183,6 @@ def seals_half_cheetah(): @train_adversarial_ex.named_config def seals_hopper(): - # locals().update(**MUJOCO_SHARED_LOCALS) environment = dict(gym_id="seals/Hopper-v0") demonstrations = dict(rollout_type="ppo-huggingface") policy = dict( @@ -212,7 +210,6 @@ def seals_hopper(): @train_adversarial_ex.named_config def seals_swimmer(): - # locals().update(**MUJOCO_SHARED_LOCALS) environment = dict(gym_id="seals/Swimmer-v0") total_timesteps = int(2e6) demonstrations = dict(rollout_type="ppo-huggingface") @@ -242,7 +239,6 @@ def seals_swimmer(): @train_adversarial_ex.named_config def seals_walker(): - locals().update(**MUJOCO_SHARED_LOCALS) environment = dict(gym_id="seals/Walker2d-v0") demonstrations = dict(rollout_type="ppo-huggingface") policy = dict( diff --git a/src/imitation/scripts/config/train_preference_comparisons.py b/src/imitation/scripts/config/train_preference_comparisons.py index 1a039c762..4fe9c793e 100644 --- a/src/imitation/scripts/config/train_preference_comparisons.py +++ b/src/imitation/scripts/config/train_preference_comparisons.py @@ -73,8 +73,6 @@ def cartpole(): @train_preference_comparisons_ex.named_config def seals_ant(): - # locals().update(**MUJOCO_SHARED_LOCALS) - # locals().update(**ANT_SHARED_LOCALS) environment = dict(gym_id="seals/Ant-v0") rl = dict( batch_size=2048, @@ -102,7 +100,6 @@ def half_cheetah(): @train_preference_comparisons_ex.named_config def seals_half_cheetah(): - # locals().update(**MUJOCO_SHARED_LOCALS) environment = dict(gym_id="seals/HalfCheetah-v0") rl = dict( batch_size=512, @@ -124,7 +121,6 @@ def seals_half_cheetah(): @train_preference_comparisons_ex.named_config def seals_hopper(): - # locals().update(**MUJOCO_SHARED_LOCALS) environment = dict(gym_id="seals/Hopper-v0") policy = dict( policy_cls="MlpPolicy", @@ -151,7 +147,6 @@ def seals_hopper(): @train_preference_comparisons_ex.named_config def seals_swimmer(): - # locals().update(**MUJOCO_SHARED_LOCALS) environment = 
dict(gym_id="seals/Swimmer-v0") policy = dict( policy_cls="MlpPolicy", @@ -179,7 +174,6 @@ def seals_swimmer(): @train_preference_comparisons_ex.named_config def seals_walker(): - # locals().update(**MUJOCO_SHARED_LOCALS) environment = dict(gym_id="seals/Walker2d-v0") policy = dict( policy_cls="MlpPolicy", diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index 2dd2254bf..53b4c2b32 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -139,7 +139,7 @@ def parallel( syncer=syncer, ), metric="mean_return", - resume=resume, + resume=True, ) print( "Live trials:", @@ -176,14 +176,7 @@ def parallel( metric="mean_return", mode="max", ) - if sacred_ex_name == "train_rl": - key_prefix = "" - elif sacred_ex_name == "train_preference_comparisons": - key_prefix = "rollout/" - else: - key_prefix = "imit_stats/" - key = key_prefix + "monitor_return_mean" - + key = "mean_return" if eval_best_trial: df = result.results_df df = df[df["config/named_configs"].notna()] @@ -199,7 +192,7 @@ def parallel( # store mean return of runs across all seeds in a group df["mean_return"] = grps[key].transform(lambda x: x.mean()) best_config_df = df[df["mean_return"] == df["mean_return"].max()] - row = best_config_df.loc[0] + row = best_config_df.iloc[0] best_config_tag = row["experiment_tag"] if result.trials is not None: trial = [ @@ -215,7 +208,10 @@ def parallel( # update cpus per trial only if it is provided in `resources_per_trial` # Uses the default values (cpu=1) if it is not provided if "cpu" in resources_per_trial: - resources_per_trial["cpu"] *= eval_best_trial_resource_multiplier + resources_per_trial_eval = copy.deepcopy(resources_per_trial) + resources_per_trial_eval[ + "cpu" + ] *= eval_best_trial_resource_multiplier best_config["config_updates"].update( environment=dict(num_vec=resources_per_trial["cpu"]), ) diff --git a/src/imitation/scripts/train_imitation.py b/src/imitation/scripts/train_imitation.py index 
56633e33a..5a6925eb3 100644 --- a/src/imitation/scripts/train_imitation.py +++ b/src/imitation/scripts/train_imitation.py @@ -76,6 +76,7 @@ def bc( expert_stats = _try_computing_expert_stats(expert_trajs) if expert_stats is not None: stats["expert_stats"] = expert_stats + stats["mean_return"] = imit_stats["monitor_return_mean"] return stats From 1c1dbc44970016fd5ef6bb965cf69afbf33590a1 Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Tue, 16 May 2023 21:43:43 +0530 Subject: [PATCH 15/47] Fix errors with some configs --- src/imitation/scripts/config/parallel.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py index b52446154..095c67107 100644 --- a/src/imitation/scripts/config/parallel.py +++ b/src/imitation/scripts/config/parallel.py @@ -165,8 +165,9 @@ def example_dagger(): base_named_configs = ["logging.wandb_logging"] base_config_updates = { "environment": {"num_vec": 1}, + "demonstrations": {"rollout_type": "ppo-huggingface"}, "dagger": {"total_timesteps": 1e5}, - "bc_kwargs": { + "bc": { "batch_size": 16, "l2_weight": 1e-4, "optimizer_kwargs": {"lr": 1e-3}, @@ -174,8 +175,10 @@ def example_dagger(): } search_space = { "config_updates": { - "bc_train_kwargs": dict( - n_epochs=tune.choice([1, 5, 10]), + "bc": dict( + train_kwargs=dict( + n_epochs=tune.choice([1, 5, 10]), + ), ), "dagger": dict( beta_schedule=tune.choice( @@ -201,6 +204,7 @@ def example_gail(): base_named_configs = ["logging.wandb_logging"] base_config_updates = { "environment": {"num_vec": 1}, + "demonstrations": {"rollout_type": "ppo-huggingface"}, "total_timesteps": 1e7, } search_space = { @@ -234,6 +238,7 @@ def example_airl(): base_named_configs = ["logging.wandb_logging"] base_config_updates = { "environment": {"num_vec": 1}, + "demonstrations": {"rollout_type": "ppo-huggingface"}, "total_timesteps": 1e7, } search_space = { @@ -273,11 +278,9 @@ def 
example_pc(): "gatherer_kwargs": {"sample": True}, } search_space = { - "named_configs": tune.choice( - [ - ["reward.normalize_output_disable"], - ], - ), + "named_configs": [ + ["reward.normalize_output_disable"], + ], "config_updates": { "train": { "policy_kwargs": { From 44c4e97d64980118b3a07f06f7c15edb273a16a1 Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Wed, 14 Jun 2023 06:38:42 +0530 Subject: [PATCH 16/47] Updates based on review --- src/imitation/scripts/analyze.py | 29 ++++++++++--------- src/imitation/scripts/parallel.py | 26 ++++++++++++----- src/imitation/scripts/train_adversarial.py | 1 - src/imitation/scripts/train_imitation.py | 1 - .../scripts/train_preference_comparisons.py | 3 +- src/imitation/scripts/train_rl.py | 1 - 6 files changed, 35 insertions(+), 26 deletions(-) diff --git a/src/imitation/scripts/analyze.py b/src/imitation/scripts/analyze.py index f036efe40..8977fed47 100644 --- a/src/imitation/scripts/analyze.py +++ b/src/imitation/scripts/analyze.py @@ -272,40 +272,43 @@ def analyze_imitation( The DataFrame generated from the Sacred logs. """ if table_verbosity == 3: + # Get column names for which we have get value using make_entry_fn + # These are same across Level 2 & 3. In Level 3, we additionally add remaining + # config columns. 
table_entry_fns_subset = _get_table_entry_fns_subset(2) else: table_entry_fns_subset = _get_table_entry_fns_subset(table_verbosity) - df = pd.DataFrame() + output_table = pd.DataFrame() for sd in _gather_sacred_dicts(): - new_df = pd.DataFrame() - if table_verbosity == -1: + if table_verbosity == 3: # gets all config columns - new_df = pd.json_normalize(sd.config) + row = pd.json_normalize(sd.config) else: - new_df = new_df.append({}, ignore_index=True) + # create an empty dataframe with a single row + row = pd.DataFrame(index=[0]) for col_name, make_entry_fn in table_entry_fns_subset.items(): - new_df[col_name] = make_entry_fn(sd) + row[col_name] = make_entry_fn(sd) - df = pd.concat([df, new_df]) + output_table = pd.concat([output_table, row]) - if len(df) > 0: - df.sort_values(by=["algo", "env_name"], inplace=True) + if len(output_table) > 0: + output_table.sort_values(by=["algo", "env_name"], inplace=True) display_options: Mapping[str, Any] = dict(index=False) if csv_output_path is not None: - df.to_csv(csv_output_path, **display_options) + output_table.to_csv(csv_output_path, **display_options) print(f"Wrote CSV file to {csv_output_path}") if tex_output_path is not None: - s: str = df.to_latex(**display_options) + s: str = output_table.to_latex(**display_options) with open(tex_output_path, "w") as f: f.write(s) print(f"Wrote TeX file to {tex_output_path}") if print_table: - print(df.to_string(**display_options)) - return df + print(output_table.to_string(**display_options)) + return output_table def _make_return_summary(stats: dict, prefix="") -> str: diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index 53b4c2b32..2bb0129cb 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -127,6 +127,12 @@ def parallel( ray.init(**init_kwargs) search_alg = search.Repeater(optuna.OptunaSearch(), repeat=repeat) + + if sacred_ex_name == "train_rl": + return_key = "monitor_return_mean" + else: + return_key 
= "imit_stats/monitor_return_mean" + try: if experiment_checkpoint_path: if resume: @@ -173,10 +179,9 @@ def parallel( syncer=syncer, ), search_alg=search_alg, - metric="mean_return", + metric=return_key, mode="max", ) - key = "mean_return" if eval_best_trial: df = result.results_df df = df[df["config/named_configs"].notna()] @@ -190,7 +195,7 @@ def parallel( ] grps = df.groupby(grp_keys) # store mean return of runs across all seeds in a group - df["mean_return"] = grps[key].transform(lambda x: x.mean()) + df["mean_return"] = grps[return_key].transform(lambda x: x.mean()) best_config_df = df[df["mean_return"] == df["mean_return"].max()] row = best_config_df.iloc[0] best_config_tag = row["experiment_tag"] @@ -200,20 +205,25 @@ def parallel( ][0] best_config = trial.config print("Mean return:", row["mean_return"]) - print("All returns:", df[df["mean_return"] == row["mean_return"]][key]) + print( + "All returns:", + df[df["mean_return"] == row["mean_return"]][return_key], + ) print("Total seeds:", (df["mean_return"] == row["mean_return"]).sum()) best_config["config_updates"].update( seed=ray.tune.grid_search(list(range(100, 100 + eval_trial_seeds))), ) + + resources_per_trial_eval = copy.deepcopy(resources_per_trial) # update cpus per trial only if it is provided in `resources_per_trial` # Uses the default values (cpu=1) if it is not provided if "cpu" in resources_per_trial: - resources_per_trial_eval = copy.deepcopy(resources_per_trial) + resources_per_trial_eval[ "cpu" ] *= eval_best_trial_resource_multiplier best_config["config_updates"].update( - environment=dict(num_vec=resources_per_trial["cpu"]), + environment=dict(num_vec=resources_per_trial_eval["cpu"]), ) eval_result = ray.tune.run( @@ -224,9 +234,9 @@ def parallel( "command_name": best_config.get("command_name", None), }, name=run_name + "_best_hp_eval", - resources_per_trial=resources_per_trial, + resources_per_trial=resources_per_trial_eval, ) - returns = eval_result.results_df["mean_return"].to_numpy() + 
returns = eval_result.results_df[return_key].to_numpy() print("All returns:", returns) print("Mean:", np.mean(returns)) print("Std:", np.std(returns)) diff --git a/src/imitation/scripts/train_adversarial.py b/src/imitation/scripts/train_adversarial.py index d1f99a54b..26c8d7bcf 100644 --- a/src/imitation/scripts/train_adversarial.py +++ b/src/imitation/scripts/train_adversarial.py @@ -167,7 +167,6 @@ def callback(round_num: int, /) -> None: return { "imit_stats": imit_stats, "expert_stats": rollout.rollout_stats(expert_trajs), - "mean_return": imit_stats["monitor_return_mean"], } diff --git a/src/imitation/scripts/train_imitation.py b/src/imitation/scripts/train_imitation.py index 5a6925eb3..56633e33a 100644 --- a/src/imitation/scripts/train_imitation.py +++ b/src/imitation/scripts/train_imitation.py @@ -76,7 +76,6 @@ def bc( expert_stats = _try_computing_expert_stats(expert_trajs) if expert_stats is not None: stats["expert_stats"] = expert_stats - stats["mean_return"] = imit_stats["monitor_return_mean"] return stats diff --git a/src/imitation/scripts/train_preference_comparisons.py b/src/imitation/scripts/train_preference_comparisons.py index b054a5a6c..867a666a4 100644 --- a/src/imitation/scripts/train_preference_comparisons.py +++ b/src/imitation/scripts/train_preference_comparisons.py @@ -280,8 +280,7 @@ def save_callback(iteration_num): # Storing and evaluating policy only useful if we generated trajectory data if bool(trajectory_path is None): results = dict(results) - results["rollout"] = policy_evaluation.eval_policy(agent, venv) - results["mean_return"] = results["rollout"]["monitor_return_mean"] + results["imit_stats"] = policy_evaluation.eval_policy(agent, venv) if save_preferences: main_trainer.dataset.save(log_dir / "preferences.pkl") diff --git a/src/imitation/scripts/train_rl.py b/src/imitation/scripts/train_rl.py index 20a7b263c..6780a557b 100644 --- a/src/imitation/scripts/train_rl.py +++ b/src/imitation/scripts/train_rl.py @@ -159,7 +159,6 @@ def 
train_rl( # Final evaluation of expert policy. eval_stats = policy_evaluation.eval_policy(rl_algo, venv) - eval_stats["mean_return"] = eval_stats["monitor_return_mean"] return eval_stats From ab0126998a4f8beb44e93eb11d6c2b17e68038a8 Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Wed, 14 Jun 2023 07:40:52 +0530 Subject: [PATCH 17/47] Change metric everywhere --- src/imitation/scripts/parallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index 2bb0129cb..6f77330df 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -144,7 +144,7 @@ def parallel( upload_dir=upload_dir, syncer=syncer, ), - metric="mean_return", + metric=return_key, resume=True, ) print( From e896d7db127f9025d89387cc10e513409fd973b1 Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Tue, 11 Jul 2023 16:03:02 +0530 Subject: [PATCH 18/47] Separate tuning code from parallel.py --- benchmarking/tuning.py | 102 ++++++++++ benchmarking/tuning_config.py | 237 +++++++++++++++++++++++ setup.cfg | 1 + src/imitation/scripts/config/parallel.py | 216 +-------------------- src/imitation/scripts/parallel.py | 101 ++-------- 5 files changed, 363 insertions(+), 294 deletions(-) create mode 100644 benchmarking/tuning.py create mode 100644 benchmarking/tuning_config.py diff --git a/benchmarking/tuning.py b/benchmarking/tuning.py new file mode 100644 index 000000000..b4e62a84a --- /dev/null +++ b/benchmarking/tuning.py @@ -0,0 +1,102 @@ +"""Tunes the hyperparameters of the algorithms.""" + +import copy +import pathlib +from typing import Any, Dict + +import numpy as np +import ray +from pandas.api import types as pd_types +from sacred.observers import FileStorageObserver +from tuning_config import parallel_ex, tuning_ex + + +@tuning_ex.main +def tune( + parallel: Dict[str, Any], + eval_best_trial: bool = False, + 
eval_best_trial_resource_multiplier: int = 1, + eval_trial_seeds: int = 5, +) -> None: + """Tune hyperparameters of imitation algorithms using parallel script. + + Args: + parallel: A dictionary of arguments from the parallel script. + eval_best_trial: Whether to evaluate the trial with the best mean return + at the end of tuning on a separate set of seeds. + eval_best_trial_resource_multiplier: factor by which to multiply the + number of cpus per trial in `resources_per_trial`. + eval_trial_seeds: Number of distinct seeds to evaluate the best trial on. + """ + run = parallel_ex.run(config_updates=parallel) + result = run.result + + if eval_best_trial: + if parallel["sacred_ex_name"] == "train_rl": + return_key = "monitor_return_mean" + else: + return_key = "imit_stats/monitor_return_mean" + df = result.results_df + df = df[df["config/named_configs"].notna()] + # convert object dtype to str required by df.groupby + for col in df.columns: + if pd_types.is_object_dtype(df[col]): + df[col] = df[col].astype("str") + # group into separate HP configs + grp_keys = [c for c in df.columns if c.startswith("config") and "seed" not in c] + grps = df.groupby(grp_keys) + # store mean return of runs across all seeds in a group + df["mean_return"] = grps[return_key].transform(lambda x: x.mean()) + best_config_df = df[df["mean_return"] == df["mean_return"].max()] + row = best_config_df.iloc[0] + best_config_tag = row["experiment_tag"] + if result.trials is not None: + trial = [t for t in result.trials if best_config_tag in t.experiment_tag][0] + best_config = trial.config + print("Mean return:", row["mean_return"]) + print( + "All returns:", + df[df["mean_return"] == row["mean_return"]][return_key], + ) + print("Total seeds:", (df["mean_return"] == row["mean_return"]).sum()) + + best_config["config_updates"].update( + seed=ray.tune.grid_search(list(range(100, 100 + eval_trial_seeds))), + ) + + resources_per_trial_eval = copy.deepcopy(parallel["resources_per_trial"]) + # update cpus 
per trial only if it is provided in `resources_per_trial` + # Uses the default values (cpu=1) if it is not provided + if "cpu" in parallel["resources_per_trial"]: + resources_per_trial_eval["cpu"] *= eval_best_trial_resource_multiplier + + eval_config_updates = parallel.copy() + eval_config_updates.update( + run_name=parallel["run_name"] + "_best_hp_eval", + num_samples=1, + search_space=best_config, + base_named_configs=parallel["base_named_configs"], + base_config_updates=parallel["base_config_updates"], + resources_per_trial=resources_per_trial_eval, + search_alg=None, + repeat=1, + experiment_checkpoint_path="", + resume=False, + ) + eval_run = parallel_ex.run(config_updates=eval_config_updates) + eval_result = eval_run.result + returns = eval_result.results_df[return_key].to_numpy() + print("All returns:", returns) + print("Mean:", np.mean(returns)) + print("Std:", np.std(returns)) + + +def main_console(): + observer_path = pathlib.Path.cwd() / "output" / "sacred" / "tuning" + observer = FileStorageObserver(observer_path) + tuning_ex.observers.append(observer) + tuning_ex.run_commandline() + + +if __name__ == "__main__": # pragma: no cover + main_console() diff --git a/benchmarking/tuning_config.py b/benchmarking/tuning_config.py new file mode 100644 index 000000000..79c8d0347 --- /dev/null +++ b/benchmarking/tuning_config.py @@ -0,0 +1,237 @@ +"""Config files for tuning experiments.""" + +import ray.tune as tune +import sacred +from torch import nn + +from imitation.algorithms import dagger +from imitation.scripts.parallel import parallel_ex + +tuning_ex = sacred.Experiment("tuning", ingredients=[parallel_ex]) + + +@tuning_ex.named_config +def example_rl(): + parallel = dict( + sacred_ex_name="train_rl", + run_name="rl_tuning", + base_named_configs=["logging.wandb_logging"], + base_config_updates={"environment": {"num_vec": 1}}, + search_space={ + "config_updates": { + "rl": { + "batch_size": tune.choice([512, 1024, 2048, 4096, 8192]), + "rl_kwargs": { + 
"learning_rate": tune.loguniform(1e-5, 1e-2), + "batch_size": tune.choice([64, 128, 256, 512]), + "n_epochs": tune.choice([5, 10, 20]), + }, + }, + }, + }, + num_samples=100, + repeat=1, + resources_per_trial=dict(cpu=1), + ) + eval_best_trial = True + eval_trial_seeds = 5 + + +@tuning_ex.named_config +def example_bc(): + parallel = dict( + sacred_ex_name="train_imitation", + run_name="bc_tuning", + base_named_configs=["logging.wandb_logging"], + base_config_updates={ + "environment": {"num_vec": 1}, + "demonstrations": {"source": "huggingface"}, + }, + search_space={ + "config_updates": { + "bc": dict( + batch_size=tune.choice([8, 16, 32, 64]), + l2_weight=tune.loguniform(1e-6, 1e-2), # L2 regularization weight + optimizer_kwargs=dict( + lr=tune.loguniform(1e-5, 1e-2), + ), + train_kwargs=dict( + n_epochs=tune.choice([1, 5, 10, 20]), + ), + ), + }, + "command_name": "bc", + }, + num_samples=2, + repeat=1, + resources_per_trial=dict(cpu=1), + ) + + eval_best_trial = True + eval_trial_seeds = 5 + eval_best_trial_resource_multiplier = 1 + + +@tuning_ex.named_config +def example_dagger(): + parallel = dict( + sacred_ex_name="train_imitation", + run_name="dagger_tuning", + base_named_configs=["logging.wandb_logging"], + base_config_updates={ + "environment": {"num_vec": 1}, + "demonstrations": {"source": "huggingface"}, + "dagger": {"total_timesteps": 1e5}, + "bc": { + "batch_size": 16, + "l2_weight": 1e-4, + "optimizer_kwargs": {"lr": 1e-3}, + }, + }, + search_space={ + "config_updates": { + "bc": dict( + train_kwargs=dict( + n_epochs=tune.choice([1, 5, 10]), + ), + ), + "dagger": dict( + beta_schedule=tune.choice( + [dagger.LinearBetaSchedule(i) for i in [1, 5, 15]] + + [dagger.ExponentialBetaSchedule(i) for i in [0.3, 0.5, 0.7]], + ), + rollout_round_min_episodes=tune.choice([3, 5, 10]), + ), + }, + "command_name": "dagger", + }, + num_samples=50, + repeat=3, + resources_per_trial=dict(cpu=1), + ) + eval_best_trial = True + eval_trial_seeds = 5 + + 
+@tuning_ex.named_config +def example_gail(): + parallel = dict( + sacred_ex_name="train_adversarial", + run_name="gail_tuning_hc", + base_named_configs=["logging.wandb_logging"], + base_config_updates={ + "environment": {"num_vec": 1}, + "demonstrations": {"source": "huggingface"}, + "total_timesteps": 1e7, + }, + search_space={ + "config_updates": { + "algorithm_kwargs": dict( + demo_batch_size=tune.choice([32, 128, 512, 2048, 8192]), + n_disc_updates_per_round=tune.choice([8, 16]), + ), + "rl": { + "batch_size": tune.choice([4096, 8192, 16384]), + "rl_kwargs": { + "ent_coef": tune.loguniform(1e-7, 1e-3), + "learning_rate": tune.loguniform(1e-5, 1e-2), + }, + }, + "algorithm_specific": {}, + }, + "command_name": "gail", + }, + num_samples=100, + repeat=3, + resources_per_trial=dict(cpu=1), + ) + eval_best_trial = True + eval_trial_seeds = 5 + + +@tuning_ex.named_config +def example_airl(): + parallel = dict( + sacred_ex_name="train_adversarial", + run_name="airl_tuning", + base_named_configs=["logging.wandb_logging"], + base_config_updates={ + "environment": {"num_vec": 1}, + "demonstrations": {"source": "huggingface"}, + "total_timesteps": 1e7, + }, + search_space={ + "config_updates": { + "algorithm_kwargs": dict( + demo_batch_size=tune.choice([32, 128, 512, 2048, 8192]), + n_disc_updates_per_round=tune.choice([8, 16]), + ), + "rl": { + "batch_size": tune.choice([4096, 8192, 16384]), + "rl_kwargs": { + "ent_coef": tune.loguniform(1e-7, 1e-3), + "learning_rate": tune.loguniform(1e-5, 1e-2), + }, + }, + "algorithm_specific": {}, + }, + "command_name": "airl", + }, + num_samples=100, + repeat=3, + resources_per_trial=dict(cpu=1), + ) + + eval_best_trial = True + eval_trial_seeds = 5 + + +@tuning_ex.named_config +def example_pc(): + parallel = dict( + sacred_ex_name="train_preference_comparisons", + run_name="pc_tuning", + base_named_configs=["logging.wandb_logging"], + base_config_updates={ + "environment": {"num_vec": 1}, + "demonstrations": {"source": 
"huggingface"}, + "total_timesteps": 2e7, + "total_comparisons": 5000, + "query_schedule": "hyperbolic", + "gatherer_kwargs": {"sample": True}, + }, + search_space={ + "named_configs": [ + ["reward.normalize_output_disable"], + ], + "config_updates": { + "train": { + "policy_kwargs": { + "activation_fn": tune.choice( + [ + nn.ReLU, + ], + ), + }, + }, + "num_iterations": tune.choice([25, 50]), + "initial_comparison_frac": tune.choice([0.1, 0.25]), + "reward_trainer_kwargs": { + "epochs": tune.choice([1, 3, 6]), + }, + "rl": { + "batch_size": tune.choice([512, 2048, 8192]), + "rl_kwargs": { + "learning_rate": tune.loguniform(1e-5, 1e-2), + "ent_coef": tune.loguniform(1e-7, 1e-3), + }, + }, + }, + }, + num_samples=100, + repeat=3, + resources_per_trial=dict(cpu=1), + ) + + eval_best_trial = True + eval_trial_seeds = 5 diff --git a/setup.cfg b/setup.cfg index 979c3ca46..f39db322f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -7,6 +7,7 @@ per-file-ignores = # F841 local variable unused [for Sacred config scopes] src/imitation/scripts/config/*.py:F841 ../src/imitation/scripts/config/*.py:F841 + benchmarking/tuning_config.py:F841 src/imitation/envs/examples/airl_envs/*.py:D [darglint] diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py index 095c67107..e9c5b8245 100644 --- a/src/imitation/scripts/config/parallel.py +++ b/src/imitation/scripts/config/parallel.py @@ -16,9 +16,7 @@ import numpy as np import ray.tune as tune import sacred -from torch import nn -from imitation.algorithms import dagger from imitation.util.util import make_unique_timestamp parallel_ex = sacred.Experiment("parallel") @@ -45,6 +43,10 @@ def config(): eval_trial_seeds = 5 # Number of seeds to search over by default num_samples = 1 # Number of samples per grid search configuration repeat = 1 + search_alg = "optuna" # search algorithm to use + experiment_checkpoint_path = "" # Path to checkpoint of experiment to resume + syncer = None # Sacred syncer to use + 
resume = False # Whether to resume experiment from checkpoint # Debug named configs @@ -100,213 +102,3 @@ def example_cartpole_rl(): } base_named_configs = ["cartpole"] resources_per_trial = dict(cpu=4) - - -@parallel_ex.named_config -def example_rl(): - sacred_ex_name = "train_rl" - run_name = "rl_tuning" - base_named_configs = ["logging.wandb_logging"] - base_config_updates = {"environment": {"num_vec": 1}} - search_space = { - "config_updates": { - "rl": { - "batch_size": tune.choice([512, 1024, 2048, 4096, 8192]), - "rl_kwargs": { - "learning_rate": tune.loguniform(1e-5, 1e-2), - "batch_size": tune.choice([64, 128, 256, 512]), - "n_epochs": tune.choice([5, 10, 20]), - }, - }, - }, - } - num_samples = 100 - eval_best_trial = True - eval_trial_seeds = 5 - repeat = 1 - resources_per_trial = dict(cpu=1) - - -@parallel_ex.named_config -def example_bc(): - sacred_ex_name = "train_imitation" - run_name = "bc_tuning" - base_named_configs = ["logging.wandb_logging"] - base_config_updates = { - "environment": {"num_vec": 1}, - "demonstrations": {"rollout_type": "ppo-huggingface"}, - } - search_space = { - "config_updates": { - "bc": dict( - batch_size=tune.choice([8, 16, 32, 64]), - l2_weight=tune.loguniform(1e-6, 1e-2), # L2 regularization weight - optimizer_kwargs=dict( - lr=tune.loguniform(1e-5, 1e-2), - ), - train_kwargs=dict( - n_epochs=tune.choice([1, 5, 10, 20]), - ), - ), - }, - "command_name": "bc", - } - num_samples = 64 - eval_best_trial = True - eval_trial_seeds = 5 - repeat = 3 - resources_per_trial = dict(cpu=1) - - -@parallel_ex.named_config -def example_dagger(): - sacred_ex_name = "train_imitation" - run_name = "dagger_tuning" - base_named_configs = ["logging.wandb_logging"] - base_config_updates = { - "environment": {"num_vec": 1}, - "demonstrations": {"rollout_type": "ppo-huggingface"}, - "dagger": {"total_timesteps": 1e5}, - "bc": { - "batch_size": 16, - "l2_weight": 1e-4, - "optimizer_kwargs": {"lr": 1e-3}, - }, - } - search_space = { - 
"config_updates": { - "bc": dict( - train_kwargs=dict( - n_epochs=tune.choice([1, 5, 10]), - ), - ), - "dagger": dict( - beta_schedule=tune.choice( - [dagger.LinearBetaSchedule(i) for i in [1, 5, 15]] - + [dagger.ExponentialBetaSchedule(i) for i in [0.3, 0.5, 0.7]], - ), - rollout_round_min_episodes=tune.choice([3, 5, 10]), - ), - }, - "command_name": "dagger", - } - num_samples = 50 - repeat = 3 - eval_best_trial = True - eval_trial_seeds = 5 - resources_per_trial = dict(cpu=1) - - -@parallel_ex.named_config -def example_gail(): - sacred_ex_name = "train_adversarial" - run_name = "gail_tuning_hc" - base_named_configs = ["logging.wandb_logging"] - base_config_updates = { - "environment": {"num_vec": 1}, - "demonstrations": {"rollout_type": "ppo-huggingface"}, - "total_timesteps": 1e7, - } - search_space = { - "config_updates": { - "algorithm_kwargs": dict( - demo_batch_size=tune.choice([32, 128, 512, 2048, 8192]), - n_disc_updates_per_round=tune.choice([8, 16]), - ), - "rl": { - "batch_size": tune.choice([4096, 8192, 16384]), - "rl_kwargs": { - "ent_coef": tune.loguniform(1e-7, 1e-3), - "learning_rate": tune.loguniform(1e-5, 1e-2), - }, - }, - "algorithm_specific": {}, - }, - "command_name": "gail", - } - num_samples = 100 - eval_best_trial = True - eval_trial_seeds = 5 - repeat = 3 - resources_per_trial = dict(cpu=1) - - -@parallel_ex.named_config -def example_airl(): - sacred_ex_name = "train_adversarial" - run_name = "airl_tuning" - base_named_configs = ["logging.wandb_logging"] - base_config_updates = { - "environment": {"num_vec": 1}, - "demonstrations": {"rollout_type": "ppo-huggingface"}, - "total_timesteps": 1e7, - } - search_space = { - "config_updates": { - "algorithm_kwargs": dict( - demo_batch_size=tune.choice([32, 128, 512, 2048, 8192]), - n_disc_updates_per_round=tune.choice([8, 16]), - ), - "rl": { - "batch_size": tune.choice([4096, 8192, 16384]), - "rl_kwargs": { - "ent_coef": tune.loguniform(1e-7, 1e-3), - "learning_rate": tune.loguniform(1e-5, 
1e-2), - }, - }, - "algorithm_specific": {}, - }, - "command_name": "airl", - } - num_samples = 100 - eval_best_trial = True - eval_trial_seeds = 5 - repeat = 3 - resources_per_trial = dict(cpu=1) - - -@parallel_ex.named_config -def example_pc(): - sacred_ex_name = "train_preference_comparisons" - run_name = "pc_tuning" - base_named_configs = ["logging.wandb_logging"] - base_config_updates = { - "environment": {"num_vec": 1}, - "total_timesteps": 2e7, - "total_comparisons": 5000, - "query_schedule": "hyperbolic", - "gatherer_kwargs": {"sample": True}, - } - search_space = { - "named_configs": [ - ["reward.normalize_output_disable"], - ], - "config_updates": { - "train": { - "policy_kwargs": { - "activation_fn": tune.choice( - [ - nn.ReLU, - ], - ), - }, - }, - "num_iterations": tune.choice([25, 50]), - "initial_comparison_frac": tune.choice([0.1, 0.25]), - "reward_trainer_kwargs": { - "epochs": tune.choice([1, 3, 6]), - }, - "rl": { - "batch_size": tune.choice([512, 2048, 8192]), - "rl_kwargs": { - "learning_rate": tune.loguniform(1e-5, 1e-2), - "ent_coef": tune.loguniform(1e-7, 1e-3), - }, - }, - }, - } - num_samples = 100 - eval_best_trial = True - eval_trial_seeds = 5 - repeat = 3 - resources_per_trial = dict(cpu=1) diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index 6f77330df..2417414cb 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -6,11 +6,9 @@ import pathlib from typing import Any, Callable, Dict, Mapping, Optional, Sequence, Union -import numpy as np import ray import ray.tune import sacred -from pandas.api.types import is_object_dtype from ray.tune import search from ray.tune.registry import register_trainable from ray.tune.search import optuna @@ -31,14 +29,12 @@ def parallel( init_kwargs: Mapping[str, Any], local_dir: Optional[str], upload_dir: Optional[str], - repeat: int = 1, - eval_best_trial: bool = False, - eval_best_trial_resource_multiplier: int = 1, - 
eval_trial_seeds: int = 5, - experiment_checkpoint_path: str = "", - syncer=None, - resume: Union[str, bool] = False, -) -> None: + repeat: int, + search_alg: Optional[str], + experiment_checkpoint_path: str, + syncer, + resume: Union[str, bool], +) -> ray.tune.ExperimentAnalysis: """Parallelize multiple runs of another Sacred Experiment using Ray Tune. A Sacred FileObserver is attached to the inner experiment and writes Sacred @@ -47,7 +43,7 @@ def parallel( Args: sacred_ex_name: The Sacred experiment to tune. Either "train_rl" or - "train_adversarial". + "train_imitation" or "train_adversarial" or "train_preference_comparisons". run_name: A name describing this parallelizing experiment. This argument is also passed to `ray.tune.run` as the `name` argument. It is also saved in 'sacred/run.json' of each inner Sacred experiment @@ -78,24 +74,19 @@ def parallel( init_kwargs: Arguments to pass to `ray.init`. local_dir: `local_dir` argument to `ray.tune.run()`. upload_dir: `upload_dir` argument to `ray.tune.run()`. + search_alg: can be either "optuna" or None. repeat: Number of runs to repeat each trial for. - eval_best_trial: Whether to evaluate the trial with the best mean return - at the end of tuning on a separate set of seeds. - eval_best_trial_resource_multiplier: factor by which to multiply the - number of cpus per trial in `resources_per_trial`. - eval_trial_seeds: Number of distinct seeds to evaluate the best trial on. - experiment_checkpoint_path: Path containing the checkpoints of a previous - experiment ran using this script. Useful for resuming cancelled trials - of the experiments (using `resume`) or evaluating the best trial of the - experiment (using `eval_best_trial`). + Not used if `search_alg` is None. resume: If true and `experiment_checkpoint_path` is given, then resumes the experiment by restarting the trials that did not finish in the experiment checkpoint path. syncer: `syncer` argument to `ray.tune.syncer.SyncConfig`. 
- Raises: TypeError: Named configs not string sequences or config updates not mappings. + + Returns: + The result of `ray.tune.run()`. """ # Basic validation for config options before we enter parallel jobs. if not isinstance(base_named_configs, collections.abc.Sequence): @@ -126,7 +117,11 @@ def parallel( ) ray.init(**init_kwargs) - search_alg = search.Repeater(optuna.OptunaSearch(), repeat=repeat) + if search_alg == "optuna": + algo = search.Repeater(optuna.OptunaSearch(), repeat=repeat) + else: + assert repeat == 1 # repeat should not be used if search_alg is None + algo = None if sacred_ex_name == "train_rl": return_key = "monitor_return_mean" @@ -166,7 +161,6 @@ def parallel( result.trials = None result.fetch_trial_dataframes() else: - # run hyperparameter tuning result = ray.tune.run( trainable, config=search_space, @@ -178,68 +172,11 @@ def parallel( upload_dir=upload_dir, syncer=syncer, ), - search_alg=search_alg, + search_alg=algo, metric=return_key, mode="max", ) - if eval_best_trial: - df = result.results_df - df = df[df["config/named_configs"].notna()] - # convert object dtype to str required by df.groupby - for col in df.columns: - if is_object_dtype(df[col]): - df[col] = df[col].astype("str") - # group into separate HP configs - grp_keys = [ - c for c in df.columns if c.startswith("config") and "seed" not in c - ] - grps = df.groupby(grp_keys) - # store mean return of runs across all seeds in a group - df["mean_return"] = grps[return_key].transform(lambda x: x.mean()) - best_config_df = df[df["mean_return"] == df["mean_return"].max()] - row = best_config_df.iloc[0] - best_config_tag = row["experiment_tag"] - if result.trials is not None: - trial = [ - t for t in result.trials if best_config_tag in t.experiment_tag - ][0] - best_config = trial.config - print("Mean return:", row["mean_return"]) - print( - "All returns:", - df[df["mean_return"] == row["mean_return"]][return_key], - ) - print("Total seeds:", (df["mean_return"] == 
row["mean_return"]).sum()) - best_config["config_updates"].update( - seed=ray.tune.grid_search(list(range(100, 100 + eval_trial_seeds))), - ) - - resources_per_trial_eval = copy.deepcopy(resources_per_trial) - # update cpus per trial only if it is provided in `resources_per_trial` - # Uses the default values (cpu=1) if it is not provided - if "cpu" in resources_per_trial: - - resources_per_trial_eval[ - "cpu" - ] *= eval_best_trial_resource_multiplier - best_config["config_updates"].update( - environment=dict(num_vec=resources_per_trial_eval["cpu"]), - ) - - eval_result = ray.tune.run( - trainable, - config={ - "named_configs": best_config["named_configs"], - "config_updates": best_config["config_updates"], - "command_name": best_config.get("command_name", None), - }, - name=run_name + "_best_hp_eval", - resources_per_trial=resources_per_trial_eval, - ) - returns = eval_result.results_df[return_key].to_numpy() - print("All returns:", returns) - print("Mean:", np.mean(returns)) - print("Std:", np.std(returns)) + return result finally: ray.shutdown() From 64c3a8d0deb8748eba2a69be20d7f9a464639523 Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Tue, 11 Jul 2023 16:07:13 +0530 Subject: [PATCH 19/47] Fix docstring --- src/imitation/scripts/parallel.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index 2417414cb..10ae9f924 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -77,6 +77,10 @@ def parallel( search_alg: can be either "optuna" or None. repeat: Number of runs to repeat each trial for. Not used if `search_alg` is None. + experiment_checkpoint_path: Path containing the checkpoints of a previous + experiment ran using this script. Useful for resuming cancelled trials + of the experiments (using `resume`) or evaluating the best trial of the + experiment (using `eval_best_trial`). 
resume: If true and `experiment_checkpoint_path` is given, then resumes the experiment by restarting the trials that did not finish in the experiment checkpoint path. From 8fba0d3ac9b690613b7526b68bd1c68b3ac6efa7 Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Tue, 11 Jul 2023 17:42:08 +0530 Subject: [PATCH 20/47] Removing resume option as it is getting tricky to correctly implement --- src/imitation/scripts/config/parallel.py | 5 +--- src/imitation/scripts/parallel.py | 31 ++---------------------- tests/scripts/test_scripts.py | 1 + 3 files changed, 4 insertions(+), 33 deletions(-) diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py index e9c5b8245..3416f9442 100644 --- a/src/imitation/scripts/config/parallel.py +++ b/src/imitation/scripts/config/parallel.py @@ -39,14 +39,11 @@ def config(): local_dir = None # `local_dir` arg for `ray.tune.run` upload_dir = None # `upload_dir` arg for `ray.tune.run` experiment_checkpoint_path = "" - eval_best_trial = False - eval_trial_seeds = 5 # Number of seeds to search over by default num_samples = 1 # Number of samples per grid search configuration repeat = 1 search_alg = "optuna" # search algorithm to use - experiment_checkpoint_path = "" # Path to checkpoint of experiment to resume + experiment_checkpoint_path = "" # Path to checkpoint of experiment syncer = None # Sacred syncer to use - resume = False # Whether to resume experiment from checkpoint # Debug named configs diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index 10ae9f924..bf73c1c72 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -33,7 +33,6 @@ def parallel( search_alg: Optional[str], experiment_checkpoint_path: str, syncer, - resume: Union[str, bool], ) -> ray.tune.ExperimentAnalysis: """Parallelize multiple runs of another Sacred Experiment using Ray Tune. 
@@ -78,12 +77,8 @@ def parallel( repeat: Number of runs to repeat each trial for. Not used if `search_alg` is None. experiment_checkpoint_path: Path containing the checkpoints of a previous - experiment ran using this script. Useful for resuming cancelled trials - of the experiments (using `resume`) or evaluating the best trial of the - experiment (using `eval_best_trial`). - resume: If true and `experiment_checkpoint_path` is given, then resumes the - experiment by restarting the trials that did not finish in the experiment - checkpoint path. + experiment ran using this script. Useful for evaluating the best trial + of the experiment. syncer: `syncer` argument to `ray.tune.syncer.SyncConfig`. Raises: @@ -134,28 +129,6 @@ def parallel( try: if experiment_checkpoint_path: - if resume: - # restart failed runs from experiment_checkpoint_path - register_trainable("inner", trainable) - runner = ray.tune.execution.trial_runner.TrialRunner( - local_checkpoint_dir=experiment_checkpoint_path, - sync_config=ray.tune.syncer.SyncConfig( - upload_dir=upload_dir, - syncer=syncer, - ), - metric=return_key, - resume=True, - ) - print( - "Live trials:", - len(runner._live_trials), - "/", - len(runner._trials), - ) - while not runner.is_finished(): - runner.step() - print("Debug:", runner.debug_string()) - # load experiment analysis results result = ray.tune.ExperimentAnalysis(experiment_checkpoint_path) result._load_checkpoints_from_latest( diff --git a/tests/scripts/test_scripts.py b/tests/scripts/test_scripts.py index 4435155cd..586fa91ba 100644 --- a/tests/scripts/test_scripts.py +++ b/tests/scripts/test_scripts.py @@ -802,6 +802,7 @@ def test_train_rl_cnn_policy(tmpdir: str, rng): # Need absolute path because raylet runs in different working directory. "demonstrations.path": CARTPOLE_TEST_ROLLOUT_PATH.absolute(), }, + search_alg=None, # Use default search algorithm of ray. 
search_space={ "command_name": "airl", "config_updates": {"total_timesteps": tune.choice([5, 10])}, From 12ab31c1641b6b99abb6823cf037a3f9340cb86c Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Wed, 12 Jul 2023 04:26:17 +0530 Subject: [PATCH 21/47] Minor fixes --- src/imitation/scripts/config/analyze.py | 2 +- src/imitation/scripts/config/parallel.py | 2 +- src/imitation/scripts/parallel.py | 5 ++--- tests/scripts/test_scripts.py | 7 ++++--- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/imitation/scripts/config/analyze.py b/src/imitation/scripts/config/analyze.py index 5213a875d..01cc2d035 100644 --- a/src/imitation/scripts/config/analyze.py +++ b/src/imitation/scripts/config/analyze.py @@ -18,7 +18,7 @@ def config(): tex_output_path = None # Write LaTex output to this path print_table = True # Set to True to print analysis to stdout split_str = "," # str used to split source_dir_str into multiple source dirs - table_verbosity = 1 # Choose from 0, 1, or 2 + table_verbosity = 1 # Choose from 0, 1, 2 or 3 source_dirs = None diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py index 3416f9442..b09f9fc4a 100644 --- a/src/imitation/scripts/config/parallel.py +++ b/src/imitation/scripts/config/parallel.py @@ -42,7 +42,7 @@ def config(): num_samples = 1 # Number of samples per grid search configuration repeat = 1 search_alg = "optuna" # search algorithm to use - experiment_checkpoint_path = "" # Path to checkpoint of experiment + experiment_checkpoint_path = "" # Path to checkpoint of experiment syncer = None # Sacred syncer to use diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index bf73c1c72..ebda17c82 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -4,13 +4,12 @@ import copy import glob import pathlib -from typing import Any, Callable, Dict, Mapping, Optional, Sequence, Union +from typing import Any, 
Callable, Dict, Mapping, Optional, Sequence import ray import ray.tune import sacred from ray.tune import search -from ray.tune.registry import register_trainable from ray.tune.search import optuna from sacred.observers import FileStorageObserver @@ -78,7 +77,7 @@ def parallel( Not used if `search_alg` is None. experiment_checkpoint_path: Path containing the checkpoints of a previous experiment ran using this script. Useful for evaluating the best trial - of the experiment. + of the experiment. syncer: `syncer` argument to `ray.tune.syncer.SyncConfig`. Raises: diff --git a/tests/scripts/test_scripts.py b/tests/scripts/test_scripts.py index 586fa91ba..e17765471 100644 --- a/tests/scripts/test_scripts.py +++ b/tests/scripts/test_scripts.py @@ -802,7 +802,7 @@ def test_train_rl_cnn_policy(tmpdir: str, rng): # Need absolute path because raylet runs in different working directory. "demonstrations.path": CARTPOLE_TEST_ROLLOUT_PATH.absolute(), }, - search_alg=None, # Use default search algorithm of ray. + search_alg=None, # Use default search algorithm of ray. search_space={ "command_name": "airl", "config_updates": {"total_timesteps": tune.choice([5, 10])}, @@ -942,7 +942,7 @@ def test_analyze_imitation(tmpdir: str, run_names: List[str], run_sacred_fn): assert run.status == "COMPLETED" # Check that analyze script finds the correct number of logs. 
- def check(run_name: Optional[str], count: int) -> None: + def check(run_name: Optional[str], count: int, table_verbosity=1) -> None: run = analyze.analysis_ex.run( command_name="analyze_imitation", config_updates=dict( @@ -952,6 +952,7 @@ def check(run_name: Optional[str], count: int) -> None: csv_output_path=tmpdir_path / "analysis.csv", tex_output_path=tmpdir_path / "analysis.tex", print_table=True, + table_verbosity=table_verbosity, ), ) assert run.status == "COMPLETED" @@ -961,7 +962,7 @@ def check(run_name: Optional[str], count: int) -> None: for run_name, count in Counter(run_names).items(): check(run_name, count) - check(None, len(run_names)) # Check total number of logs. + check(None, len(run_names), table_verbosity=3) # Check total number of logs. def test_analyze_gather_tb(tmpdir: str): From 19b0f2c3ed8d7d2ef10aaabab21739d31b51261c Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Sun, 16 Jul 2023 10:39:12 +0530 Subject: [PATCH 22/47] Updates from review --- benchmarking/tuning.py | 202 +++++++++++++++-------- benchmarking/tuning_config.py | 36 ++-- src/imitation/scripts/config/parallel.py | 3 +- src/imitation/scripts/parallel.py | 9 +- tests/test_benchmarking.py | 27 +++ 5 files changed, 180 insertions(+), 97 deletions(-) diff --git a/benchmarking/tuning.py b/benchmarking/tuning.py index b4e62a84a..0c18b1256 100644 --- a/benchmarking/tuning.py +++ b/benchmarking/tuning.py @@ -13,82 +13,144 @@ @tuning_ex.main def tune( - parallel: Dict[str, Any], - eval_best_trial: bool = False, + parallel_run_config: Dict[str, Any], eval_best_trial_resource_multiplier: int = 1, - eval_trial_seeds: int = 5, + num_eval_seeds: int = 5, ) -> None: """Tune hyperparameters of imitation algorithms using parallel script. Args: - parallel: A dictionary of arguments from the parallel script. - eval_best_trial: Whether to evaluate the trial with the best mean return - at the end of tuning on a separate set of seeds. 
- eval_best_trial_resource_multiplier: factor by which to multiply the - number of cpus per trial in `resources_per_trial`. - eval_trial_seeds: Number of distinct seeds to evaluate the best trial on. + parallel_run_config: Dictionary of arguments to pass to the parallel script. + eval_best_trial_resource_multiplier: Factor by which to multiply the + number of cpus per trial in `resources_per_trial`. This is useful for + allocating more resources per trial to the evaluation trials than the + resources for hyperparameter tuning since number of evaluation trials + is usually much smaller than the number of tuning trials. + num_eval_seeds: Number of distinct seeds to evaluate the best trial on. + Set to 0 to disable evaluation. + + Raises: + ValueError: If no trials are returned by the parallel run. + """ + run = parallel_ex.run(config_updates=parallel_run_config) + experiment_analysis = run.result + if not experiment_analysis.trials: + raise ValueError( + "No trials found. Please ensure that the `experiment_checkpoint_path` " + "in `parallel_run_config` is passed correctly " + "or that the tuning run finished properly.", + ) + + return_key = "imit_stats/monitor_return_mean" + if parallel_run_config["sacred_ex_name"] == "train_rl": + return_key = "monitor_return_mean" + best_trial = find_best_trial(experiment_analysis, return_key, print_return=True) + + if num_eval_seeds > 0: # evaluate the best trial + resources_per_trial_eval = copy.deepcopy( + parallel_run_config["resources_per_trial"], + ) + # update cpus per trial only if it is provided in `resources_per_trial` + # Uses the default values (cpu=1) if it is not provided + if "cpu" in parallel_run_config["resources_per_trial"]: + resources_per_trial_eval["cpu"] *= eval_best_trial_resource_multiplier + evaluate_best_trial( + best_trial, + num_eval_seeds, + parallel_run_config, + resources_per_trial_eval, + return_key, + ) + + +def find_best_trial( + experiment_analysis: ray.tune.analysis.ExperimentAnalysis, + return_key: str, +
print_return: bool = False, +) -> ray.tune.experiment.Trial: + """Find the trial with the best mean return across all seeds. + + Args: + experiment_analysis: The result of a parallel/tuning experiment. + return_key: The key of the return metric in the results dataframe. + print_return: Whether to print the mean and std of the returns + of the best trial. + + Returns: + best_trial: The trial with the best mean return across all seeds. + """ + df = experiment_analysis.results_df + # convert object dtype to str required by df.groupby + for col in df.columns: + if pd_types.is_object_dtype(df[col]): + df[col] = df[col].astype("str") + # group into separate HP configs + grp_keys = [c for c in df.columns if c.startswith("config") and "seed" not in c] + grps = df.groupby(grp_keys) + # store mean return of runs across all seeds in a group + df["mean_return"] = grps[return_key].transform(lambda x: x.mean()) + best_config_df = df[df["mean_return"] == df["mean_return"].max()] + row = best_config_df.iloc[0] + best_config_tag = row["experiment_tag"] + assert experiment_analysis.trials is not None # for mypy + best_trial = [ + t for t in experiment_analysis.trials if best_config_tag in t.experiment_tag + ][0] + + if print_return: + all_returns = df[df["mean_return"] == row["mean_return"]][return_key] + all_returns = all_returns.to_numpy() + print("All returns:", all_returns) + print("Mean return:", row["mean_return"]) + print("Std return:", np.std(all_returns)) + print("Total seeds:", len(all_returns)) + return best_trial + + +def evaluate_best_trial( + best_trial: ray.tune.experiment.Trial, + num_eval_seeds: int, + parallel_run_config: Dict[str, Any], + resources_per_trial: Dict[str, int], + return_key: str, + print_return: bool = False, +): + """Evaluate the best trial of a parallel run on a separate set of seeds. + + Args: + best_trial: The trial with the best mean return across all seeds. + num_eval_seeds: Number of distinct seeds to evaluate the best trial on. 
+ parallel_run_config: Dictionary of arguments passed to the parallel + script to get best_trial. + resources_per_trial: Resources to be used for each evaluation trial. + return_key: The key of the return metric in the results dataframe. + print_return: Whether to print the mean and std of the evaluation returns. + + Returns: + eval_run: The result of the evaluation run. """ - run = parallel_ex.run(config_updates=parallel) - result = run.result - - if eval_best_trial: - if parallel["sacred_ex_name"] == "train_rl": - return_key = "monitor_return_mean" - else: - return_key = "imit_stats/monitor_return_mean" - df = result.results_df - df = df[df["config/named_configs"].notna()] - # convert object dtype to str required by df.groupby - for col in df.columns: - if pd_types.is_object_dtype(df[col]): - df[col] = df[col].astype("str") - # group into separate HP configs - grp_keys = [c for c in df.columns if c.startswith("config") and "seed" not in c] - grps = df.groupby(grp_keys) - # store mean return of runs across all seeds in a group - df["mean_return"] = grps[return_key].transform(lambda x: x.mean()) - best_config_df = df[df["mean_return"] == df["mean_return"].max()] - row = best_config_df.iloc[0] - best_config_tag = row["experiment_tag"] - if result.trials is not None: - trial = [t for t in result.trials if best_config_tag in t.experiment_tag][0] - best_config = trial.config - print("Mean return:", row["mean_return"]) - print( - "All returns:", - df[df["mean_return"] == row["mean_return"]][return_key], - ) - print("Total seeds:", (df["mean_return"] == row["mean_return"]).sum()) - - best_config["config_updates"].update( - seed=ray.tune.grid_search(list(range(100, 100 + eval_trial_seeds))), - ) - - resources_per_trial_eval = copy.deepcopy(parallel["resources_per_trial"]) - # update cpus per trial only if it is provided in `resources_per_trial` - # Uses the default values (cpu=1) if it is not provided - if "cpu" in parallel["resources_per_trial"]: - 
resources_per_trial_eval["cpu"] *= eval_best_trial_resource_multiplier - - eval_config_updates = parallel.copy() - eval_config_updates.update( - run_name=parallel["run_name"] + "_best_hp_eval", - num_samples=1, - search_space=best_config, - base_named_configs=parallel["base_named_configs"], - base_config_updates=parallel["base_config_updates"], - resources_per_trial=resources_per_trial_eval, - search_alg=None, - repeat=1, - experiment_checkpoint_path="", - resume=False, - ) - eval_run = parallel_ex.run(config_updates=eval_config_updates) - eval_result = eval_run.result - returns = eval_result.results_df[return_key].to_numpy() - print("All returns:", returns) - print("Mean:", np.mean(returns)) - print("Std:", np.std(returns)) + best_config = best_trial.config + best_config["config_updates"].update( + seed=ray.tune.grid_search(list(range(100, 100 + num_eval_seeds))), + ) + eval_config_updates = parallel_run_config.copy() + eval_config_updates.update( + run_name=parallel_run_config["run_name"] + "_best_hp_eval", + num_samples=1, + search_space=best_config, + resources_per_trial=resources_per_trial, + search_alg=None, + repeat=1, + experiment_checkpoint_path="", + ) + eval_run = parallel_ex.run(config_updates=eval_config_updates) + eval_result = eval_run.result + returns = eval_result.results_df[return_key].to_numpy() + if print_return: + print("All returns:", returns) + print("Mean:", np.mean(returns)) + print("Std:", np.std(returns)) + return eval_run def main_console(): diff --git a/benchmarking/tuning_config.py b/benchmarking/tuning_config.py index 79c8d0347..187963d02 100644 --- a/benchmarking/tuning_config.py +++ b/benchmarking/tuning_config.py @@ -12,7 +12,7 @@ @tuning_ex.named_config def example_rl(): - parallel = dict( + parallel_run_config = dict( sacred_ex_name="train_rl", run_name="rl_tuning", base_named_configs=["logging.wandb_logging"], @@ -33,13 +33,12 @@ def example_rl(): repeat=1, resources_per_trial=dict(cpu=1), ) - eval_best_trial = True - 
eval_trial_seeds = 5 + num_eval_seeds = 5 @tuning_ex.named_config def example_bc(): - parallel = dict( + parallel_run_config = dict( sacred_ex_name="train_imitation", run_name="bc_tuning", base_named_configs=["logging.wandb_logging"], @@ -62,19 +61,18 @@ def example_bc(): }, "command_name": "bc", }, - num_samples=2, - repeat=1, + num_samples=64, + repeat=3, resources_per_trial=dict(cpu=1), ) - eval_best_trial = True - eval_trial_seeds = 5 + num_eval_seeds = 5 eval_best_trial_resource_multiplier = 1 @tuning_ex.named_config def example_dagger(): - parallel = dict( + parallel_run_config = dict( sacred_ex_name="train_imitation", run_name="dagger_tuning", base_named_configs=["logging.wandb_logging"], @@ -109,13 +107,12 @@ def example_dagger(): repeat=3, resources_per_trial=dict(cpu=1), ) - eval_best_trial = True - eval_trial_seeds = 5 + num_eval_seeds = 5 @tuning_ex.named_config def example_gail(): - parallel = dict( + parallel_run_config = dict( sacred_ex_name="train_adversarial", run_name="gail_tuning_hc", base_named_configs=["logging.wandb_logging"], @@ -145,13 +142,12 @@ def example_gail(): repeat=3, resources_per_trial=dict(cpu=1), ) - eval_best_trial = True - eval_trial_seeds = 5 + num_eval_seeds = 5 @tuning_ex.named_config def example_airl(): - parallel = dict( + parallel_run_config = dict( sacred_ex_name="train_adversarial", run_name="airl_tuning", base_named_configs=["logging.wandb_logging"], @@ -181,14 +177,12 @@ def example_airl(): repeat=3, resources_per_trial=dict(cpu=1), ) - - eval_best_trial = True - eval_trial_seeds = 5 + num_eval_seeds = 5 @tuning_ex.named_config def example_pc(): - parallel = dict( + parallel_run_config = dict( sacred_ex_name="train_preference_comparisons", run_name="pc_tuning", base_named_configs=["logging.wandb_logging"], @@ -232,6 +226,4 @@ def example_pc(): repeat=3, resources_per_trial=dict(cpu=1), ) - - eval_best_trial = True - eval_trial_seeds = 5 + num_eval_seeds = 5 diff --git a/src/imitation/scripts/config/parallel.py 
b/src/imitation/scripts/config/parallel.py index b09f9fc4a..b38b6f28c 100644 --- a/src/imitation/scripts/config/parallel.py +++ b/src/imitation/scripts/config/parallel.py @@ -38,9 +38,8 @@ def config(): local_dir = None # `local_dir` arg for `ray.tune.run` upload_dir = None # `upload_dir` arg for `ray.tune.run` - experiment_checkpoint_path = "" num_samples = 1 # Number of samples per grid search configuration - repeat = 1 + repeat = 1 # Number of times to repeat a sampled configuration search_alg = "optuna" # search algorithm to use experiment_checkpoint_path = "" # Path to checkpoint of experiment syncer = None # Sacred syncer to use diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index ebda17c82..93aa932b9 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -72,11 +72,13 @@ def parallel( init_kwargs: Arguments to pass to `ray.init`. local_dir: `local_dir` argument to `ray.tune.run()`. upload_dir: `upload_dir` argument to `ray.tune.run()`. - search_alg: can be either "optuna" or None. + search_alg: can be either "optuna" or None. Setting `None` allows for + adding grid_search to the `search_space` hyperparameters but doesn't allow + for trials to be repeated. repeat: Number of runs to repeat each trial for. Not used if `search_alg` is None. experiment_checkpoint_path: Path containing the checkpoints of a previous - experiment ran using this script. Useful for evaluating the best trial + experiment ran using this script. Useful for evaluating the best trial of the experiment. syncer: `syncer` argument to `ray.tune.syncer.SyncConfig`. @@ -84,7 +86,8 @@ def parallel( TypeError: Named configs not string sequences or config updates not mappings. Returns: - The result of `ray.tune.run()`. + The result of running the parallel experiment with `ray.tune.run()`. + Useful for fetching the configs and results dataframe of all the trials. 
""" # Basic validation for config options before we enter parallel jobs. if not isinstance(base_named_configs, collections.abc.Sequence): diff --git a/tests/test_benchmarking.py b/tests/test_benchmarking.py index ba01b38a2..4a8f6ea6f 100644 --- a/tests/test_benchmarking.py +++ b/tests/test_benchmarking.py @@ -1,5 +1,7 @@ """Tests for config files in benchmarking/ folder.""" import pathlib +import subprocess +import sys import pytest @@ -44,3 +46,28 @@ def test_benchmarks_print_config_succeeds(algorithm: str, environment: str): # THEN assert run.status == "COMPLETED" + + +@pytest.mark.parametrize("algorithm", ALGORITHMS) +def test_tuning_print_config_succeeds(algorithm: str): + # We test the configs using the print_config command, + # because running the configs requires MuJoCo. + # Requiring MuJoCo to run the tests adds too much complexity. + + # We need to use sys.executable, not just "python", on Windows as + # subprocess.call ignores PATH (unless shell=True) so runs a + # system-wide Python interpreter outside of our venv. 
See: + # https://stackoverflow.com/questions/5658622/ + tuning_path = str(BENCHMARKING_DIR / "tuning.py") + env = 'parallel_run_config.base_named_configs=["seals_cartpole"]' + exit_code = subprocess.call( + [ + sys.executable, + tuning_path, + "print_config", + "with", + f"example_{algorithm}", + env, + ], + ) + assert exit_code == 0 From 046b8d9987e13a8d87f2bd52fe75be562e80db04 Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Sun, 16 Jul 2023 13:04:14 +0530 Subject: [PATCH 23/47] fix lint error --- src/imitation/scripts/config/parallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py index b38b6f28c..e81a617db 100644 --- a/src/imitation/scripts/config/parallel.py +++ b/src/imitation/scripts/config/parallel.py @@ -39,7 +39,7 @@ def config(): local_dir = None # `local_dir` arg for `ray.tune.run` upload_dir = None # `upload_dir` arg for `ray.tune.run` num_samples = 1 # Number of samples per grid search configuration - repeat = 1 # Number of times to repeat a sampled configuration + repeat = 1 # Number of times to repeat a sampled configuration search_alg = "optuna" # search algorithm to use experiment_checkpoint_path = "" # Path to checkpoint of experiment syncer = None # Sacred syncer to use From 8eee0822d3fb4686d5801a6e955fdde0c9a90ce7 Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Sun, 16 Jul 2023 13:52:43 +0530 Subject: [PATCH 24/47] Add documentation for using the tuning script --- benchmarking/README.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/benchmarking/README.md b/benchmarking/README.md index 3f5114545..95e67f1d3 100644 --- a/benchmarking/README.md +++ b/benchmarking/README.md @@ -17,3 +17,24 @@ python -m imitation.scripts. with benchmarking/.json') ``` + +# Tuning Hyperparameters + +The hyperparameters of any algorithm in imitation can be tuned using the `tuning.py` script. 
+The benchmarking hyperparameter configs were generated by tuning the hyperparameters using +the search space defined in the `tuning_config.py` script. The tuning script proceeds in two +phases: 1) The hyperparameters are tuned using the search space provided, and 2) the best +hyperparameter config found in the first phase based on the maximum mean return is +re-evaluated on a separate set of seeds, and the mean and standard deviation of these trials +are reported. + +To tune the hyperparameters of an algorithm using the default search space provided: +```bash +python tuning.py with example_{algo} 'parallel_run_config.base_named_configs=["{env}"]' +``` + +In this command, `example_{algo}` provides the default search space and settings to be used for +the specific algorithm, which is defined in the `tuning_config.py` script and +`'parallel_run_config.base_named_configs=["{env}"]'` sets the environment to tune the algorithm in. +See the documentation of `tuning.py` and `parallel.py` scripts for many other arguments that can be +provided through the command line to change the tuning behavior. From 5ce765859f7cd295ae607cab2709d0f626c65de7 Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Mon, 17 Jul 2023 09:08:04 +0530 Subject: [PATCH 25/47] Fix lint error --- benchmarking/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarking/README.md b/benchmarking/README.md index 95e67f1d3..892908ac8 100644 --- a/benchmarking/README.md +++ b/benchmarking/README.md @@ -23,7 +23,7 @@ ex.add_config('benchmarking/.json') The hyperparameters of any algorithm in imitation can be tuned using the `tuning.py` script. The benchmarking hyperparameter configs were generated by tuning the hyperparameters using the search space defined in the `tuning_config.py` script. 
The tuning script proceeds in two -phases: 1) The hyperparameters are tuned using the search space provided, and 2) the best +phases: 1) The hyperparameters are tuned using the search space provided, and 2) the best hyperparameter config found in the first phase based on the maximum mean return is re-evaluated on a separate set of seeds, and the mean and standard deviation of these trials are reported. From a8be3316b653451ce8366379cf413627dd22e1ec Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Tue, 18 Jul 2023 11:09:05 +0530 Subject: [PATCH 26/47] Updates from the review --- benchmarking/README.md | 4 ++-- ....json => airl_seals_ant_best_hp_eval.json} | 0 ...airl_seals_half_cheetah_best_hp_eval.json} | 0 ...on => airl_seals_hopper_best_hp_eval.json} | 0 ...n => airl_seals_swimmer_best_hp_eval.json} | 0 ...on => airl_seals_walker_best_hp_eval.json} | 0 ...al.json => bc_seals_ant_best_hp_eval.json} | 0 ...> bc_seals_half_cheetah_best_hp_eval.json} | 0 ...json => bc_seals_hopper_best_hp_eval.json} | 0 ...son => bc_seals_swimmer_best_hp_eval.json} | 0 ...json => bc_seals_walker_best_hp_eval.json} | 0 ...son => dagger_seals_ant_best_hp_eval.json} | 0 ...gger_seals_half_cheetah_best_hp_eval.json} | 0 ... => dagger_seals_hopper_best_hp_eval.json} | 0 ...=> dagger_seals_swimmer_best_hp_eval.json} | 0 ... 
=> dagger_seals_walker_best_hp_eval.json} | 0 ....json => gail_seals_ant_best_hp_eval.json} | 0 ...gail_seals_half_cheetah_best_hp_eval.json} | 0 ...on => gail_seals_hopper_best_hp_eval.json} | 0 ...n => gail_seals_swimmer_best_hp_eval.json} | 0 ...on => gail_seals_walker_best_hp_eval.json} | 0 benchmarking/tuning.py | 23 +++++++++++-------- benchmarking/tuning_config.py | 21 +++++++++-------- benchmarking/util.py | 2 +- experiments/commands.py | 18 +++++++-------- src/imitation/scripts/config/parallel.py | 6 ++--- tests/test_benchmarking.py | 4 ++-- tests/test_experiments.py | 16 ++++++------- 28 files changed, 49 insertions(+), 45 deletions(-) rename benchmarking/{example_airl_seals_ant_best_hp_eval.json => airl_seals_ant_best_hp_eval.json} (100%) rename benchmarking/{example_airl_seals_half_cheetah_best_hp_eval.json => airl_seals_half_cheetah_best_hp_eval.json} (100%) rename benchmarking/{example_airl_seals_hopper_best_hp_eval.json => airl_seals_hopper_best_hp_eval.json} (100%) rename benchmarking/{example_airl_seals_swimmer_best_hp_eval.json => airl_seals_swimmer_best_hp_eval.json} (100%) rename benchmarking/{example_airl_seals_walker_best_hp_eval.json => airl_seals_walker_best_hp_eval.json} (100%) rename benchmarking/{example_bc_seals_ant_best_hp_eval.json => bc_seals_ant_best_hp_eval.json} (100%) rename benchmarking/{example_bc_seals_half_cheetah_best_hp_eval.json => bc_seals_half_cheetah_best_hp_eval.json} (100%) rename benchmarking/{example_bc_seals_hopper_best_hp_eval.json => bc_seals_hopper_best_hp_eval.json} (100%) rename benchmarking/{example_bc_seals_swimmer_best_hp_eval.json => bc_seals_swimmer_best_hp_eval.json} (100%) rename benchmarking/{example_bc_seals_walker_best_hp_eval.json => bc_seals_walker_best_hp_eval.json} (100%) rename benchmarking/{example_dagger_seals_ant_best_hp_eval.json => dagger_seals_ant_best_hp_eval.json} (100%) rename benchmarking/{example_dagger_seals_half_cheetah_best_hp_eval.json => 
dagger_seals_half_cheetah_best_hp_eval.json} (100%) rename benchmarking/{example_dagger_seals_hopper_best_hp_eval.json => dagger_seals_hopper_best_hp_eval.json} (100%) rename benchmarking/{example_dagger_seals_swimmer_best_hp_eval.json => dagger_seals_swimmer_best_hp_eval.json} (100%) rename benchmarking/{example_dagger_seals_walker_best_hp_eval.json => dagger_seals_walker_best_hp_eval.json} (100%) rename benchmarking/{example_gail_seals_ant_best_hp_eval.json => gail_seals_ant_best_hp_eval.json} (100%) rename benchmarking/{example_gail_seals_half_cheetah_best_hp_eval.json => gail_seals_half_cheetah_best_hp_eval.json} (100%) rename benchmarking/{example_gail_seals_hopper_best_hp_eval.json => gail_seals_hopper_best_hp_eval.json} (100%) rename benchmarking/{example_gail_seals_swimmer_best_hp_eval.json => gail_seals_swimmer_best_hp_eval.json} (100%) rename benchmarking/{example_gail_seals_walker_best_hp_eval.json => gail_seals_walker_best_hp_eval.json} (100%) diff --git a/benchmarking/README.md b/benchmarking/README.md index 892908ac8..3973c6181 100644 --- a/benchmarking/README.md +++ b/benchmarking/README.md @@ -30,10 +30,10 @@ are reported. To tune the hyperparameters of an algorithm using the default search space provided: ```bash -python tuning.py with example_{algo} 'parallel_run_config.base_named_configs=["{env}"]' +python tuning.py with {algo} 'parallel_run_config.base_named_configs=["{env}"]' ``` -In this command, `example_{algo}` provides the default search space and settings to be used for +In this command, `{algo}` provides the default search space and settings to be used for the specific algorithm, which is defined in the `tuning_config.py` script and `'parallel_run_config.base_named_configs=["{env}"]'` sets the environment to tune the algorithm in. 
See the documentation of `tuning.py` and `parallel.py` scripts for many other arguments that can be diff --git a/benchmarking/example_airl_seals_ant_best_hp_eval.json b/benchmarking/airl_seals_ant_best_hp_eval.json similarity index 100% rename from benchmarking/example_airl_seals_ant_best_hp_eval.json rename to benchmarking/airl_seals_ant_best_hp_eval.json diff --git a/benchmarking/example_airl_seals_half_cheetah_best_hp_eval.json b/benchmarking/airl_seals_half_cheetah_best_hp_eval.json similarity index 100% rename from benchmarking/example_airl_seals_half_cheetah_best_hp_eval.json rename to benchmarking/airl_seals_half_cheetah_best_hp_eval.json diff --git a/benchmarking/example_airl_seals_hopper_best_hp_eval.json b/benchmarking/airl_seals_hopper_best_hp_eval.json similarity index 100% rename from benchmarking/example_airl_seals_hopper_best_hp_eval.json rename to benchmarking/airl_seals_hopper_best_hp_eval.json diff --git a/benchmarking/example_airl_seals_swimmer_best_hp_eval.json b/benchmarking/airl_seals_swimmer_best_hp_eval.json similarity index 100% rename from benchmarking/example_airl_seals_swimmer_best_hp_eval.json rename to benchmarking/airl_seals_swimmer_best_hp_eval.json diff --git a/benchmarking/example_airl_seals_walker_best_hp_eval.json b/benchmarking/airl_seals_walker_best_hp_eval.json similarity index 100% rename from benchmarking/example_airl_seals_walker_best_hp_eval.json rename to benchmarking/airl_seals_walker_best_hp_eval.json diff --git a/benchmarking/example_bc_seals_ant_best_hp_eval.json b/benchmarking/bc_seals_ant_best_hp_eval.json similarity index 100% rename from benchmarking/example_bc_seals_ant_best_hp_eval.json rename to benchmarking/bc_seals_ant_best_hp_eval.json diff --git a/benchmarking/example_bc_seals_half_cheetah_best_hp_eval.json b/benchmarking/bc_seals_half_cheetah_best_hp_eval.json similarity index 100% rename from benchmarking/example_bc_seals_half_cheetah_best_hp_eval.json rename to 
benchmarking/bc_seals_half_cheetah_best_hp_eval.json diff --git a/benchmarking/example_bc_seals_hopper_best_hp_eval.json b/benchmarking/bc_seals_hopper_best_hp_eval.json similarity index 100% rename from benchmarking/example_bc_seals_hopper_best_hp_eval.json rename to benchmarking/bc_seals_hopper_best_hp_eval.json diff --git a/benchmarking/example_bc_seals_swimmer_best_hp_eval.json b/benchmarking/bc_seals_swimmer_best_hp_eval.json similarity index 100% rename from benchmarking/example_bc_seals_swimmer_best_hp_eval.json rename to benchmarking/bc_seals_swimmer_best_hp_eval.json diff --git a/benchmarking/example_bc_seals_walker_best_hp_eval.json b/benchmarking/bc_seals_walker_best_hp_eval.json similarity index 100% rename from benchmarking/example_bc_seals_walker_best_hp_eval.json rename to benchmarking/bc_seals_walker_best_hp_eval.json diff --git a/benchmarking/example_dagger_seals_ant_best_hp_eval.json b/benchmarking/dagger_seals_ant_best_hp_eval.json similarity index 100% rename from benchmarking/example_dagger_seals_ant_best_hp_eval.json rename to benchmarking/dagger_seals_ant_best_hp_eval.json diff --git a/benchmarking/example_dagger_seals_half_cheetah_best_hp_eval.json b/benchmarking/dagger_seals_half_cheetah_best_hp_eval.json similarity index 100% rename from benchmarking/example_dagger_seals_half_cheetah_best_hp_eval.json rename to benchmarking/dagger_seals_half_cheetah_best_hp_eval.json diff --git a/benchmarking/example_dagger_seals_hopper_best_hp_eval.json b/benchmarking/dagger_seals_hopper_best_hp_eval.json similarity index 100% rename from benchmarking/example_dagger_seals_hopper_best_hp_eval.json rename to benchmarking/dagger_seals_hopper_best_hp_eval.json diff --git a/benchmarking/example_dagger_seals_swimmer_best_hp_eval.json b/benchmarking/dagger_seals_swimmer_best_hp_eval.json similarity index 100% rename from benchmarking/example_dagger_seals_swimmer_best_hp_eval.json rename to benchmarking/dagger_seals_swimmer_best_hp_eval.json diff --git 
a/benchmarking/example_dagger_seals_walker_best_hp_eval.json b/benchmarking/dagger_seals_walker_best_hp_eval.json similarity index 100% rename from benchmarking/example_dagger_seals_walker_best_hp_eval.json rename to benchmarking/dagger_seals_walker_best_hp_eval.json diff --git a/benchmarking/example_gail_seals_ant_best_hp_eval.json b/benchmarking/gail_seals_ant_best_hp_eval.json similarity index 100% rename from benchmarking/example_gail_seals_ant_best_hp_eval.json rename to benchmarking/gail_seals_ant_best_hp_eval.json diff --git a/benchmarking/example_gail_seals_half_cheetah_best_hp_eval.json b/benchmarking/gail_seals_half_cheetah_best_hp_eval.json similarity index 100% rename from benchmarking/example_gail_seals_half_cheetah_best_hp_eval.json rename to benchmarking/gail_seals_half_cheetah_best_hp_eval.json diff --git a/benchmarking/example_gail_seals_hopper_best_hp_eval.json b/benchmarking/gail_seals_hopper_best_hp_eval.json similarity index 100% rename from benchmarking/example_gail_seals_hopper_best_hp_eval.json rename to benchmarking/gail_seals_hopper_best_hp_eval.json diff --git a/benchmarking/example_gail_seals_swimmer_best_hp_eval.json b/benchmarking/gail_seals_swimmer_best_hp_eval.json similarity index 100% rename from benchmarking/example_gail_seals_swimmer_best_hp_eval.json rename to benchmarking/gail_seals_swimmer_best_hp_eval.json diff --git a/benchmarking/example_gail_seals_walker_best_hp_eval.json b/benchmarking/gail_seals_walker_best_hp_eval.json similarity index 100% rename from benchmarking/example_gail_seals_walker_best_hp_eval.json rename to benchmarking/gail_seals_walker_best_hp_eval.json diff --git a/benchmarking/tuning.py b/benchmarking/tuning.py index 0c18b1256..324032088 100644 --- a/benchmarking/tuning.py +++ b/benchmarking/tuning.py @@ -30,7 +30,7 @@ def tune( Set to 0 to disable evaluation. Raises: - ValueError: If no trials are returned by. + ValueError: If no trials are returned by the parallel run of tuning. 
""" run = parallel_ex.run(config_updates=parallel_run_config) experiment_analysis = run.result @@ -54,9 +54,10 @@ def tune( # Uses the default values (cpu=1) if it is not provided if "cpu" in parallel_run_config["resources_per_trial"]: resources_per_trial_eval["cpu"] *= eval_best_trial_resource_multiplier - evaluate_best_trial( + evaluate_trial( best_trial, num_eval_seeds, + parallel_run_config["run_name"] + "_best_hp_eval", parallel_run_config, resources_per_trial_eval, return_key, @@ -107,19 +108,21 @@ def find_best_trial( return best_trial -def evaluate_best_trial( - best_trial: ray.tune.experiment.Trial, +def evaluate_trial( + trial: ray.tune.experiment.Trial, num_eval_seeds: int, + run_name: str, parallel_run_config: Dict[str, Any], resources_per_trial: Dict[str, int], return_key: str, print_return: bool = False, ): - """Evaluate the best trial of a parallel run on a separate set of seeds. + """Evaluate a given trial of a parallel run on a separate set of seeds. Args: - best_trial: The trial with the best mean return across all seeds. + trial: The trial to evaluate. num_eval_seeds: Number of distinct seeds to evaluate the best trial on. + run_name: The name of the evaluation run. parallel_run_config: Dictionary of arguments passed to the parallel script to get best_trial. resources_per_trial: Resources to be used for each evaluation trial. @@ -129,15 +132,15 @@ def evaluate_best_trial( Returns: eval_run: The result of the evaluation run. 
""" - best_config = best_trial.config - best_config["config_updates"].update( + config = trial.config + config["config_updates"].update( seed=ray.tune.grid_search(list(range(100, 100 + num_eval_seeds))), ) eval_config_updates = parallel_run_config.copy() eval_config_updates.update( - run_name=parallel_run_config["run_name"] + "_best_hp_eval", + run_name=run_name, num_samples=1, - search_space=best_config, + search_space=config, resources_per_trial=resources_per_trial, search_alg=None, repeat=1, diff --git a/benchmarking/tuning_config.py b/benchmarking/tuning_config.py index 187963d02..239537406 100644 --- a/benchmarking/tuning_config.py +++ b/benchmarking/tuning_config.py @@ -4,14 +4,14 @@ import sacred from torch import nn -from imitation.algorithms import dagger +from imitation.algorithms import dagger as dagger_alg from imitation.scripts.parallel import parallel_ex tuning_ex = sacred.Experiment("tuning", ingredients=[parallel_ex]) @tuning_ex.named_config -def example_rl(): +def rl(): parallel_run_config = dict( sacred_ex_name="train_rl", run_name="rl_tuning", @@ -37,7 +37,7 @@ def example_rl(): @tuning_ex.named_config -def example_bc(): +def bc(): parallel_run_config = dict( sacred_ex_name="train_imitation", run_name="bc_tuning", @@ -71,7 +71,7 @@ def example_bc(): @tuning_ex.named_config -def example_dagger(): +def dagger(): parallel_run_config = dict( sacred_ex_name="train_imitation", run_name="dagger_tuning", @@ -95,8 +95,11 @@ def example_dagger(): ), "dagger": dict( beta_schedule=tune.choice( - [dagger.LinearBetaSchedule(i) for i in [1, 5, 15]] - + [dagger.ExponentialBetaSchedule(i) for i in [0.3, 0.5, 0.7]], + [dagger_alg.LinearBetaSchedule(i) for i in [1, 5, 15]] + + [ + dagger_alg.ExponentialBetaSchedule(i) + for i in [0.3, 0.5, 0.7] + ], ), rollout_round_min_episodes=tune.choice([3, 5, 10]), ), @@ -111,7 +114,7 @@ def example_dagger(): @tuning_ex.named_config -def example_gail(): +def gail(): parallel_run_config = dict( 
sacred_ex_name="train_adversarial", run_name="gail_tuning_hc", @@ -146,7 +149,7 @@ def example_gail(): @tuning_ex.named_config -def example_airl(): +def airl(): parallel_run_config = dict( sacred_ex_name="train_adversarial", run_name="airl_tuning", @@ -181,7 +184,7 @@ def example_airl(): @tuning_ex.named_config -def example_pc(): +def pc(): parallel_run_config = dict( sacred_ex_name="train_preference_comparisons", run_name="pc_tuning", diff --git a/benchmarking/util.py b/benchmarking/util.py index 408f0d812..88416344d 100644 --- a/benchmarking/util.py +++ b/benchmarking/util.py @@ -79,7 +79,7 @@ def clean_config_file(file: pathlib.Path, write_path: pathlib.Path, /) -> None: remove_empty_dicts(config) # files are of the format - # /path/to/file/example___best_hp_eval//sacred/1/config.json + # /path/to/file/__best_hp_eval//sacred/1/config.json # we want to write to //_.json with open(write_path / f"{file.parents[3].name}.json", "w") as f: json.dump(config, f, indent=4) diff --git a/experiments/commands.py b/experiments/commands.py index 2ac737e06..9021d3738 100644 --- a/experiments/commands.py +++ b/experiments/commands.py @@ -22,13 +22,13 @@ python -m imitation.scripts.train_adversarial airl \ --capture=sys --name=run0 \ --file_storage=output/sacred/$USER-cmd-run0-airl-0-a3531726 \ - with ../benchmarking/example_airl_seals_walker_best_hp_eval.json \ + with ../benchmarking/airl_seals_walker_best_hp_eval.json \ seed=0 logging.log_root=output python -m imitation.scripts.train_adversarial gail \ --capture=sys --name=run0 \ --file_storage=output/sacred/$USER-cmd-run0-gail-0-a1ec171b \ - with ../benchmarking/example_gail_seals_walker_best_hp_eval.json \ + with ../benchmarking/gail_seals_walker_best_hp_eval.json \ seed=0 logging.log_root=output We can execute commands in parallel by piping them to GNU parallel: @@ -42,7 +42,7 @@ python commands.py \ --name=run0 \ - --cfg_pattern=../benchmarking/example_bc_seals_half_cheetah_best_hp_eval.json \ + 
--cfg_pattern=../benchmarking/bc_seals_half_cheetah_best_hp_eval.json \ --output_dir=/data/output \ --remote @@ -52,7 +52,7 @@ --command "python -m imitation.scripts.train_imitation bc \ --capture=sys --name=run0 \ --file_storage=/data/output/sacred/$USER-cmd-run0-bc-0-72cb1df3 \ - with /data/imitation/benchmarking/example_bc_seals_half_cheetah_best_hp_eval.json \ + with /data/imitation/benchmarking/bc_seals_half_cheetah_best_hp_eval.json \ seed=0 logging.log_root=/data/output" \ --container hacobe/devbox:imitation \ --login --force-pull --never-restart --gpu 0 --shared-host-dir-mount /data @@ -177,19 +177,19 @@ def parse() -> argparse.Namespace: parser.add_argument( "--cfg_pattern", type=str, - default="example_bc_seals_half_cheetah_best_hp_eval.json", + default="bc_seals_half_cheetah_best_hp_eval.json", help="""Generate a command for every file that matches this glob pattern. \ Each matching file should be a config file that has its algorithm name \ (bc, dagger, airl or gail) bookended by underscores in the filename. \ If the --remote flag is enabled, then generate a command for every file in the \ --remote_cfg_dir directory that has the same filename as a file that matches this \ glob pattern. E.g., suppose the current, local working directory is 'foo' and \ -the subdirectory 'foo/bar' contains the config files 'example_bc_best.json' and \ -'example_dagger_best.json'. If the pattern 'bar/*.json' is supplied, then globbing \ -will return ['bar/example_bc_best.json', 'bar/example_dagger_best.json']. \ +the subdirectory 'foo/bar' contains the config files 'bc_best.json' and \ +'dagger_best.json'. If the pattern 'bar/*.json' is supplied, then globbing \ +will return ['bar/bc_best.json', 'bar/dagger_best.json']. 
\ If the --remote flag is enabled, 'bar' will be replaced with `remote_cfg_dir` and \ commands will be created for the following configs: \ -[`remote_cfg_dir`/example_bc_best.json, `remote_cfg_dir`/example_dagger_best.json] \ +[`remote_cfg_dir`/bc_best.json, `remote_cfg_dir`/dagger_best.json] \ Why not just supply the pattern '`remote_cfg_dir`/*.json' directly? \ Because the `remote_cfg_dir` directory may not exist on the local machine.""", ) diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py index e81a617db..a591f3d9a 100644 --- a/src/imitation/scripts/config/parallel.py +++ b/src/imitation/scripts/config/parallel.py @@ -7,10 +7,8 @@ Adding custom named configs is necessary because the CLI interface can't add search spaces to the config like `"seed": tune.choice([0, 1, 2, 3])`. -For tuning hyperparameters of an algorithm on a given environment, override -the `base_named_configs` argument with the named config of the environment. -Ex: python -m imitation.scripts.parallel with example_gail \ - 'base_named_configs=["logging.wandb_logging", "seals_half_cheetah"]' +For tuning hyperparameters of an algorithm on a given environment, +check out the benchmarking/tuning.py script. 
""" import numpy as np diff --git a/tests/test_benchmarking.py b/tests/test_benchmarking.py index 4a8f6ea6f..18d4f12cf 100644 --- a/tests/test_benchmarking.py +++ b/tests/test_benchmarking.py @@ -37,7 +37,7 @@ def test_benchmarks_print_config_succeeds(algorithm: str, environment: str): config_name = f"{algorithm}_{environment}" config_file = str( - BENCHMARKING_DIR / f"example_{algorithm}_{environment}_best_hp_eval.json", + BENCHMARKING_DIR / f"{algorithm}_{environment}_best_hp_eval.json", ) # WHEN @@ -66,7 +66,7 @@ def test_tuning_print_config_succeeds(algorithm: str): tuning_path, "print_config", "with", - f"example_{algorithm}", + f"{algorithm}", env, ], ) diff --git a/tests/test_experiments.py b/tests/test_experiments.py index 0f6d314fe..0d431d0e9 100644 --- a/tests/test_experiments.py +++ b/tests/test_experiments.py @@ -245,13 +245,13 @@ def test_commands_hofvarpnir_config_with_special_characters_in_flags(tmpdir): def test_commands_bc_config(): if os.name == "nt": # pragma: no cover pytest.skip("commands.py not ported to Windows.") - cfg_pattern = _get_benchmarking_path("example_bc_seals_ant_best_hp_eval.json") + cfg_pattern = _get_benchmarking_path("bc_seals_ant_best_hp_eval.json") commands = _run_commands_from_flags(cfg_pattern=cfg_pattern) assert len(commands) == 1 expected = """python -m imitation.scripts.train_imitation bc \ --capture=sys --name=run0 --file_storage=output/sacred/\ $USER-cmd-run0-bc-0-138a1475 \ -with benchmarking/example_bc_seals_ant_best_hp_eval.json \ +with benchmarking/bc_seals_ant_best_hp_eval.json \ seed=0 logging.log_root=output""" assert commands[0] == expected @@ -259,13 +259,13 @@ def test_commands_bc_config(): def test_commands_dagger_config(): if os.name == "nt": # pragma: no cover pytest.skip("commands.py not ported to Windows.") - cfg_pattern = _get_benchmarking_path("example_dagger_seals_ant_best_hp_eval.json") + cfg_pattern = _get_benchmarking_path("dagger_seals_ant_best_hp_eval.json") commands = 
_run_commands_from_flags(cfg_pattern=cfg_pattern) assert len(commands) == 1 expected = """python -m imitation.scripts.train_imitation dagger \ --capture=sys --name=run0 --file_storage=output/sacred/\ $USER-cmd-run0-dagger-0-6a49161a \ -with benchmarking/example_dagger_seals_ant_best_hp_eval.json \ +with benchmarking/dagger_seals_ant_best_hp_eval.json \ seed=0 logging.log_root=output""" assert commands[0] == expected @@ -273,13 +273,13 @@ def test_commands_dagger_config(): def test_commands_gail_config(): if os.name == "nt": # pragma: no cover pytest.skip("commands.py not ported to Windows.") - cfg_pattern = _get_benchmarking_path("example_gail_seals_ant_best_hp_eval.json") + cfg_pattern = _get_benchmarking_path("gail_seals_ant_best_hp_eval.json") commands = _run_commands_from_flags(cfg_pattern=cfg_pattern) assert len(commands) == 1 expected = """python -m imitation.scripts.train_adversarial gail \ --capture=sys --name=run0 --file_storage=output/sacred/\ $USER-cmd-run0-gail-0-3ec8154d \ -with benchmarking/example_gail_seals_ant_best_hp_eval.json \ +with benchmarking/gail_seals_ant_best_hp_eval.json \ seed=0 logging.log_root=output""" assert commands[0] == expected @@ -287,13 +287,13 @@ def test_commands_gail_config(): def test_commands_airl_config(): if os.name == "nt": # pragma: no cover pytest.skip("commands.py not ported to Windows.") - cfg_pattern = _get_benchmarking_path("example_airl_seals_ant_best_hp_eval.json") + cfg_pattern = _get_benchmarking_path("airl_seals_ant_best_hp_eval.json") commands = _run_commands_from_flags(cfg_pattern=cfg_pattern) assert len(commands) == 1 expected = """python -m imitation.scripts.train_adversarial airl \ --capture=sys --name=run0 \ --file_storage=output/sacred/$USER-cmd-run0-airl-0-400e1558 \ -with benchmarking/example_airl_seals_ant_best_hp_eval.json \ +with benchmarking/airl_seals_ant_best_hp_eval.json \ seed=0 logging.log_root=output""" assert commands[0] == expected From 4ff006d1f2162c8f5085c1f824a19090846dd23c Mon Sep 17 
00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Tue, 18 Jul 2023 12:06:30 +0530 Subject: [PATCH 27/47] Fix file name test errors --- experiments/commands.py | 2 +- tests/test_experiments.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/experiments/commands.py b/experiments/commands.py index 9021d3738..738a55011 100644 --- a/experiments/commands.py +++ b/experiments/commands.py @@ -85,7 +85,7 @@ def _get_algo_name(cfg_file: str) -> str: """Get the algorithm name from the given config filename.""" algo_names = set() for key in _ALGO_NAME_TO_SCRIPT_NAME: - if cfg_file.find("_" + key + "_") != -1: + if cfg_file.find(key + "_") != -1: algo_names.add(key) if len(algo_names) == 0: diff --git a/tests/test_experiments.py b/tests/test_experiments.py index 0d431d0e9..b2417a9f9 100644 --- a/tests/test_experiments.py +++ b/tests/test_experiments.py @@ -250,7 +250,7 @@ def test_commands_bc_config(): assert len(commands) == 1 expected = """python -m imitation.scripts.train_imitation bc \ --capture=sys --name=run0 --file_storage=output/sacred/\ -$USER-cmd-run0-bc-0-138a1475 \ +$USER-cmd-run0-bc-0-78e5112a \ with benchmarking/bc_seals_ant_best_hp_eval.json \ seed=0 logging.log_root=output""" assert commands[0] == expected @@ -264,7 +264,7 @@ def test_commands_dagger_config(): assert len(commands) == 1 expected = """python -m imitation.scripts.train_imitation dagger \ --capture=sys --name=run0 --file_storage=output/sacred/\ -$USER-cmd-run0-dagger-0-6a49161a \ +$USER-cmd-run0-dagger-0-c27812cf \ with benchmarking/dagger_seals_ant_best_hp_eval.json \ seed=0 logging.log_root=output""" assert commands[0] == expected @@ -278,7 +278,7 @@ def test_commands_gail_config(): assert len(commands) == 1 expected = """python -m imitation.scripts.train_adversarial gail \ --capture=sys --name=run0 --file_storage=output/sacred/\ -$USER-cmd-run0-gail-0-3ec8154d \ +$USER-cmd-run0-gail-0-9d8d1202 \ with benchmarking/gail_seals_ant_best_hp_eval.json \ seed=0 
logging.log_root=output""" assert commands[0] == expected @@ -292,7 +292,7 @@ def test_commands_airl_config(): assert len(commands) == 1 expected = """python -m imitation.scripts.train_adversarial airl \ --capture=sys --name=run0 \ ---file_storage=output/sacred/$USER-cmd-run0-airl-0-400e1558 \ +--file_storage=output/sacred/$USER-cmd-run0-airl-0-9ed3120d \ with benchmarking/airl_seals_ant_best_hp_eval.json \ seed=0 logging.log_root=output""" assert commands[0] == expected From 6933afacb22c555fcd70a833041bd716d2d78807 Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Wed, 19 Jul 2023 14:41:39 +0530 Subject: [PATCH 28/47] Add tune_run_kwargs in parallel script --- src/imitation/scripts/config/parallel.py | 3 -- src/imitation/scripts/parallel.py | 39 +++++++++++------------- 2 files changed, 17 insertions(+), 25 deletions(-) diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py index a591f3d9a..4773b713e 100644 --- a/src/imitation/scripts/config/parallel.py +++ b/src/imitation/scripts/config/parallel.py @@ -34,13 +34,10 @@ def config(): "config_updates": {}, } # `config` argument to `ray.tune.run(trainable, config)` - local_dir = None # `local_dir` arg for `ray.tune.run` - upload_dir = None # `upload_dir` arg for `ray.tune.run` num_samples = 1 # Number of samples per grid search configuration repeat = 1 # Number of times to repeat a sampled configuration search_alg = "optuna" # search algorithm to use experiment_checkpoint_path = "" # Path to checkpoint of experiment - syncer = None # Sacred syncer to use # Debug named configs diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index 93aa932b9..7bf3db16f 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -26,12 +26,9 @@ def parallel( base_config_updates: Mapping[str, Any], resources_per_trial: Dict[str, Any], init_kwargs: Mapping[str, Any], - local_dir: Optional[str], - upload_dir: 
Optional[str], repeat: int, - search_alg: Optional[str], experiment_checkpoint_path: str, - syncer, + tune_run_kwargs: Dict[str, Any], ) -> ray.tune.ExperimentAnalysis: """Parallelize multiple runs of another Sacred Experiment using Ray Tune. @@ -70,17 +67,13 @@ def parallel( generated Ray directory name, unlike config updates from `search_space`. resources_per_trial: Argument to `ray.tune.run()`. init_kwargs: Arguments to pass to `ray.init`. - local_dir: `local_dir` argument to `ray.tune.run()`. - upload_dir: `upload_dir` argument to `ray.tune.run()`. - search_alg: can be either "optuna" or None. Setting `None` allows for - adding grid_search to the `search_space` hyperparameters but doesn't allow - for trials to be repeated. repeat: Number of runs to repeat each trial for. - Not used if `search_alg` is None. + If `repeat` > 1, then optuna is used as the default search algorithm + unless specified otherwise in `tune_run_kwargs`. experiment_checkpoint_path: Path containing the checkpoints of a previous experiment ran using this script. Useful for evaluating the best trial of the experiment. - syncer: `syncer` argument to `ray.tune.syncer.SyncConfig`. + tune_run_kwargs: Other arguments to pass to `ray.tune.run()`. Raises: TypeError: Named configs not string sequences or config updates not mappings. 
@@ -118,11 +111,18 @@ def parallel( ) ray.init(**init_kwargs) - if search_alg == "optuna": - algo = search.Repeater(optuna.OptunaSearch(), repeat=repeat) - else: - assert repeat == 1 # repeat should not be used if search_alg is None - algo = None + if repeat > 1: + if "search_alg" not in tune_run_kwargs: + tune_run_kwargs["search_alg"] = optuna.OptunaSearch() + try: + algo = tune_run_kwargs["search_alg"] + algo = search.Repeater(algo, repeat) + tune_run_kwargs["search_alg"] = algo + except AttributeError: + raise ValueError( + "repeat > 1 but search_alg is not an instance of " + "ray.tune.search.SearchAlgorithm", + ) if sacred_ex_name == "train_rl": return_key = "monitor_return_mean" @@ -145,15 +145,10 @@ def parallel( config=search_space, num_samples=num_samples * repeat, name=run_name, - local_dir=local_dir, resources_per_trial=resources_per_trial, - sync_config=ray.tune.syncer.SyncConfig( - upload_dir=upload_dir, - syncer=syncer, - ), - search_alg=algo, metric=return_key, mode="max", + **tune_run_kwargs, ) return result finally: From 77f9d9b74ddcb42e9181f9f493ca2f144b6a443f Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Wed, 19 Jul 2023 16:10:15 +0530 Subject: [PATCH 29/47] Fix test errors --- src/imitation/scripts/config/parallel.py | 1 + src/imitation/scripts/parallel.py | 11 ++++++----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py index 4773b713e..bdc591422 100644 --- a/src/imitation/scripts/config/parallel.py +++ b/src/imitation/scripts/config/parallel.py @@ -38,6 +38,7 @@ def config(): repeat = 1 # Number of times to repeat a sampled configuration search_alg = "optuna" # search algorithm to use experiment_checkpoint_path = "" # Path to checkpoint of experiment + tune_run_kwargs = {} # Additional kwargs to pass to `tune.run` # Debug named configs diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index 
7bf3db16f..65a72eae3 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -111,13 +111,14 @@ def parallel( ) ray.init(**init_kwargs) + updated_tune_run_kwargs = copy.deepcopy(tune_run_kwargs) if repeat > 1: - if "search_alg" not in tune_run_kwargs: - tune_run_kwargs["search_alg"] = optuna.OptunaSearch() + if "search_alg" not in updated_tune_run_kwargs: + updated_tune_run_kwargs["search_alg"] = optuna.OptunaSearch() try: - algo = tune_run_kwargs["search_alg"] + algo = updated_tune_run_kwargs["search_alg"] algo = search.Repeater(algo, repeat) - tune_run_kwargs["search_alg"] = algo + updated_tune_run_kwargs["search_alg"] = algo except AttributeError: raise ValueError( "repeat > 1 but search_alg is not an instance of " @@ -148,7 +149,7 @@ def parallel( resources_per_trial=resources_per_trial, metric=return_key, mode="max", - **tune_run_kwargs, + **updated_tune_run_kwargs, ) return result finally: From 54eb8a6f44ea599236b6165fa5de9079df7ca49a Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Wed, 19 Jul 2023 16:31:49 +0530 Subject: [PATCH 30/47] Fix test --- tests/scripts/test_scripts.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/scripts/test_scripts.py b/tests/scripts/test_scripts.py index e17765471..146048c42 100644 --- a/tests/scripts/test_scripts.py +++ b/tests/scripts/test_scripts.py @@ -969,7 +969,10 @@ def test_analyze_gather_tb(tmpdir: str): if os.name == "nt": # pragma: no cover pytest.skip("gather_tb uses symlinks: not supported by Windows") num_runs = 2 - config_updates: Dict[str, Any] = dict(local_dir=tmpdir, run_name="test") + config_updates: Dict[str, Any] = dict( + tune_run_kwargs=dict(local_dir=tmpdir), + run_name="test", + ) config_updates.update(PARALLEL_CONFIG_LOW_RESOURCE) config_updates.update(num_samples=num_runs) parallel_run = parallel.parallel_ex.run( From d50238f1b900b05296d081954624cac9e2bcf6ab Mon Sep 17 00:00:00 2001 From: taufeeque9 
<9taufeeque9@gmail.com> Date: Wed, 19 Jul 2023 17:02:37 +0530 Subject: [PATCH 31/47] Fix lint --- src/imitation/scripts/parallel.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index 65a72eae3..a7a08064b 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -4,7 +4,7 @@ import copy import glob import pathlib -from typing import Any, Callable, Dict, Mapping, Optional, Sequence +from typing import Any, Callable, Dict, Mapping, Sequence import ray import ray.tune @@ -77,6 +77,8 @@ def parallel( Raises: TypeError: Named configs not string sequences or config updates not mappings. + ValueError: `repeat` > 1 but `search_alg` is not an instance of + `ray.tune.search.SearchAlgorithm`. Returns: The result of running the parallel experiment with `ray.tune.run()`. From 3fe22d4e6904c60c581a69004788b08b0184c8ed Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Wed, 19 Jul 2023 21:37:18 +0530 Subject: [PATCH 32/47] Updates from review --- benchmarking/tuning.py | 21 +++++++++++++++------ src/imitation/scripts/config/parallel.py | 1 - src/imitation/scripts/parallel.py | 2 +- tests/scripts/test_scripts.py | 1 - 4 files changed, 16 insertions(+), 9 deletions(-) diff --git a/benchmarking/tuning.py b/benchmarking/tuning.py index 324032088..409d0b5af 100644 --- a/benchmarking/tuning.py +++ b/benchmarking/tuning.py @@ -7,6 +7,7 @@ import numpy as np import ray from pandas.api import types as pd_types +from ray.tune.search import optuna from sacred.observers import FileStorageObserver from tuning_config import parallel_ex, tuning_ex @@ -32,7 +33,15 @@ def tune( Raises: ValueError: If no trials are returned by the parallel run of tuning. 
""" - run = parallel_ex.run(config_updates=parallel_run_config) + search_alg = optuna.OptunaSearch() + updated_parallel_run_config = copy.deepcopy(parallel_run_config) + if "tune_run_kwargs" not in updated_parallel_run_config: + tune_run_kwargs = {} + else: + tune_run_kwargs = updated_parallel_run_config["tune_run_kwargs"] + tune_run_kwargs.update(search_alg=search_alg) + updated_parallel_run_config.update(tune_run_kwargs=tune_run_kwargs) + run = parallel_ex.run(config_updates=updated_parallel_run_config) experiment_analysis = run.result if not experiment_analysis.trials: raise ValueError( @@ -42,23 +51,23 @@ def tune( ) return_key = "imit_stats/monitor_return_mean" - if parallel_run_config["sacred_ex_name"] == "train_rl": + if updated_parallel_run_config["sacred_ex_name"] == "train_rl": return_key = "monitor_return_mean" best_trial = find_best_trial(experiment_analysis, return_key, print_return=True) if num_eval_seeds > 0: # evaluate the best trial resources_per_trial_eval = copy.deepcopy( - parallel_run_config["resources_per_trial"], + updated_parallel_run_config["resources_per_trial"], ) # update cpus per trial only if it is provided in `resources_per_trial` # Uses the default values (cpu=1) if it is not provided - if "cpu" in parallel_run_config["resources_per_trial"]: + if "cpu" in updated_parallel_run_config["resources_per_trial"]: resources_per_trial_eval["cpu"] *= eval_best_trial_resource_multiplier evaluate_trial( best_trial, num_eval_seeds, - parallel_run_config["run_name"] + "_best_hp_eval", - parallel_run_config, + updated_parallel_run_config["run_name"] + "_best_hp_eval", + updated_parallel_run_config, resources_per_trial_eval, return_key, ) diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py index bdc591422..c9c898feb 100644 --- a/src/imitation/scripts/config/parallel.py +++ b/src/imitation/scripts/config/parallel.py @@ -36,7 +36,6 @@ def config(): num_samples = 1 # Number of samples per grid search 
configuration repeat = 1 # Number of times to repeat a sampled configuration - search_alg = "optuna" # search algorithm to use experiment_checkpoint_path = "" # Path to checkpoint of experiment tune_run_kwargs = {} # Additional kwargs to pass to `tune.run` diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index a7a08064b..57503d6e0 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -34,7 +34,7 @@ def parallel( A Sacred FileObserver is attached to the inner experiment and writes Sacred logs to "{RAY_LOCAL_DIR}/sacred/". These files are automatically copied over - to `upload_dir` if that argument is provided. + to `upload_dir` if that argument is provided in `tune_run_kwargs`. Args: sacred_ex_name: The Sacred experiment to tune. Either "train_rl" or diff --git a/tests/scripts/test_scripts.py b/tests/scripts/test_scripts.py index 146048c42..7ff241323 100644 --- a/tests/scripts/test_scripts.py +++ b/tests/scripts/test_scripts.py @@ -802,7 +802,6 @@ def test_train_rl_cnn_policy(tmpdir: str, rng): # Need absolute path because raylet runs in different working directory. "demonstrations.path": CARTPOLE_TEST_ROLLOUT_PATH.absolute(), }, - search_alg=None, # Use default search algorithm of ray. search_space={ "command_name": "airl", "config_updates": {"total_timesteps": tune.choice([5, 10])}, From c50aa20ddfa9f7ce5987a3fd08083d22757925a7 Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Thu, 20 Jul 2023 16:19:04 +0530 Subject: [PATCH 33/47] Simplify few lines of code --- benchmarking/tuning.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/benchmarking/tuning.py b/benchmarking/tuning.py index 409d0b5af..9c3f52498 100644 --- a/benchmarking/tuning.py +++ b/benchmarking/tuning.py @@ -33,14 +33,12 @@ def tune( Raises: ValueError: If no trials are returned by the parallel run of tuning. 
""" - search_alg = optuna.OptunaSearch() updated_parallel_run_config = copy.deepcopy(parallel_run_config) - if "tune_run_kwargs" not in updated_parallel_run_config: - tune_run_kwargs = {} + search_alg = optuna.OptunaSearch() + if "tune_run_kwargs" in updated_parallel_run_config: + updated_parallel_run_config["tune_run_kwargs"]["search_alg"] = search_alg else: - tune_run_kwargs = updated_parallel_run_config["tune_run_kwargs"] - tune_run_kwargs.update(search_alg=search_alg) - updated_parallel_run_config.update(tune_run_kwargs=tune_run_kwargs) + updated_parallel_run_config["tune_run_kwargs"] = dict(search_alg=search_alg) run = parallel_ex.run(config_updates=updated_parallel_run_config) experiment_analysis = run.result if not experiment_analysis.trials: From 000af616fb159c165f4806df11d865ee2a6b3663 Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Fri, 4 Aug 2023 21:54:48 +0530 Subject: [PATCH 34/47] Updates from review --- benchmarking/README.md | 3 ++- src/imitation/scripts/analyze.py | 3 --- .../scripts/config/train_adversarial.py | 4 ++++ .../config/train_preference_comparisons.py | 4 ++++ src/imitation/scripts/config/train_rl.py | 5 +++++ src/imitation/scripts/parallel.py | 16 +++++++--------- tests/scripts/test_scripts.py | 3 +++ 7 files changed, 25 insertions(+), 13 deletions(-) diff --git a/benchmarking/README.md b/benchmarking/README.md index 3973c6181..ba89da69d 100644 --- a/benchmarking/README.md +++ b/benchmarking/README.md @@ -15,7 +15,8 @@ python -m imitation.scripts. with benchmarking/.json') +from imitation.scripts. 
import +.run(command_name="", named_configs=["benchmarking/.json"]) ``` # Tuning Hyperparameters diff --git a/src/imitation/scripts/analyze.py b/src/imitation/scripts/analyze.py index 8977fed47..96b34bd6e 100644 --- a/src/imitation/scripts/analyze.py +++ b/src/imitation/scripts/analyze.py @@ -166,9 +166,6 @@ def _get_algo_name(sd: sacred_util.SacredDicts) -> str: def _return_summaries(sd: sacred_util.SacredDicts) -> dict: imit_stats = get(sd.run, "result.imit_stats") - if imit_stats is None: - # stored in rollout key for preference comparison - imit_stats = get(sd.run, "result.rollout") expert_stats = get(sd.run, "result.expert_stats") expert_return_summary = None diff --git a/src/imitation/scripts/config/train_adversarial.py b/src/imitation/scripts/config/train_adversarial.py index ef675eab6..acc842095 100644 --- a/src/imitation/scripts/config/train_adversarial.py +++ b/src/imitation/scripts/config/train_adversarial.py @@ -8,6 +8,10 @@ from imitation.scripts.ingredients import logging as logging_ingredient from imitation.scripts.ingredients import policy_evaluation, reward, rl +# Note: All the hyperparameter configs in the file are of the tuned +# hyperparameters of the RL algorithm of the respective environment. +# Taken from imitation/scripts/config/train_rl.py + train_adversarial_ex = sacred.Experiment( "train_adversarial", ingredients=[ diff --git a/src/imitation/scripts/config/train_preference_comparisons.py b/src/imitation/scripts/config/train_preference_comparisons.py index 4fe9c793e..4d8531732 100644 --- a/src/imitation/scripts/config/train_preference_comparisons.py +++ b/src/imitation/scripts/config/train_preference_comparisons.py @@ -8,6 +8,10 @@ from imitation.scripts.ingredients import logging as logging_ingredient from imitation.scripts.ingredients import policy_evaluation, reward, rl +# Note: All the hyperparameter configs in the file are of the tuned +# hyperparameters of the RL algorithm of the respective environment. 
+# Taken from imitation/scripts/config/train_rl.py + train_preference_comparisons_ex = sacred.Experiment( "train_preference_comparisons", ingredients=[ diff --git a/src/imitation/scripts/config/train_rl.py b/src/imitation/scripts/config/train_rl.py index a5475540d..e4ab71da1 100644 --- a/src/imitation/scripts/config/train_rl.py +++ b/src/imitation/scripts/config/train_rl.py @@ -8,6 +8,11 @@ from imitation.scripts.ingredients import logging as logging_ingredient from imitation.scripts.ingredients import policy_evaluation, rl +# Note: All the hyperparameter configs in the file are tuned +# for the PPO algorithm on the respective environment using the +# RL Baselines Zoo library: +# https://github.com/HumanCompatibleAI/rl-baselines3-zoo/ + train_rl_ex = sacred.Experiment( "train_rl", ingredients=[ diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index 57503d6e0..9f5478a6e 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -24,7 +24,7 @@ def parallel( search_space: Mapping[str, Any], base_named_configs: Sequence[str], base_config_updates: Mapping[str, Any], - resources_per_trial: Dict[str, Any], + resources_per_trial: Mapping[str, Any], init_kwargs: Mapping[str, Any], repeat: int, experiment_checkpoint_path: str, @@ -115,17 +115,15 @@ def parallel( ray.init(**init_kwargs) updated_tune_run_kwargs = copy.deepcopy(tune_run_kwargs) if repeat > 1: - if "search_alg" not in updated_tune_run_kwargs: - updated_tune_run_kwargs["search_alg"] = optuna.OptunaSearch() try: - algo = updated_tune_run_kwargs["search_alg"] - algo = search.Repeater(algo, repeat) - updated_tune_run_kwargs["search_alg"] = algo - except AttributeError: + # Use optuna as the default search algorithm for repeat runs. 
+ algo = tune_run_kwargs.get("search_alg", optuna.OptunaSearch()) + updated_tune_run_kwargs["search_alg"] = search.Repeater(algo, repeat) + except AttributeError as e: raise ValueError( "repeat > 1 but search_alg is not an instance of " "ray.tune.search.SearchAlgorithm", - ) + ) from e if sacred_ex_name == "train_rl": return_key = "monitor_return_mean" @@ -198,7 +196,7 @@ def _ray_tune_sacred_wrapper( `ex.run`) and `reporter`. The function returns the run result. """ - def inner(config: Dict[str, Any], reporter) -> Mapping[str, Any]: + def inner(config: dict[str, Any], reporter) -> Mapping[str, Any]: """Trainable function with the correct signature for `ray.tune`. Args: diff --git a/tests/scripts/test_scripts.py b/tests/scripts/test_scripts.py index 7ff241323..b0271d83b 100644 --- a/tests/scripts/test_scripts.py +++ b/tests/scripts/test_scripts.py @@ -889,6 +889,9 @@ def test_parallel_train_adversarial_custom_env(tmpdir): logging=dict(log_root=tmpdir), demonstrations=dict(path=path), ), + # specifying repeat=2 uses the optuna search algorithm which + # requires the search space to be non-empty. So we provide + # the command name using tune.choice. 
search_space=dict(command_name=tune.choice(["gail"])), ) config_updates.update(PARALLEL_CONFIG_LOW_RESOURCE) From 8b551341a89a5008fd5c35e04110710ea746d52a Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Fri, 4 Aug 2023 23:11:15 +0530 Subject: [PATCH 35/47] Fix test --- .../algorithms/adversarial/common.py | 37 +++++++++++++++---- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/src/imitation/algorithms/adversarial/common.py b/src/imitation/algorithms/adversarial/common.py index 62b459a0d..545109b0d 100644 --- a/src/imitation/algorithms/adversarial/common.py +++ b/src/imitation/algorithms/adversarial/common.py @@ -2,13 +2,13 @@ import abc import dataclasses import logging -from typing import Callable, Iterable, Iterator, Mapping, Optional, Type, overload +from typing import Callable, Iterable, Iterator, Mapping, Optional, Type, overload, List import numpy as np import torch as th import torch.utils.tensorboard as thboard import tqdm -from stable_baselines3.common import base_class, on_policy_algorithm, policies, vec_env +from stable_baselines3.common import base_class, on_policy_algorithm, policies, vec_env, callbacks from stable_baselines3.sac import policies as sac_policies from torch.nn import functional as F @@ -86,6 +86,30 @@ def compute_train_stats( } +class TrainDiscriminatorCallback(callbacks.BaseCallback): + """Callback for training discriminator after collecting rollouts.""" + + def __init__(self, adversarial_trainer, *args, **kwargs): + """Builds TrainDiscriminatorCallback. + + Args: + *args: Passed through to `callbacks.BaseCallback`. + **kwargs: Passed through to `callbacks.BaseCallback`. 
+ """ + self.adversarial_trainer = adversarial_trainer + super().__init__(*args, **kwargs) + + def _on_step(self) -> bool: + return True + + def _on_rollout_end(self) -> None: + self.adversarial_trainer.model.train_disc() + for _ in range(self.adversarial_trainer.n_disc_updates_per_round): + with networks.training(self.adversarial_trainer.reward_train): + # switch to training mode (affects dropout, normalization) + self.adversarial_trainer.train_disc() + + class AdversarialTrainer(base.DemonstrationAlgorithm[types.Transitions]): """Base class for adversarial imitation learning algorithms like GAIL and AIRL.""" @@ -222,16 +246,17 @@ def __init__( self.venv_buffering = wrappers.BufferingWrapper(self.venv) + self.disc_trainer_callback = TrainDiscriminatorCallback(self) if debug_use_ground_truth: # Would use an identity reward fn here, but RewardFns can't see rewards. self.venv_wrapped = self.venv_buffering - self.gen_callback = None + self.gen_callback: List[callbacks.BaseCallback] = [self.disc_trainer_callback] else: self.venv_wrapped = reward_wrapper.RewardVecEnvWrapper( self.venv_buffering, reward_fn=self.reward_train.predict_processed, ) - self.gen_callback = self.venv_wrapped.make_log_callback() + self.gen_callback = [self.venv_wrapped.make_log_callback(), self.disc_trainer_callback] self.venv_train = self.venv_wrapped self.gen_algo.set_env(self.venv_train) @@ -446,10 +471,6 @@ def train( ) for r in tqdm.tqdm(range(0, n_rounds), desc="round"): self.train_gen(self.gen_train_timesteps) - for _ in range(self.n_disc_updates_per_round): - with networks.training(self.reward_train): - # switch to training mode (affects dropout, normalization) - self.train_disc() if callback: callback(r) self.logger.dump(self._global_step) From f3ba2b5ec01331f03295856e4219c68212fc7aee Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Fri, 4 Aug 2023 23:13:59 +0530 Subject: [PATCH 36/47] Revert "Fix test" This reverts commit 8b551341a89a5008fd5c35e04110710ea746d52a. 
--- .../algorithms/adversarial/common.py | 37 ++++--------------- 1 file changed, 8 insertions(+), 29 deletions(-) diff --git a/src/imitation/algorithms/adversarial/common.py b/src/imitation/algorithms/adversarial/common.py index 545109b0d..62b459a0d 100644 --- a/src/imitation/algorithms/adversarial/common.py +++ b/src/imitation/algorithms/adversarial/common.py @@ -2,13 +2,13 @@ import abc import dataclasses import logging -from typing import Callable, Iterable, Iterator, Mapping, Optional, Type, overload, List +from typing import Callable, Iterable, Iterator, Mapping, Optional, Type, overload import numpy as np import torch as th import torch.utils.tensorboard as thboard import tqdm -from stable_baselines3.common import base_class, on_policy_algorithm, policies, vec_env, callbacks +from stable_baselines3.common import base_class, on_policy_algorithm, policies, vec_env from stable_baselines3.sac import policies as sac_policies from torch.nn import functional as F @@ -86,30 +86,6 @@ def compute_train_stats( } -class TrainDiscriminatorCallback(callbacks.BaseCallback): - """Callback for training discriminator after collecting rollouts.""" - - def __init__(self, adversarial_trainer, *args, **kwargs): - """Builds TrainDiscriminatorCallback. - - Args: - *args: Passed through to `callbacks.BaseCallback`. - **kwargs: Passed through to `callbacks.BaseCallback`. 
- """ - self.adversarial_trainer = adversarial_trainer - super().__init__(*args, **kwargs) - - def _on_step(self) -> bool: - return True - - def _on_rollout_end(self) -> None: - self.adversarial_trainer.model.train_disc() - for _ in range(self.adversarial_trainer.n_disc_updates_per_round): - with networks.training(self.adversarial_trainer.reward_train): - # switch to training mode (affects dropout, normalization) - self.adversarial_trainer.train_disc() - - class AdversarialTrainer(base.DemonstrationAlgorithm[types.Transitions]): """Base class for adversarial imitation learning algorithms like GAIL and AIRL.""" @@ -246,17 +222,16 @@ def __init__( self.venv_buffering = wrappers.BufferingWrapper(self.venv) - self.disc_trainer_callback = TrainDiscriminatorCallback(self) if debug_use_ground_truth: # Would use an identity reward fn here, but RewardFns can't see rewards. self.venv_wrapped = self.venv_buffering - self.gen_callback: List[callbacks.BaseCallback] = [self.disc_trainer_callback] + self.gen_callback = None else: self.venv_wrapped = reward_wrapper.RewardVecEnvWrapper( self.venv_buffering, reward_fn=self.reward_train.predict_processed, ) - self.gen_callback = [self.venv_wrapped.make_log_callback(), self.disc_trainer_callback] + self.gen_callback = self.venv_wrapped.make_log_callback() self.venv_train = self.venv_wrapped self.gen_algo.set_env(self.venv_train) @@ -471,6 +446,10 @@ def train( ) for r in tqdm.tqdm(range(0, n_rounds), desc="round"): self.train_gen(self.gen_train_timesteps) + for _ in range(self.n_disc_updates_per_round): + with networks.training(self.reward_train): + # switch to training mode (affects dropout, normalization) + self.train_disc() if callback: callback(r) self.logger.dump(self._global_step) From f8251c70e98f0ccf29e10f1b1ac35ce08e25a580 Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Fri, 4 Aug 2023 23:14:49 +0530 Subject: [PATCH 37/47] Fix test --- src/imitation/scripts/parallel.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index 9f5478a6e..bb90f6174 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -196,7 +196,7 @@ def _ray_tune_sacred_wrapper( `ex.run`) and `reporter`. The function returns the run result. """ - def inner(config: dict[str, Any], reporter) -> Mapping[str, Any]: + def inner(config: Dict[str, Any], reporter) -> Mapping[str, Any]: """Trainable function with the correct signature for `ray.tune`. Args: From 664fc37c0dfd118768186e83006fc06def21a48b Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Mon, 7 Aug 2023 22:58:00 +0530 Subject: [PATCH 38/47] Convert Dict to Mapping in input argument --- src/imitation/scripts/parallel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index bb90f6174..38881ee2b 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -196,7 +196,7 @@ def _ray_tune_sacred_wrapper( `ex.run`) and `reporter`. The function returns the run result. """ - def inner(config: Dict[str, Any], reporter) -> Mapping[str, Any]: + def inner(config: Mapping[str, Any], reporter) -> Mapping[str, Any]: """Trainable function with the correct signature for `ray.tune`. Args: @@ -212,7 +212,7 @@ def inner(config: Dict[str, Any], reporter) -> Mapping[str, Any]: # TODO(shwang): Stop modifying CAPTURE_MODE once the issue is fixed. sacred.SETTINGS.CAPTURE_MODE = "sys" - run_kwargs = config + run_kwargs = dict(**config) updated_run_kwargs: Dict[str, Any] = {} # Import inside function rather than in module because Sacred experiments # are not picklable, and Ray requires this function to be picklable. 
From 8690e1dcb01fc96fcfa1813c038f2b1ac26f4a3c Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Wed, 30 Aug 2023 10:47:28 +0200 Subject: [PATCH 39/47] Ignore coverage in script configurations. --- setup.cfg | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.cfg b/setup.cfg index f39db322f..85dedb3e3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -42,6 +42,8 @@ source = imitation include= src/* tests/* +omit = + src/imitation/scripts/config/* [coverage:report] exclude_lines = From dd9eb6a5b7e62b5cf1faf84d9111bac9bef77e9d Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Wed, 30 Aug 2023 11:12:10 +0200 Subject: [PATCH 40/47] Pin huggingface_sb3 version. --- setup.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 1781a4031..6d1f2489c 100644 --- a/setup.py +++ b/setup.py @@ -207,7 +207,9 @@ def get_local_version(version: "ScmVersion", time_format="%Y%m%d") -> str: STABLE_BASELINES3, "sacred>=0.8.4", "tensorboard>=1.14", - "huggingface_sb3>=2.2.1", + # TODO: remove once https://github.com/huggingface/huggingface_sb3/issues/37 is + # fixed + "huggingface_sb3==2.2.5", "optuna>=3.0.1", "datasets>=2.8.0", ], From 40d87ef2e99dcb8a34041d27dd62327ec8faf8b4 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Tue, 26 Sep 2023 16:46:04 +0200 Subject: [PATCH 41/47] Update to the newest seals environment versions. 
--- benchmarking/airl_seals_ant_best_hp_eval.json | 2 +- benchmarking/airl_seals_half_cheetah_best_hp_eval.json | 2 +- benchmarking/airl_seals_hopper_best_hp_eval.json | 2 +- benchmarking/airl_seals_swimmer_best_hp_eval.json | 4 ++-- benchmarking/airl_seals_walker_best_hp_eval.json | 4 ++-- benchmarking/bc_seals_ant_best_hp_eval.json | 2 +- benchmarking/bc_seals_half_cheetah_best_hp_eval.json | 2 +- benchmarking/bc_seals_hopper_best_hp_eval.json | 2 +- benchmarking/bc_seals_swimmer_best_hp_eval.json | 2 +- benchmarking/bc_seals_walker_best_hp_eval.json | 2 +- benchmarking/dagger_seals_ant_best_hp_eval.json | 2 +- benchmarking/dagger_seals_half_cheetah_best_hp_eval.json | 2 +- benchmarking/dagger_seals_hopper_best_hp_eval.json | 2 +- benchmarking/dagger_seals_swimmer_best_hp_eval.json | 2 +- benchmarking/dagger_seals_walker_best_hp_eval.json | 2 +- benchmarking/gail_seals_ant_best_hp_eval.json | 2 +- benchmarking/gail_seals_half_cheetah_best_hp_eval.json | 2 +- benchmarking/gail_seals_hopper_best_hp_eval.json | 2 +- benchmarking/gail_seals_swimmer_best_hp_eval.json | 4 ++-- benchmarking/gail_seals_walker_best_hp_eval.json | 4 ++-- 20 files changed, 24 insertions(+), 24 deletions(-) diff --git a/benchmarking/airl_seals_ant_best_hp_eval.json b/benchmarking/airl_seals_ant_best_hp_eval.json index 17f969ff0..d4131433e 100644 --- a/benchmarking/airl_seals_ant_best_hp_eval.json +++ b/benchmarking/airl_seals_ant_best_hp_eval.json @@ -62,6 +62,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/Ant-v0" + "gym_id": "seals/Ant-v1" } } diff --git a/benchmarking/airl_seals_half_cheetah_best_hp_eval.json b/benchmarking/airl_seals_half_cheetah_best_hp_eval.json index 754ba6736..f69ba5cb5 100644 --- a/benchmarking/airl_seals_half_cheetah_best_hp_eval.json +++ b/benchmarking/airl_seals_half_cheetah_best_hp_eval.json @@ -62,6 +62,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/HalfCheetah-v0" + "gym_id": "seals/HalfCheetah-v1" } } diff --git 
a/benchmarking/airl_seals_hopper_best_hp_eval.json b/benchmarking/airl_seals_hopper_best_hp_eval.json index 91080d7ce..58c2475f5 100644 --- a/benchmarking/airl_seals_hopper_best_hp_eval.json +++ b/benchmarking/airl_seals_hopper_best_hp_eval.json @@ -75,6 +75,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/Hopper-v0" + "gym_id": "seals/Hopper-v1" } } diff --git a/benchmarking/airl_seals_swimmer_best_hp_eval.json b/benchmarking/airl_seals_swimmer_best_hp_eval.json index fcca8e6b3..8529c58b5 100644 --- a/benchmarking/airl_seals_swimmer_best_hp_eval.json +++ b/benchmarking/airl_seals_swimmer_best_hp_eval.json @@ -12,7 +12,7 @@ }, "expert": { "loader_kwargs": { - "gym_id": "seals/Swimmer-v0", + "gym_id": "seals/Swimmer-v1", "organization": "HumanCompatibleAI" } }, @@ -81,6 +81,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/Swimmer-v0" + "gym_id": "seals/Swimmer-v1" } } diff --git a/benchmarking/airl_seals_walker_best_hp_eval.json b/benchmarking/airl_seals_walker_best_hp_eval.json index c63070751..edd99806d 100644 --- a/benchmarking/airl_seals_walker_best_hp_eval.json +++ b/benchmarking/airl_seals_walker_best_hp_eval.json @@ -12,7 +12,7 @@ }, "expert": { "loader_kwargs": { - "gym_id": "seals/Walker2d-v0", + "gym_id": "seals/Walker2d-v1", "organization": "HumanCompatibleAI" } }, @@ -81,6 +81,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/Walker2d-v0" + "gym_id": "seals/Walker2d-v1" } } diff --git a/benchmarking/bc_seals_ant_best_hp_eval.json b/benchmarking/bc_seals_ant_best_hp_eval.json index 108a93ce7..e9baa8fc1 100644 --- a/benchmarking/bc_seals_ant_best_hp_eval.json +++ b/benchmarking/bc_seals_ant_best_hp_eval.json @@ -43,6 +43,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/Ant-v0" + "gym_id": "seals/Ant-v1" } } diff --git a/benchmarking/bc_seals_half_cheetah_best_hp_eval.json b/benchmarking/bc_seals_half_cheetah_best_hp_eval.json index ecaff2eb0..041f159b0 100644 --- 
a/benchmarking/bc_seals_half_cheetah_best_hp_eval.json +++ b/benchmarking/bc_seals_half_cheetah_best_hp_eval.json @@ -43,6 +43,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/HalfCheetah-v0" + "gym_id": "seals/HalfCheetah-v1" } } diff --git a/benchmarking/bc_seals_hopper_best_hp_eval.json b/benchmarking/bc_seals_hopper_best_hp_eval.json index e8c821841..9a7872d37 100644 --- a/benchmarking/bc_seals_hopper_best_hp_eval.json +++ b/benchmarking/bc_seals_hopper_best_hp_eval.json @@ -43,6 +43,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/Hopper-v0" + "gym_id": "seals/Hopper-v1" } } diff --git a/benchmarking/bc_seals_swimmer_best_hp_eval.json b/benchmarking/bc_seals_swimmer_best_hp_eval.json index 30884c9c4..8a8f2456a 100644 --- a/benchmarking/bc_seals_swimmer_best_hp_eval.json +++ b/benchmarking/bc_seals_swimmer_best_hp_eval.json @@ -43,6 +43,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/Swimmer-v0" + "gym_id": "seals/Swimmer-v1" } } diff --git a/benchmarking/bc_seals_walker_best_hp_eval.json b/benchmarking/bc_seals_walker_best_hp_eval.json index 0ca30120e..f33e6c5a2 100644 --- a/benchmarking/bc_seals_walker_best_hp_eval.json +++ b/benchmarking/bc_seals_walker_best_hp_eval.json @@ -43,6 +43,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/Walker2d-v0" + "gym_id": "seals/Walker2d-v1" } } diff --git a/benchmarking/dagger_seals_ant_best_hp_eval.json b/benchmarking/dagger_seals_ant_best_hp_eval.json index de75b80f1..e02828667 100644 --- a/benchmarking/dagger_seals_ant_best_hp_eval.json +++ b/benchmarking/dagger_seals_ant_best_hp_eval.json @@ -47,6 +47,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/Ant-v0" + "gym_id": "seals/Ant-v1" } } diff --git a/benchmarking/dagger_seals_half_cheetah_best_hp_eval.json b/benchmarking/dagger_seals_half_cheetah_best_hp_eval.json index 7f42bfdf9..d1c9e5923 100644 --- a/benchmarking/dagger_seals_half_cheetah_best_hp_eval.json +++ 
b/benchmarking/dagger_seals_half_cheetah_best_hp_eval.json @@ -47,6 +47,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/HalfCheetah-v0" + "gym_id": "seals/HalfCheetah-v1" } } diff --git a/benchmarking/dagger_seals_hopper_best_hp_eval.json b/benchmarking/dagger_seals_hopper_best_hp_eval.json index 1cf29a1a4..b91f66298 100644 --- a/benchmarking/dagger_seals_hopper_best_hp_eval.json +++ b/benchmarking/dagger_seals_hopper_best_hp_eval.json @@ -47,6 +47,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/Hopper-v0" + "gym_id": "seals/Hopper-v1" } } diff --git a/benchmarking/dagger_seals_swimmer_best_hp_eval.json b/benchmarking/dagger_seals_swimmer_best_hp_eval.json index c112db680..545761cbc 100644 --- a/benchmarking/dagger_seals_swimmer_best_hp_eval.json +++ b/benchmarking/dagger_seals_swimmer_best_hp_eval.json @@ -47,6 +47,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/Swimmer-v0" + "gym_id": "seals/Swimmer-v1" } } diff --git a/benchmarking/dagger_seals_walker_best_hp_eval.json b/benchmarking/dagger_seals_walker_best_hp_eval.json index e59bef464..7b694c8d2 100644 --- a/benchmarking/dagger_seals_walker_best_hp_eval.json +++ b/benchmarking/dagger_seals_walker_best_hp_eval.json @@ -47,6 +47,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/Walker2d-v0" + "gym_id": "seals/Walker2d-v1" } } diff --git a/benchmarking/gail_seals_ant_best_hp_eval.json b/benchmarking/gail_seals_ant_best_hp_eval.json index 81399b00c..3d43b34ba 100644 --- a/benchmarking/gail_seals_ant_best_hp_eval.json +++ b/benchmarking/gail_seals_ant_best_hp_eval.json @@ -62,6 +62,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/Ant-v0" + "gym_id": "seals/Ant-v1" } } diff --git a/benchmarking/gail_seals_half_cheetah_best_hp_eval.json b/benchmarking/gail_seals_half_cheetah_best_hp_eval.json index 1d2f26648..914f3712a 100644 --- a/benchmarking/gail_seals_half_cheetah_best_hp_eval.json +++ 
b/benchmarking/gail_seals_half_cheetah_best_hp_eval.json @@ -62,6 +62,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/HalfCheetah-v0" + "gym_id": "seals/HalfCheetah-v1" } } diff --git a/benchmarking/gail_seals_hopper_best_hp_eval.json b/benchmarking/gail_seals_hopper_best_hp_eval.json index 70787ff7e..cebdae71c 100644 --- a/benchmarking/gail_seals_hopper_best_hp_eval.json +++ b/benchmarking/gail_seals_hopper_best_hp_eval.json @@ -75,6 +75,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/Hopper-v0" + "gym_id": "seals/Hopper-v1" } } diff --git a/benchmarking/gail_seals_swimmer_best_hp_eval.json b/benchmarking/gail_seals_swimmer_best_hp_eval.json index 650c5f46a..b0bd0e645 100644 --- a/benchmarking/gail_seals_swimmer_best_hp_eval.json +++ b/benchmarking/gail_seals_swimmer_best_hp_eval.json @@ -12,7 +12,7 @@ }, "expert": { "loader_kwargs": { - "gym_id": "seals/Swimmer-v0", + "gym_id": "seals/Swimmer-v1", "organization": "HumanCompatibleAI" } }, @@ -81,6 +81,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/Swimmer-v0" + "gym_id": "seals/Swimmer-v1" } } diff --git a/benchmarking/gail_seals_walker_best_hp_eval.json b/benchmarking/gail_seals_walker_best_hp_eval.json index d85eb46d5..2626b4c43 100644 --- a/benchmarking/gail_seals_walker_best_hp_eval.json +++ b/benchmarking/gail_seals_walker_best_hp_eval.json @@ -12,7 +12,7 @@ }, "expert": { "loader_kwargs": { - "gym_id": "seals/Walker2d-v0", + "gym_id": "seals/Walker2d-v1", "organization": "HumanCompatibleAI" } }, @@ -81,6 +81,6 @@ "n_episodes_eval": 50 }, "environment": { - "gym_id": "seals/Walker2d-v0" + "gym_id": "seals/Walker2d-v1" } } From 71f6c9283a387d35ed94f832ca660711942052e3 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Wed, 27 Sep 2023 09:49:28 +0200 Subject: [PATCH 42/47] Push gymnasium dependency to 0.29 to ensure mujoco envs work. 
--- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 7bc4051a9..0384014ee 100644 --- a/setup.py +++ b/setup.py @@ -187,7 +187,7 @@ def get_local_version(version: "ScmVersion", time_format="%Y%m%d") -> str: # encode only known incompatibilities here. This prevents nasty dependency issues # for our users. install_requires=[ - "gymnasium[classic-control]~=0.28.1", + "gymnasium[classic-control]~=0.29", "matplotlib", "numpy>=1.15", "torch>=1.4.0", @@ -220,7 +220,7 @@ def get_local_version(version: "ScmVersion", time_format="%Y%m%d") -> str: "docs": DOCS_REQUIRE, "parallel": PARALLEL_REQUIRE, "mujoco": [ - "gymnasium[classic-control,mujoco]~=0.28.1", + "gymnasium[classic-control,mujoco]~=0.29", ], "atari": ATARI_REQUIRE, }, From 53c121264d44fd3455888c86eb087a51b7919f9d Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Wed, 9 Aug 2023 03:31:10 +0530 Subject: [PATCH 43/47] Update adversarial algorithm --- .../algorithms/adversarial/common.py | 88 +++++++++++++++++-- .../policies/replay_buffer_wrapper.py | 44 +++++++++- 2 files changed, 124 insertions(+), 8 deletions(-) diff --git a/src/imitation/algorithms/adversarial/common.py b/src/imitation/algorithms/adversarial/common.py index ece30b011..e7369719c 100644 --- a/src/imitation/algorithms/adversarial/common.py +++ b/src/imitation/algorithms/adversarial/common.py @@ -2,7 +2,7 @@ import abc import dataclasses import logging -from typing import Callable, Iterable, Iterator, Mapping, Optional, Type, overload +from typing import Callable, Iterable, Iterator, List, Mapping, Optional, Type, overload import numpy as np import torch as th @@ -10,7 +10,9 @@ import tqdm from stable_baselines3.common import ( base_class, + callbacks, distributions, + off_policy_algorithm, on_policy_algorithm, policies, vec_env, @@ -20,6 +22,7 @@ from imitation.algorithms import base from imitation.data import buffer, rollout, types, wrappers +from imitation.policies import 
replay_buffer_wrapper from imitation.rewards import reward_nets, reward_wrapper from imitation.util import logger, networks, util @@ -92,6 +95,47 @@ def compute_train_stats( } +class TrainDiscriminatorCallback(callbacks.BaseCallback): + """Callback for training discriminator after collecting rollouts.""" + + def __init__(self, adversarial_trainer, *args, **kwargs): + """Builds TrainDiscriminatorCallback. + + Args: + *args: Passed through to `callbacks.BaseCallback`. + **kwargs: Passed through to `callbacks.BaseCallback`. + """ + self.adversarial_trainer = adversarial_trainer + self.gen_ctx_manager = None + super().__init__(*args, **kwargs) + + def _on_step(self) -> bool: + return True + + def _on_rollout_end(self) -> None: + gen_trajs, ep_lens = self.adversarial_trainer.venv_buffering.pop_trajectories() + self.adversarial_trainer._check_fixed_horizon(ep_lens) + gen_samples = rollout.flatten_trajectories_with_rew(gen_trajs) + self.adversarial_trainer._gen_replay_buffer.store(gen_samples) + + for _ in range(self.adversarial_trainer.n_disc_updates_per_round): + with networks.training(self.adversarial_trainer.reward_train): + # switch to training mode (affects dropout, normalization) + self.adversarial_trainer.train_disc() + + # update the rollouts with the reward of the latest discriminator + self.adversarial_trainer.update_rewards_of_rollouts() + + # This is a hacky way to enable logger.accumulate_means for generator + # This is done to avoid nested loggers of discriminator and generator + self.gen_ctx_manager = self.adversarial_trainer.logger.accumulate_means("gen") + self.gen_ctx_manager.__enter__() + + def _on_training_end(self) -> None: + assert self.gen_ctx_manager is not None + self.gen_ctx_manager.__exit__(None, None, None) + + class AdversarialTrainer(base.DemonstrationAlgorithm[types.Transitions]): """Base class for adversarial imitation learning algorithms like GAIL and AIRL.""" @@ -228,16 +272,22 @@ def __init__( self.venv_buffering = 
wrappers.BufferingWrapper(self.venv) + self.disc_trainer_callback = TrainDiscriminatorCallback(self) if debug_use_ground_truth: # Would use an identity reward fn here, but RewardFns can't see rewards. self.venv_wrapped = self.venv_buffering - self.gen_callback = None + self.gen_callback: List[callbacks.BaseCallback] = [ + self.disc_trainer_callback + ] else: self.venv_wrapped = reward_wrapper.RewardVecEnvWrapper( self.venv_buffering, reward_fn=self.reward_train.predict_processed, ) - self.gen_callback = self.venv_wrapped.make_log_callback() + self.gen_callback = [ + self.venv_wrapped.make_log_callback(), + self.disc_trainer_callback, + ] self.venv_train = self.venv_wrapped self.gen_algo.set_env(self.venv_train) @@ -314,6 +364,34 @@ def _next_expert_batch(self) -> Mapping: assert self._endless_expert_iterator is not None return next(self._endless_expert_iterator) + def update_rewards_of_rollouts(self) -> None: + """Updates the rewards of the rollouts using the latest discriminator.""" + if isinstance(self.gen_algo, on_policy_algorithm.OnPolicyAlgorithm): + buffer = self.gen_algo.rollout_buffer + assert buffer is not None + reward_fn_inputs = replay_buffer_wrapper._rollout_buffer_to_reward_fn_input( + self.gen_algo.rollout_buffer + ) + rewards = self._reward_net.predict(**reward_fn_inputs) + rewards = rewards.reshape(buffer.rewards.shape) + last_values = buffer.advantages[-1] - buffer.rewards[-1] + buffer.values[-1] + last_values = last_values / buffer.gamma + # here we assume that the actual last_values cannot exactly be 0.0 and so if + # last_values is 0.0 then we know that the episode terminated + last_dones = last_values == 0.0 + self.gen_algo.rollout_buffer.rewards[:] = rewards + self.gen_algo.rollout_buffer.compute_returns_and_advantage( + th.tensor(last_values), last_dones + ) + elif isinstance(self.gen_algo, off_policy_algorithm.OffPolicyAlgorithm): + buffer = self.gen_algo.replay_buffer + assert buffer is not None + reward_fn_inputs = 
replay_buffer_wrapper._replay_buffer_to_reward_fn_input( + buffer + ) + rewards = self._reward_net.predict(**reward_fn_inputs) + buffer.rewards[:] = rewards.reshape(buffer.rewards.shape) + def train_disc( self, *, @@ -452,10 +530,6 @@ def train( ) for r in tqdm.tqdm(range(0, n_rounds), desc="round"): self.train_gen(self.gen_train_timesteps) - for _ in range(self.n_disc_updates_per_round): - with networks.training(self.reward_train): - # switch to training mode (affects dropout, normalization) - self.train_disc() if callback: callback(r) self.logger.dump(self._global_step) diff --git a/src/imitation/policies/replay_buffer_wrapper.py b/src/imitation/policies/replay_buffer_wrapper.py index 7177e2dc1..a8649f78f 100644 --- a/src/imitation/policies/replay_buffer_wrapper.py +++ b/src/imitation/policies/replay_buffer_wrapper.py @@ -4,7 +4,7 @@ import numpy as np from gymnasium import spaces -from stable_baselines3.common.buffers import ReplayBuffer +from stable_baselines3.common.buffers import ReplayBuffer, RolloutBuffer from stable_baselines3.common.type_aliases import ReplayBufferSamples from imitation.rewards.reward_function import RewardFn @@ -23,6 +23,48 @@ def _samples_to_reward_fn_input( ) +def _rollout_buffer_to_reward_fn_input( + buffer: RolloutBuffer, +) -> Mapping[str, np.ndarray]: + """Convert a sample from a rollout buffer to a numpy array.""" + assert buffer.observations is not None + assert buffer.actions is not None + obs = buffer.observations + next_obs = obs[1:] + next_obs = np.concatenate([next_obs, obs[-1:]], axis=0) # last obs not available + actions = buffer.actions + dones = buffer.episode_starts + dones = np.roll(dones, -1, axis=0) + dones[-1] = np.ones_like(dones[-1]) # last dones not available + + return dict( + state=obs.reshape(-1, *obs.shape[2:]), + action=actions.reshape(-1, *actions.shape[2:]), + next_state=next_obs.reshape(-1, *next_obs.shape[2:]), + done=dones.reshape(-1), + ) + + +def _replay_buffer_to_reward_fn_input( + buffer: 
ReplayBuffer, +) -> Mapping[str, np.ndarray]: + """Convert a sample from a replay buffer to a numpy array.""" + assert buffer.observations is not None + assert buffer.next_observations is not None + assert buffer.actions is not None + obs = buffer.observations + next_obs = buffer.next_observations + actions = buffer.actions + dones = buffer.dones + + return dict( + state=obs.reshape(-1, *obs.shape[2:]), + action=actions.reshape(-1, *actions.shape[2:]), + next_state=next_obs.reshape(-1, *next_obs.shape[2:]), + done=dones.reshape(-1), + ) + + class ReplayBufferRewardWrapper(ReplayBuffer): """Relabel the rewards in transitions sampled from a ReplayBuffer.""" From 47b38741beceddc9c68de8b787baf40b0e3efe13 Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Wed, 9 Aug 2023 05:10:46 +0530 Subject: [PATCH 44/47] Fix test errors --- .../algorithms/adversarial/common.py | 22 ++++++++++++------- tests/algorithms/test_adversarial.py | 5 +++-- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/imitation/algorithms/adversarial/common.py b/src/imitation/algorithms/adversarial/common.py index e7369719c..a95e550e3 100644 --- a/src/imitation/algorithms/adversarial/common.py +++ b/src/imitation/algorithms/adversarial/common.py @@ -102,6 +102,8 @@ def __init__(self, adversarial_trainer, *args, **kwargs): """Builds TrainDiscriminatorCallback. Args: + adversarial_trainer: The AdversarialTrainer instance in which + this callback will be called. *args: Passed through to `callbacks.BaseCallback`. **kwargs: Passed through to `callbacks.BaseCallback`. """ @@ -277,7 +279,7 @@ def __init__( # Would use an identity reward fn here, but RewardFns can't see rewards. 
self.venv_wrapped = self.venv_buffering self.gen_callback: List[callbacks.BaseCallback] = [ - self.disc_trainer_callback + self.disc_trainer_callback, ] else: self.venv_wrapped = reward_wrapper.RewardVecEnvWrapper( @@ -370,7 +372,7 @@ def update_rewards_of_rollouts(self) -> None: buffer = self.gen_algo.rollout_buffer assert buffer is not None reward_fn_inputs = replay_buffer_wrapper._rollout_buffer_to_reward_fn_input( - self.gen_algo.rollout_buffer + self.gen_algo.rollout_buffer, ) rewards = self._reward_net.predict(**reward_fn_inputs) rewards = rewards.reshape(buffer.rewards.shape) @@ -381,13 +383,14 @@ def update_rewards_of_rollouts(self) -> None: last_dones = last_values == 0.0 self.gen_algo.rollout_buffer.rewards[:] = rewards self.gen_algo.rollout_buffer.compute_returns_and_advantage( - th.tensor(last_values), last_dones + th.tensor(last_values), + last_dones, ) elif isinstance(self.gen_algo, off_policy_algorithm.OffPolicyAlgorithm): buffer = self.gen_algo.replay_buffer assert buffer is not None reward_fn_inputs = replay_buffer_wrapper._replay_buffer_to_reward_fn_input( - buffer + buffer, ) rewards = self._reward_net.predict(**reward_fn_inputs) buffer.rewards[:] = rewards.reshape(buffer.rewards.shape) @@ -466,13 +469,15 @@ def train_disc( return train_stats - def train_gen( + def train_gen_with_disc( self, total_timesteps: Optional[int] = None, learn_kwargs: Optional[Mapping] = None, ) -> None: """Trains the generator to maximize the discriminator loss. + The discriminator is also trained after the rollouts are collected and before + the generator is trained. After the end of training populates the generator replay buffer (used in discriminator training) with `self.disc_batch_size` transitions. @@ -509,7 +514,7 @@ def train( ) -> None: """Alternates between training the generator and discriminator. 
- Every "round" consists of a call to `train_gen(self.gen_train_timesteps)`, + Every "round" consists of a call to `train_gen_with_disc(self.gen_train_timesteps)`, a call to `train_disc`, and finally a call to `callback(round)`. Training ends once an additional "round" would cause the number of transitions @@ -529,7 +534,7 @@ def train( f"total_timesteps={total_timesteps})!" ) for r in tqdm.tqdm(range(0, n_rounds), desc="round"): - self.train_gen(self.gen_train_timesteps) + self.train_gen_with_disc(self.gen_train_timesteps) if callback: callback(r) self.logger.dump(self._global_step) @@ -621,7 +626,8 @@ def _make_disc_train_batches( if gen_samples is None: if self._gen_replay_buffer.size() == 0: raise RuntimeError( - "No generator samples for training. " "Call `train_gen()` first.", + "No generator samples for training. " + "Call `train_gen_with_disc()` first.", ) gen_samples_dataclass = self._gen_replay_buffer.sample(batch_size) gen_samples = types.dataclass_quick_asdict(gen_samples_dataclass) diff --git a/tests/algorithms/test_adversarial.py b/tests/algorithms/test_adversarial.py index d3609efaa..769b2d52f 100644 --- a/tests/algorithms/test_adversarial.py +++ b/tests/algorithms/test_adversarial.py @@ -231,8 +231,9 @@ def test_train_gen_train_disc_no_crash( trainer_parametrized: common.AdversarialTrainer, n_updates: int = 2, ) -> None: - trainer_parametrized.train_gen(n_updates * trainer_parametrized.gen_train_timesteps) - trainer_parametrized.train_disc() + trainer_parametrized.train_gen_with_disc( + n_updates * trainer_parametrized.gen_train_timesteps + ) @pytest.fixture From 9fa8969dcab32262934ae5f44d6ad44920354b2b Mon Sep 17 00:00:00 2001 From: taufeeque9 <9taufeeque9@gmail.com> Date: Wed, 9 Aug 2023 05:20:59 +0530 Subject: [PATCH 45/47] Fix test errors --- src/imitation/algorithms/adversarial/common.py | 12 +++++++++--- tests/algorithms/test_adversarial.py | 2 +- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git 
a/src/imitation/algorithms/adversarial/common.py b/src/imitation/algorithms/adversarial/common.py index a95e550e3..c876c6f69 100644 --- a/src/imitation/algorithms/adversarial/common.py +++ b/src/imitation/algorithms/adversarial/common.py @@ -115,6 +115,8 @@ def _on_step(self) -> bool: return True def _on_rollout_end(self) -> None: + if self.gen_ctx_manager is not None: + self.exit_gen_ctx_manager() gen_trajs, ep_lens = self.adversarial_trainer.venv_buffering.pop_trajectories() self.adversarial_trainer._check_fixed_horizon(ep_lens) gen_samples = rollout.flatten_trajectories_with_rew(gen_trajs) @@ -133,9 +135,13 @@ def _on_rollout_end(self) -> None: self.gen_ctx_manager = self.adversarial_trainer.logger.accumulate_means("gen") self.gen_ctx_manager.__enter__() - def _on_training_end(self) -> None: + def exit_gen_ctx_manager(self) -> None: assert self.gen_ctx_manager is not None self.gen_ctx_manager.__exit__(None, None, None) + self.gen_ctx_manager = None + + def _on_training_end(self) -> None: + self.exit_gen_ctx_manager() class AdversarialTrainer(base.DemonstrationAlgorithm[types.Transitions]): @@ -514,8 +520,8 @@ def train( ) -> None: """Alternates between training the generator and discriminator. - Every "round" consists of a call to `train_gen_with_disc(self.gen_train_timesteps)`, - a call to `train_disc`, and finally a call to `callback(round)`. + Every "round" consists of a call to + `train_gen_with_disc(self.gen_train_timesteps)` and a call to `callback(round)`. Training ends once an additional "round" would cause the number of transitions sampled from the environment to exceed `total_timesteps`. 
diff --git a/tests/algorithms/test_adversarial.py b/tests/algorithms/test_adversarial.py index 769b2d52f..3a53e35ca 100644 --- a/tests/algorithms/test_adversarial.py +++ b/tests/algorithms/test_adversarial.py @@ -232,7 +232,7 @@ def test_train_gen_train_disc_no_crash( n_updates: int = 2, ) -> None: trainer_parametrized.train_gen_with_disc( - n_updates * trainer_parametrized.gen_train_timesteps + n_updates * trainer_parametrized.gen_train_timesteps, ) From 3edf518608bbc39b2301a7ab3deca2c8fdbea81b Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Tue, 26 Sep 2023 16:03:10 +0200 Subject: [PATCH 46/47] Don't enter the generator logging ctx twice. --- src/imitation/algorithms/adversarial/common.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/imitation/algorithms/adversarial/common.py b/src/imitation/algorithms/adversarial/common.py index c876c6f69..87bd43d4a 100644 --- a/src/imitation/algorithms/adversarial/common.py +++ b/src/imitation/algorithms/adversarial/common.py @@ -499,14 +499,13 @@ def train_gen_with_disc( if learn_kwargs is None: learn_kwargs = {} - with self.logger.accumulate_means("gen"): - self.gen_algo.learn( - total_timesteps=total_timesteps, - reset_num_timesteps=False, - callback=self.gen_callback, - **learn_kwargs, - ) - self._global_step += 1 + self.gen_algo.learn( + total_timesteps=total_timesteps, + reset_num_timesteps=False, + callback=self.gen_callback, + **learn_kwargs, + ) + self._global_step += 1 gen_trajs, ep_lens = self.venv_buffering.pop_trajectories() self._check_fixed_horizon(ep_lens) From ce8c87ddace0017801e6a5e8fcfcd2ca0dc24cf7 Mon Sep 17 00:00:00 2001 From: Mohammad Taufeeque <9taufeeque9@gmail.com> Date: Wed, 27 Sep 2023 05:35:23 +0530 Subject: [PATCH 47/47] Update common.py to fix test errors --- src/imitation/algorithms/adversarial/common.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/imitation/algorithms/adversarial/common.py 
b/src/imitation/algorithms/adversarial/common.py index 87bd43d4a..c9e880c07 100644 --- a/src/imitation/algorithms/adversarial/common.py +++ b/src/imitation/algorithms/adversarial/common.py @@ -507,11 +507,6 @@ def train_gen_with_disc( ) self._global_step += 1 - gen_trajs, ep_lens = self.venv_buffering.pop_trajectories() - self._check_fixed_horizon(ep_lens) - gen_samples = rollout.flatten_trajectories_with_rew(gen_trajs) - self._gen_replay_buffer.store(gen_samples) - def train( self, total_timesteps: int,