From b4210c105a34a2b7f83f5e6a29095f8017318cda Mon Sep 17 00:00:00 2001
From: taufeeque9 <9taufeeque9@gmail.com>
Date: Thu, 5 Jan 2023 01:49:50 +0530
Subject: [PATCH 01/47] Merge py file changes from benchmark-algs

---
 src/imitation/algorithms/dagger.py            |  62 +++
 src/imitation/scripts/analyze.py              |  24 +-
 src/imitation/scripts/config/parallel.py      | 406 ++++++++++++++++--
 .../scripts/config/train_adversarial.py       | 175 +++++++-
 .../scripts/config/train_imitation.py         |  26 ++
 .../config/train_preference_comparisons.py    | 128 +++++-
 src/imitation/scripts/config/train_rl.py      | 203 ++++++++-
 src/imitation/scripts/ingredients/reward.py   |   5 +
 src/imitation/scripts/parallel.py             | 166 ++++++-
 src/imitation/scripts/train_adversarial.py    |   1 +
 src/imitation/scripts/train_imitation.py      |   4 +-
 .../scripts/train_preference_comparisons.py   |   1 +
 src/imitation/scripts/train_rl.py             |   4 +-
 tests/algorithms/test_dagger.py               |  25 +-
 tests/scripts/test_scripts.py                 |  31 +-
 15 files changed, 1173 insertions(+), 88 deletions(-)

diff --git a/src/imitation/algorithms/dagger.py b/src/imitation/algorithms/dagger.py
index a7194a5bf..0034fc4ba 100644
--- a/src/imitation/algorithms/dagger.py
+++ b/src/imitation/algorithms/dagger.py
@@ -65,6 +65,68 @@ def __call__(self, round_num: int) -> float:
         assert round_num >= 0
         return min(1, max(0, (self.rampdown_rounds - round_num) / self.rampdown_rounds))
 
+    def __repr__(self):
+        return f"{type(self).__name__}({self.rampdown_rounds!r})"
+
+
+class IndicatorBetaSchedule(BetaSchedule):
+    """Step schedule: beta is 1 for the first rounds and 0 from then on."""
+
+    def __init__(self, rampdown_rounds: int):
+        """Builds IndicatorBetaSchedule.
+
+        Args:
+            rampdown_rounds: first round number at which beta becomes zero.
+        """
+        self.rampdown_rounds = rampdown_rounds
+
+    def __call__(self, round_num: int) -> float:
+        """Computes beta value.
+
+        Args:
+            round_num: the current round number (must be non-negative).
+
+        Returns:
+            `1.0` while `round_num < self.rampdown_rounds`, `0.0` afterwards.
+        """
+        assert round_num >= 0
+        return 1.0 if round_num < self.rampdown_rounds else 0.0
+
+    def __repr__(self) -> str:
+        return f"{type(self).__name__}({self.rampdown_rounds!r})"
+
+
+class ExponentialBetaSchedule(BetaSchedule):
+    """Exponentially decaying schedule for beta."""
+
+    def __init__(self, decay_probability: float):
+        """Builds ExponentialBetaSchedule.
+
+        Args:
+            decay_probability: the per-round multiplicative decay factor for beta.
+
+        Raises:
+            ValueError: if `decay_probability` is not within (0, 1].
+        """
+        if not (0 < decay_probability <= 1):
+            raise ValueError("decay_probability lies outside the range (0, 1].")
+        self.decay_probability = decay_probability
+
+    def __call__(self, round_num: int) -> float:
+        """Computes beta value.
+
+        Args:
+            round_num: the current round number (must be non-negative).
+
+        Returns:
+            beta as `self.decay_probability ** round_num`.
+        """
+        assert round_num >= 0
+        return self.decay_probability**round_num
+
+    def __repr__(self):
+        return f"{type(self).__name__}({self.decay_probability!r})"
+
 
 def reconstruct_trainer(
     scratch_dir: types.AnyPath,
diff --git a/src/imitation/scripts/analyze.py b/src/imitation/scripts/analyze.py
index 0586f86d6..54fed52f9 100644
--- a/src/imitation/scripts/analyze.py
+++ b/src/imitation/scripts/analyze.py
@@ -166,6 +166,8 @@ def _get_algo_name(sd: sacred_util.SacredDicts) -> str:
 
 def _return_summaries(sd: sacred_util.SacredDicts) -> dict:
     imit_stats = get(sd.run, "result.imit_stats")
+    if imit_stats is None:
+        imit_stats = get(sd.run, "result.rollout")
     expert_stats = get(sd.run, "result.expert_stats")
 
     expert_return_summary = None
@@ -232,7 +234,7 @@ def _return_summaries(sd: sacred_util.SacredDicts) -> dict:
 # verbosity 2
 table_verbosity_mapping.append(
     table_verbosity_mapping[-1]
-    | {"status", "imit_expert_ratio", "exp_command", "run_name"},
+    | {"status", "imit_expert_ratio", "exp_command", "run_name", "seed"},
 )
 
 
@@ -268,20 +270,26 @@ def analyze_imitation(
     Returns:
         The DataFrame generated from the Sacred logs.
     """
-    table_entry_fns_subset = _get_table_entry_fns_subset(table_verbosity)
+    if table_verbosity == -1:
+        table_entry_fns_subset = _get_table_entry_fns_subset(0)
+    else:
+        table_entry_fns_subset = _get_table_entry_fns_subset(table_verbosity)
 
-    rows = []
+    df = pd.DataFrame()
     for sd in _gather_sacred_dicts():
-        row = {}
+        new_df = pd.DataFrame(index=[0])  # one row so scalar entries are kept
+        if table_verbosity == -1:
+            new_df = pd.json_normalize(sd.config)
+
         for col_name, make_entry_fn in table_entry_fns_subset.items():
-            row[col_name] = make_entry_fn(sd)
-        rows.append(row)
+            new_df[col_name] = make_entry_fn(sd)
+
+        df = pd.concat([df, new_df])
 
-    df = pd.DataFrame(rows)
     if len(df) > 0:
         df.sort_values(by=["algo", "env_name"], inplace=True)
 
-    display_options = dict(index=False)
+    display_options: Mapping[str, Any] = dict(index=False)
     if csv_output_path is not None:
         df.to_csv(csv_output_path, **display_options)
         print(f"Wrote CSV file to {csv_output_path}")
diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py
index eb206893f..59295d3d3 100644
--- a/src/imitation/scripts/config/parallel.py
+++ b/src/imitation/scripts/config/parallel.py
@@ -5,13 +5,15 @@
 `@parallel_ex.named_config` to define a new parallel experiment.
 
 Adding custom named configs is necessary because the CLI interface can't add
-search spaces to the config like `"seed": tune.grid_search([0, 1, 2, 3])`.
+search spaces to the config like `"seed": tune.choice([0, 1, 2, 3])`.
 """
 
 import numpy as np
 import ray.tune as tune
 import sacred
+from torch import nn
 
+from imitation.algorithms.dagger import ExponentialBetaSchedule, LinearBetaSchedule
 from imitation.util.util import make_unique_timestamp
 
 parallel_ex = sacred.Experiment("parallel")
@@ -33,12 +35,39 @@ def config():
 
     local_dir = None  # `local_dir` arg for `ray.tune.run`
     upload_dir = None  # `upload_dir` arg for `ray.tune.run`
-    n_seeds = 3  # Number of seeds to search over by default
+    # n_seeds_start = 0
+    # n_seeds = 1  # Number of seeds to search over by default
+    experiment_checkpoint_path = ""
+    eval_best_trial = False
+    eval_trial_seeds = 5  # Number of seeds to search over by default
+    num_samples = 1  # Number of samples per grid search configuration
+    repeat = 3
+    env = "seals_half_cheetah"
+    wandb_name_prefix = ""
+
+
+# @parallel_ex.config
+# def seeds(n_seeds_start, n_seeds):
+#     search_space = {
+#         "config_updates": {
+#             "seed": tune.choice(
+#                 list(range(n_seeds_start, n_seeds_start + n_seeds)),
+#             )
+#         }
+#     }
 
 
 @parallel_ex.config
-def seeds(n_seeds):
-    search_space = {"config_updates": {"seed": tune.grid_search(list(range(n_seeds)))}}
+def wandb(run_name):
+    base_config_updates = {
+        "common": {
+            "wandb": {
+                "wandb_name_prefix": run_name,
+                "wandb_kwargs": {"project": "algorithm-benchmark"},
+            },
+        },
+    }
+    # base_named_configs = ["common.wandb_logging"]
 
 
 @parallel_ex.named_config
@@ -63,7 +92,7 @@ def generate_test_data():
         "config_updates": {
             "rl": {
                 "rl_kwargs": {
-                    "learning_rate": tune.grid_search(
+                    "learning_rate": tune.choice(
                         [3e-4 * x for x in (1 / 3, 1 / 2)],
                     ),
                 },
@@ -91,8 +120,8 @@ def example_cartpole_rl():
         "config_updates": {
             "rl": {
                 "rl_kwargs": {
-                    "learning_rate": tune.grid_search(np.logspace(3e-6, 1e-1, num=3)),
-                    "nminibatches": tune.grid_search([16, 32, 64]),
+                    "learning_rate": tune.choice(np.logspace(3e-6, 1e-1, num=3)),
+                    "nminibatches": tune.choice([16, 32, 64]),
                 },
             },
         },
@@ -105,44 +134,367 @@ def example_cartpole_rl():
 
 
 @parallel_ex.named_config
-def example_rl_easy():
+def example_rl():
     sacred_ex_name = "train_rl"
-    run_name = "example-rl-easy"
-    n_seeds = 2
+    run_name = "rl_tuning"
+    # n_seeds = 2
+    base_named_configs = ["common.wandb_logging", "seals_half_cheetah"]
+    base_config_updates = {
+        "common": {
+            "wandb": {"wandb_kwargs": {"project": "algorithm-benchmark"}},
+            "num_vec": 1,
+        },
+    }
     search_space = {
-        "named_configs": tune.grid_search([[env] for env in EASY_ENVS]),
+        # "named_configs": tune.choice([[env] for env in EASY_ENVS]),
         "config_updates": {
             "rl": {
+                "batch_size": tune.choice([512, 1024, 2048, 4096, 8192]),
                 "rl_kwargs": {
-                    "learning_rate": tune.grid_search(np.logspace(3e-6, 1e-1, num=3)),
-                    "nminibatches": tune.grid_search([16, 32, 64]),
+                    "learning_rate": tune.loguniform(1e-5, 1e-2),
+                    "batch_size": tune.choice([64, 128, 256, 512]),
+                    "n_epochs": tune.choice([5, 10, 20]),
                 },
             },
         },
     }
-    resources_per_trial = dict(cpu=4)
+    num_samples = 100
+    eval_best_trial = True
+    eval_trial_seeds = 5
+    repeat = 1
+    resources_per_trial = dict(cpu=1)
 
 
 @parallel_ex.named_config
-def example_gail_easy():
+def example_bc():
+    sacred_ex_name = "train_imitation"
+    run_name = "bc_tuning_hc"
+    base_named_configs = ["common.wandb_logging", "seals_half_cheetah"]
+    base_config_updates = {
+        # "common": {"wandb": {"wandb_kwargs": {"project": "algorithm-benchmark"}}},
+        "common": {"num_vec": 1},
+    }
+    search_space = {
+        "config_updates": {
+            "bc_kwargs": dict(
+                batch_size=tune.choice([8, 16, 32, 64]),
+                l2_weight=tune.loguniform(1e-6, 1e-2),  # L2 regularization weight
+                optimizer_kwargs=dict(
+                    lr=tune.loguniform(1e-5, 1e-2),
+                ),
+            ),
+            "bc_train_kwargs": dict(
+                n_epochs=tune.choice([1, 5, 10, 20]),
+            ),
+        },
+        "command_name": "bc",
+    }
+    num_samples = 64
+    eval_best_trial = True
+    eval_trial_seeds = 5
+    repeat = 3
+    resources_per_trial = dict(cpu=1)
+
+
+@parallel_ex.named_config
+def example_dagger():
+    sacred_ex_name = "train_imitation"
+    run_name = "dagger_tuning_hc"
+    base_named_configs = ["common.wandb_logging", "seals_half_cheetah"]
+    base_config_updates = {
+        # "common": {"wandb": {"wandb_kwargs": {"project": "algorithm-benchmark"}}},
+        "common": {"num_vec": 1},
+        "dagger": {"total_timesteps": 1e5},
+        "bc_kwargs": {
+            "batch_size": 16,
+            "l2_weight": 1e-4,
+            "optimizer_kwargs": {"lr": 1e-3},
+        },
+    }
+    search_space = {
+        "config_updates": {
+            "bc_train_kwargs": dict(
+                n_epochs=tune.choice([1, 5, 10]),
+            ),
+            "dagger": dict(
+                beta_schedule=tune.choice(
+                    [LinearBetaSchedule(i) for i in [1, 5, 15]]
+                    + [ExponentialBetaSchedule(i) for i in [0.3, 0.5, 0.7]],
+                ),
+                rollout_round_min_episodes=tune.choice([3, 5, 10]),
+            ),
+        },
+        "command_name": "dagger",
+    }
+    num_samples = 50
+    repeat = 3
+    eval_best_trial = True
+    eval_trial_seeds = 5
+    resources_per_trial = dict(cpu=1)
+
+
+@parallel_ex.named_config
+def example_gail():
     sacred_ex_name = "train_adversarial"
-    run_name = "example-gail-easy"
-    n_seeds = 1
+    run_name = "gail_tuning_hc"
+    base_named_configs = ["common.wandb_logging"]
+    base_config_updates = {
+        "common": {"num_vec": 1},
+        "total_timesteps": 1e7,
+    }
     search_space = {
-        "named_configs": tune.grid_search([[env] for env in EASY_ENVS]),
+        # "named_configs": tune.choice([[env] for env in MY_ENVS]),
         "config_updates": {
-            "init_trainer_kwargs": {
-                "rl": {
-                    "rl_kwargs": {
-                        "learning_rate": tune.grid_search(
-                            np.logspace(3e-6, 1e-1, num=3),
-                        ),
-                        "nminibatches": tune.grid_search([16, 32, 64]),
-                    },
+            "algorithm_kwargs": dict(
+                demo_batch_size=tune.choice([32, 128, 512, 2048, 8192]),
+                n_disc_updates_per_round=tune.choice([8, 16]),
+                # both are same as rl.batch_size
+                # gen_replay_buffer_capacity=tune.choice([512, 1024]),
+                # gen_train_timesteps=0,
+            ),
+            "rl": {
+                "batch_size": tune.choice([4096, 8192, 16384]),
+                "rl_kwargs": {
+                    "ent_coef": tune.loguniform(1e-7, 1e-3),
+                    "learning_rate": tune.loguniform(1e-5, 1e-2),
                 },
             },
+            "algorithm_specific": {},
         },
+        "command_name": "gail",
+    }
+    num_samples = 100
+    eval_best_trial = True
+    eval_trial_seeds = 5
+    repeat = 3
+    # experiment_checkpoint_path = f"/home/taufeeque/ray_results/{run_name}"
+    resources_per_trial = dict(cpu=1)
+
+
+@parallel_ex.named_config
+def example_airl():
+    sacred_ex_name = "train_adversarial"
+    run_name = "airl_tuning_hc"
+    # n_seeds = 1
+    base_named_configs = ["common.wandb_logging"]
+    base_config_updates = {
+        "common": {"num_vec": 1},
+        "total_timesteps": 1e7,
     }
     search_space = {
-        "command_name": "gail",
+        # "named_configs": tune.choice([[env] for env in MY_ENVS]),
+        "config_updates": {
+            "algorithm_kwargs": dict(
+                demo_batch_size=tune.choice([32, 128, 512, 2048, 8192]),
+                n_disc_updates_per_round=tune.choice([8, 16]),
+                # both are same as rl.batch_size
+                # gen_replay_buffer_capacity=tune.choice([512, 1024]),
+                # gen_train_timesteps=0,
+            ),
+            "rl": {
+                "batch_size": tune.choice([4096, 8192, 16384]),
+                "rl_kwargs": {
+                    "ent_coef": tune.loguniform(1e-7, 1e-3),
+                    "learning_rate": tune.loguniform(1e-5, 1e-2),
+                },
+            },
+            "algorithm_specific": {},
+        },
+        "command_name": "airl",
+    }
+    num_samples = 100
+    eval_best_trial = True
+    eval_trial_seeds = 5
+    repeat = 3
+    # experiment_checkpoint_path = f"/home/taufeeque/ray_results/{run_name}"
+    resources_per_trial = dict(cpu=1)
+
+
+@parallel_ex.named_config
+def example_pc():
+    sacred_ex_name = "train_preference_comparisons"
+    run_name = "pc_tuning"
+    base_named_configs = ["common.wandb_logging", "seals_half_cheetah"]
+    base_config_updates = {
+        "common": {"num_vec": 1},
+        "total_timesteps": 2e7,
+        "total_comparisons": 5000,
+        "query_schedule": "hyperbolic",
+        "gatherer_kwargs": {"sample": True},
+    }
+    search_space = {
+        "named_configs": tune.choice(
+            [
+                ["reward.normalize_output_disable"],
+                # ["reward.normalize_output_running"],
+                # ["reward.normalize_output_ema"],
+            ],
+        ),
+        "config_updates": {
+            "train": {
+                "policy_kwargs": {
+                    "activation_fn": tune.choice(
+                        [
+                            nn.ReLU,
+                            # nn.Tanh,
+                        ],
+                    ),
+                },
+            },
+            "num_iterations": tune.choice([25, 50]),
+            # "initial_comparison_frac": tune.choice([0.1, 0.25]),
+            # "reward_trainer_kwargs": {
+            #     "epochs": tune.choice([1, 3, 6]),
+            # },
+            # "query_schedule": tune.choice(
+            #     ["constant", "hyperbolic", "inverse_quadratic"],
+            # ),
+            "rl": {
+                "batch_size": tune.choice([512, 2048, 8192]),
+                "rl_kwargs": {
+                    "learning_rate": tune.loguniform(1e-5, 1e-2),
+                    "ent_coef": tune.loguniform(1e-7, 1e-3),
+                },
+            },
+        },
     }
+    num_samples = 24
+    eval_best_trial = True
+    eval_trial_seeds = 5
+    repeat = 3
+    resources_per_trial = dict(cpu=1)
+
+
+@parallel_ex.named_config
+def debug_eval():
+    sacred_ex_name = "train_preference_comparisons"
+    run_name = "debug_eval"
+    eval_trial_seeds = 2
+    eval_best_trial = True
+    # base_named_configs = ["seals_half_cheetah"]
+    base_config_updates = {
+        "common": {"wandb": {"wandb_kwargs": {"project": "algorithm-benchmark"}}},
+        "total_timesteps": 30,
+        "total_comparisons": 10,
+        # "query_schedule": "hyperbolic",
+        "num_iterations": 1,
+        "fragment_length": 2,
+    }
+    search_space = {
+        # "named_configs": tune.choice([[env] for env in MY_ENVS]),
+        "config_updates": {
+            # "num_iterations": tune.choice([5, 20, 50]),
+            "initial_comparison_frac": tune.choice([0.1, 0.2]),
+            # "reward_trainer_kwargs": {
+            #     "epochs": tune.choice([1, 2, 3]),
+            # },
+            # "query_schedule": tune.choice(
+            #     ["constant", "hyperbolic", "inverse_quadratic"],
+            # ),
+        },
+    }
+    resources_per_trial = dict(cpu=1)
+
+
+@parallel_ex.named_config
+def debug_eval_adv():
+    sacred_ex_name = "train_adversarial"
+    run_name = "airl_tuning_debug"
+    # n_seeds = 5
+    base_named_configs = []
+    eval_best_trial = True
+    eval_trial_seeds = 2
+    base_config_updates = {
+        "common": {
+            "wandb": {"wandb_kwargs": {"project": "algorithm-benchmark"}},
+            # "num_env": 1,
+        },
+        "total_timesteps": 2048,
+    }
+    search_space = {
+        # "named_configs": tune.choice([[env] for env in MY_ENVS]),
+        "config_updates": {
+            "algorithm_kwargs": dict(
+                # demo_batch_size=tune.choice([32, 128, 512, 2048, 8192]),
+                n_disc_updates_per_round=tune.choice([1, 2]),
+                # both are same as rl.batch_size
+                # gen_replay_buffer_capacity=tune.choice([512, 1024]),
+                # gen_train_timesteps=0,
+            ),
+            "rl": {
+                "batch_size": 8,
+                # "rl_kwargs": {
+                #     "ent_coef": tune.choice([0, 1e-3, 1e-1]),
+                #     "learning_rate": tune.loguniform(1e-5, 5e-3),
+                # },
+            },
+            "algorithm_specific": dict(demo_batch_size=1),
+        },
+        "command_name": "airl",
+    }
+    num_samples = 2
+    repeat = 2
+    resources_per_trial = dict(cpu=8)
+
+
+@parallel_ex.named_config
+def debug_airl():
+    sacred_ex_name = "train_adversarial"
+    run_name = "airl_debug"
+    # n_seeds = 1
+    base_named_configs = ["common.wandb_logging", "seals_walker"]
+    base_config_updates = {
+        "common": {"num_vec": 8},
+        "total_timesteps": 1e7,
+    }
+    search_space = {
+        # "named_configs": tune.choice([[env] for env in MY_ENVS]),
+        "config_updates": {
+            "train": {
+                "policy_kwargs": {
+                    "activation_fn": tune.choice(
+                        [
+                            nn.ReLU,
+                            # nn.Tanh,
+                        ],
+                    ),
+                },
+            },
+            "algorithm_kwargs": dict(
+                demo_batch_size=tune.choice([32]),
+                n_disc_updates_per_round=tune.choice([10]),
+                # both are same as rl.batch_size
+                # gen_replay_buffer_capacity=tune.choice([512, 1024]),
+                # gen_train_timesteps=0,
+            ),
+            "rl": {
+                "batch_size": tune.choice([10000]),
+                "rl_kwargs": {
+                    "ent_coef": tune.choice([0.1]),
+                    "learning_rate": tune.choice([1e-4]),
+                },
+            },
+            "algorithm_specific": {},
+        },
+        "command_name": "airl",
+    }
+    num_samples = 1
+    eval_best_trial = False
+    # eval_trial_seeds = 5
+    repeat = 5
+    # experiment_checkpoint_path = f"/home/taufeeque/ray_results/{run_name}"
+    resources_per_trial = dict(cpu=8)
+
+
+# @parallel_ex.config_hook
+# def config_hook(config, command_name, logger):
+#     """Sets env."""
+#     del command_name, logger
+#     res = {}
+#     print(config)
+#     if config["env"]:
+#         res["base_named_configs"] = tuple(
+#             config["base_named_configs"] + [config["env"]]
+#         )
+#     print(res)
+#     return res
diff --git a/src/imitation/scripts/config/train_adversarial.py b/src/imitation/scripts/config/train_adversarial.py
index aae3baeb0..bd9df6287 100644
--- a/src/imitation/scripts/config/train_adversarial.py
+++ b/src/imitation/scripts/config/train_adversarial.py
@@ -1,6 +1,7 @@
 """Configuration for imitation.scripts.train_adversarial."""
 
 import sacred
+from torch import nn
 
 from imitation.rewards import reward_nets
 from imitation.scripts.ingredients import demonstrations, environment, expert
@@ -98,9 +99,25 @@ def pendulum():
 
 @train_adversarial_ex.named_config
 def seals_ant():
-    locals().update(**MUJOCO_SHARED_LOCALS)
-    locals().update(**ANT_SHARED_LOCALS)
+    # locals().update(**MUJOCO_SHARED_LOCALS)
+    # locals().update(**ANT_SHARED_LOCALS)
     environment = dict(gym_id="seals/Ant-v0")
+    demonstrations = dict(rollout_type="ppo-huggingface")
+    rl = dict(
+        batch_size=2048,
+        rl_kwargs=dict(
+            batch_size=16,
+            clip_range=0.3,
+            ent_coef=3.1441389214159857e-06,
+            gae_lambda=0.8,
+            gamma=0.995,
+            learning_rate=0.00017959211641976886,
+            max_grad_norm=0.9,
+            n_epochs=10,
+            # policy_kwargs are same as the defaults
+            vf_coef=0.4351450387648799,
+        ),
+    )
 
 
 CHEETAH_SHARED_LOCALS = dict(
@@ -139,40 +156,145 @@ def half_cheetah():
 
 @train_adversarial_ex.named_config
 def seals_half_cheetah():
-    locals().update(**CHEETAH_SHARED_LOCALS)
+    # locals().update(**MUJOCO_SHARED_LOCALS)
     environment = dict(gym_id="seals/HalfCheetah-v0")
+    demonstrations = dict(rollout_type="ppo-huggingface")
+    rl = dict(
+        batch_size=512,
+        rl_kwargs=dict(
+            batch_size=64,
+            clip_range=0.1,
+            ent_coef=3.794797423594763e-06,
+            gae_lambda=0.95,
+            gamma=0.95,
+            learning_rate=0.0003286871805949382,
+            max_grad_norm=0.8,
+            n_epochs=5,
+            vf_coef=0.11483689492120866,
+        ),
+    )
+    # algorithm_specific = dict(
+    #     airl=dict(total_timesteps=int(5e6)),
+    #     gail=dict(total_timesteps=int(8e6)),
+    # )
+    # reward = dict(
+    #     algorithm_specific=dict(
+    #         airl=dict(
+    #             net_cls=reward_nets.BasicShapedRewardNet,
+    #             net_kwargs=dict(
+    #                 reward_hid_sizes=(32,),
+    #                 potential_hid_sizes=(32,),
+    #             ),
+    #         ),
+    #     ),
+    # )
+    algorithm_kwargs = dict(
+        # Number of discriminator updates after each round of generator updates
+        n_disc_updates_per_round=16,
+        # Equivalent to no replay buffer if batch size is the same
+        gen_replay_buffer_capacity=512,
+        demo_batch_size=8192,
+    )
 
 
 @train_adversarial_ex.named_config
 def seals_hopper():
-    locals().update(**MUJOCO_SHARED_LOCALS)
+    # locals().update(**MUJOCO_SHARED_LOCALS)
     environment = dict(gym_id="seals/Hopper-v0")
+    demonstrations = dict(rollout_type="ppo-huggingface")
+    train = dict(
+        policy_cls="MlpPolicy",
+        policy_kwargs=dict(
+            activation_fn=nn.ReLU,
+            net_arch=[dict(pi=[64, 64], vf=[64, 64])],
+        ),
+    )
+    rl = dict(
+        batch_size=2048,
+        rl_kwargs=dict(
+            batch_size=512,
+            clip_range=0.1,
+            ent_coef=0.0010159833764878474,
+            gae_lambda=0.98,
+            gamma=0.995,
+            learning_rate=0.0003904770450788824,
+            max_grad_norm=0.9,
+            n_epochs=20,
+            vf_coef=0.20315938606555833,
+        ),
+    )
 
 
 @train_adversarial_ex.named_config
-def seals_humanoid():
-    locals().update(**MUJOCO_SHARED_LOCALS)
-    environment = dict(gym_id="seals/Humanoid-v0")
-    total_timesteps = int(4e6)
+def seals_swimmer():
+    # locals().update(**MUJOCO_SHARED_LOCALS)
+    environment = dict(gym_id="seals/Swimmer-v0")
+    total_timesteps = int(2e6)
+    demonstrations = dict(rollout_type="ppo-huggingface")
+    train = dict(
+        policy_cls="MlpPolicy",
+        policy_kwargs=dict(
+            activation_fn=nn.ReLU,
+            net_arch=[dict(pi=[64, 64], vf=[64, 64])],
+        ),
+    )
+    rl = dict(
+        batch_size=2048,
+        rl_kwargs=dict(
+            batch_size=64,
+            clip_range=0.1,
+            ent_coef=5.167107294612664e-08,
+            gae_lambda=0.95,
+            gamma=0.999,
+            learning_rate=0.000414936134792374,
+            max_grad_norm=2,
+            n_epochs=5,
+            # policy_kwargs are same as the defaults
+            vf_coef=0.6162112311062333,
+        ),
+    )
 
 
 @train_adversarial_ex.named_config
-def reacher():
-    environment = dict(gym_id="Reacher-v2")
-    algorithm_kwargs = {"allow_variable_horizon": True}
+def seals_walker():
+    # locals().update(**MUJOCO_SHARED_LOCALS)
+    environment = dict(gym_id="seals/Walker2d-v0")
+    demonstrations = dict(rollout_type="ppo-huggingface")
+    train = dict(
+        policy_cls="MlpPolicy",
+        policy_kwargs=dict(
+            activation_fn=nn.ReLU,
+            net_arch=[dict(pi=[64, 64], vf=[64, 64])],
+        ),
+    )
+    rl = dict(
+        batch_size=8192,
+        rl_kwargs=dict(
+            batch_size=128,
+            clip_range=0.4,
+            ent_coef=0.00013057334805552262,
+            gae_lambda=0.92,
+            gamma=0.98,
+            learning_rate=0.000138575372312869,
+            max_grad_norm=0.6,
+            n_epochs=20,
+            # policy_kwargs are same as the defaults
+            vf_coef=0.6167177795726859,
+        ),
+    )
 
 
 @train_adversarial_ex.named_config
-def seals_swimmer():
+def seals_humanoid():
     locals().update(**MUJOCO_SHARED_LOCALS)
-    environment = dict(gym_id="seals/Swimmer-v0")
-    total_timesteps = int(2e6)
+    environment = dict(gym_id="seals/Humanoid-v0")
+    total_timesteps = int(4e6)
 
 
 @train_adversarial_ex.named_config
-def seals_walker():
-    locals().update(**MUJOCO_SHARED_LOCALS)
-    environment = dict(gym_id="seals/Walker2d-v0")
+def reacher():
+    environment = dict(gym_id="Reacher-v2")
+    algorithm_kwargs = {"allow_variable_horizon": True}
 
 
 # Debug configs
@@ -189,3 +311,22 @@ def fast():
         demo_batch_size=1,
         n_disc_updates_per_round=4,
     )
+
+
+@train_adversarial_ex.named_config
+def debug_nans():
+    environment = {"wandb": {"wandb_kwargs": {"project": "algorithm-benchmark"}}}
+    total_timesteps = 1e7
+    algorithm_kwargs = dict(
+        demo_batch_size=128,
+        n_disc_updates_per_round=8,
+        # both are same as rl.batch_size
+        # gen_replay_buffer_capacity=tune.choice([512, 1024]),
+        # gen_train_timesteps=0,
+    )
+    rl = {
+        "batch_size": 4096,
+        "rl_kwargs": {"ent_coef": 0.1, "learning_rate": 7.316377404994506e-05},
+    }
+    seed = 0
+    checkpoint_interval = 1
diff --git a/src/imitation/scripts/config/train_imitation.py b/src/imitation/scripts/config/train_imitation.py
index 16da9c694..23e24ec0b 100644
--- a/src/imitation/scripts/config/train_imitation.py
+++ b/src/imitation/scripts/config/train_imitation.py
@@ -38,6 +38,7 @@ def config():
     dagger = dict(
         use_offline_rollouts=False,  # warm-start policy with BC from offline demos
         total_timesteps=1e5,
+        rollout_round_min_episodes=None,  # use default value
     )
     agent_path = None  # Path to load agent from, optional.
 
@@ -81,6 +82,8 @@ def ant():
 @train_imitation_ex.named_config
 def seals_ant():
     environment = dict(gym_id="seals/Ant-v0")
+    demonstrations = dict(rollout_type="ppo-huggingface")
+    expert = {"policy_type": "ppo-huggingface"}
 
 
 @train_imitation_ex.named_config
@@ -95,6 +98,29 @@ def seals_half_cheetah():
     environment = dict(gym_id="seals/HalfCheetah-v0")
     bc_kwargs = dict(l2_weight=0.0)
     dagger = dict(total_timesteps=60000)
+    demonstrations = dict(rollout_type="ppo-huggingface")
+    expert = {"policy_type": "ppo-huggingface"}
+
+
+@train_imitation_ex.named_config
+def seals_hopper():
+    environment = dict(gym_id="seals/Hopper-v0")
+    demonstrations = dict(rollout_type="ppo-huggingface")
+    expert = {"policy_type": "ppo-huggingface"}
+
+
+@train_imitation_ex.named_config
+def seals_swimmer():
+    environment = dict(gym_id="seals/Swimmer-v0")
+    demonstrations = dict(rollout_type="ppo-huggingface")
+    expert = {"policy_type": "ppo-huggingface"}
+
+
+@train_imitation_ex.named_config
+def seals_walker():
+    environment = dict(gym_id="seals/Walker2d-v0")
+    demonstrations = dict(rollout_type="ppo-huggingface")
+    expert = {"policy_type": "ppo-huggingface"}
 
 
 @train_imitation_ex.named_config
diff --git a/src/imitation/scripts/config/train_preference_comparisons.py b/src/imitation/scripts/config/train_preference_comparisons.py
index cf25f4783..d12869bf0 100644
--- a/src/imitation/scripts/config/train_preference_comparisons.py
+++ b/src/imitation/scripts/config/train_preference_comparisons.py
@@ -1,6 +1,7 @@
 """Configuration for imitation.scripts.train_preference_comparisons."""
 
 import sacred
+from torch import nn
 
 from imitation.algorithms import preference_comparisons
 from imitation.scripts.ingredients import environment
@@ -72,9 +73,24 @@ def cartpole():
 
 @train_preference_comparisons_ex.named_config
 def seals_ant():
-    locals().update(**MUJOCO_SHARED_LOCALS)
-    locals().update(**ANT_SHARED_LOCALS)
+    # locals().update(**MUJOCO_SHARED_LOCALS)
+    # locals().update(**ANT_SHARED_LOCALS)
     environment = dict(gym_id="seals/Ant-v0")
+    rl = dict(
+        batch_size=2048,
+        rl_kwargs=dict(
+            batch_size=16,
+            clip_range=0.3,
+            ent_coef=3.1441389214159857e-06,
+            gae_lambda=0.8,
+            gamma=0.995,
+            learning_rate=0.00017959211641976886,
+            max_grad_norm=0.9,
+            n_epochs=10,
+            # policy_kwargs are same as the defaults
+            vf_coef=0.4351450387648799,
+        ),
+    )
 
 
 @train_preference_comparisons_ex.named_config
@@ -84,10 +100,116 @@ def half_cheetah():
     rl = dict(batch_size=16384, rl_kwargs=dict(batch_size=1024))
 
 
+@train_preference_comparisons_ex.named_config
+def seals_half_cheetah():
+    # locals().update(**MUJOCO_SHARED_LOCALS)
+    environment = dict(gym_id="seals/HalfCheetah-v0")
+    rl = dict(
+        batch_size=512,
+        rl_kwargs=dict(
+            batch_size=64,
+            clip_range=0.1,
+            ent_coef=3.794797423594763e-06,
+            gae_lambda=0.95,
+            gamma=0.95,
+            learning_rate=0.0003286871805949382,
+            max_grad_norm=0.8,
+            n_epochs=5,
+            vf_coef=0.11483689492120866,
+        ),
+    )
+    num_iterations = 50
+    total_timesteps = 20000000
+    # train = dict(
+    #     policy_cls="MlpPolicy",
+    #     policy_kwargs=dict(
+    #         activation_fn=nn.ReLU,
+    #         # net_arch=[dict(pi=[64, 64], vf=[64, 64])],
+    #     ),
+    # )
+
+
 @train_preference_comparisons_ex.named_config
 def seals_hopper():
-    locals().update(**MUJOCO_SHARED_LOCALS)
+    # locals().update(**MUJOCO_SHARED_LOCALS)
     environment = dict(gym_id="seals/Hopper-v0")
+    train = dict(
+        policy_cls="MlpPolicy",
+        policy_kwargs=dict(
+            activation_fn=nn.ReLU,
+            net_arch=[dict(pi=[64, 64], vf=[64, 64])],
+        ),
+    )
+    rl = dict(
+        batch_size=2048,
+        rl_kwargs=dict(
+            batch_size=512,
+            clip_range=0.1,
+            ent_coef=0.0010159833764878474,
+            gae_lambda=0.98,
+            gamma=0.995,
+            learning_rate=0.0003904770450788824,
+            max_grad_norm=0.9,
+            n_epochs=20,
+            vf_coef=0.20315938606555833,
+        ),
+    )
+
+
+@train_preference_comparisons_ex.named_config
+def seals_swimmer():
+    # locals().update(**MUJOCO_SHARED_LOCALS)
+    environment = dict(gym_id="seals/Swimmer-v0")
+    train = dict(
+        policy_cls="MlpPolicy",
+        policy_kwargs=dict(
+            activation_fn=nn.ReLU,
+            net_arch=[dict(pi=[64, 64], vf=[64, 64])],
+        ),
+    )
+    rl = dict(
+        batch_size=2048,
+        rl_kwargs=dict(
+            batch_size=64,
+            clip_range=0.1,
+            ent_coef=5.167107294612664e-08,
+            gae_lambda=0.95,
+            gamma=0.999,
+            learning_rate=0.000414936134792374,
+            max_grad_norm=2,
+            n_epochs=5,
+            # policy_kwargs are same as the defaults
+            vf_coef=0.6162112311062333,
+        ),
+    )
+
+
+@train_preference_comparisons_ex.named_config
+def seals_walker():
+    # locals().update(**MUJOCO_SHARED_LOCALS)
+    environment = dict(gym_id="seals/Walker2d-v0")
+    train = dict(
+        policy_cls="MlpPolicy",
+        policy_kwargs=dict(
+            activation_fn=nn.ReLU,
+            net_arch=[dict(pi=[64, 64], vf=[64, 64])],
+        ),
+    )
+    rl = dict(
+        batch_size=8192,
+        rl_kwargs=dict(
+            batch_size=128,
+            clip_range=0.4,
+            ent_coef=0.00013057334805552262,
+            gae_lambda=0.92,
+            gamma=0.98,
+            learning_rate=0.000138575372312869,
+            max_grad_norm=0.6,
+            n_epochs=20,
+            # policy_kwargs are same as the defaults
+            vf_coef=0.6167177795726859,
+        ),
+    )
 
 
 @train_preference_comparisons_ex.named_config
diff --git a/src/imitation/scripts/config/train_rl.py b/src/imitation/scripts/config/train_rl.py
index 6d48f8695..9df2581a6 100644
--- a/src/imitation/scripts/config/train_rl.py
+++ b/src/imitation/scripts/config/train_rl.py
@@ -1,6 +1,8 @@
 """Configuration settings for train_rl, training a policy with RL."""
 
+
 import sacred
+from torch import nn
 
 from imitation.scripts.ingredients import environment
 from imitation.scripts.ingredients import logging as logging_ingredient
@@ -70,8 +72,30 @@ def cartpole():
 
 @train_rl_ex.named_config
 def seals_cartpole():
-    environment = dict(gym_id="seals/CartPole-v0")
-    total_timesteps = int(1e6)
+    environment = dict(gym_id="seals/CartPole-v0", num_vec=8)
+    total_timesteps = int(1e5)
+    train = dict(
+        policy_cls="MlpPolicy",
+        policy_kwargs=dict(
+            activation_fn=nn.ReLU,
+            net_arch=[dict(pi=[64, 64], vf=[64, 64])],
+        ),
+    )
+    normalize_reward = False
+    rl = dict(
+        batch_size=4096,
+        rl_kwargs=dict(
+            batch_size=256,
+            clip_range=0.4,
+            ent_coef=0.008508727919228772,
+            gae_lambda=0.9,
+            gamma=0.9999,
+            learning_rate=0.0012403278189645594,
+            max_grad_norm=0.8,
+            n_epochs=10,
+            vf_coef=0.489343896591493,
+        ),
+    )
 
 
 @train_rl_ex.named_config
@@ -80,9 +104,69 @@ def half_cheetah():
     total_timesteps = int(5e6)  # does OK after 1e6, but continues improving
 
 
+@train_rl_ex.named_config
+def seals_half_cheetah():
+    environment = dict(
+        gym_id="seals/HalfCheetah-v0",
+        num_vec=1,
+    )
+
+    train = dict(
+        policy_cls="MlpPolicy",
+        policy_kwargs=dict(
+            activation_fn=nn.Tanh,
+            net_arch=[dict(pi=[64, 64], vf=[64, 64])],
+        ),
+    )
+    # total_timesteps = int(5e6)  # does OK after 1e6, but continues improving
+    total_timesteps = 1e6
+    normalize_reward = False
+
+    rl = dict(
+        batch_size=512,
+        rl_kwargs=dict(
+            batch_size=64,
+            clip_range=0.1,
+            ent_coef=3.794797423594763e-06,
+            gae_lambda=0.95,
+            gamma=0.95,
+            learning_rate=0.0003286871805949382,
+            max_grad_norm=0.8,
+            n_epochs=5,
+            vf_coef=0.11483689492120866,
+        ),
+    )
+
+
 @train_rl_ex.named_config
 def seals_hopper():
-    environment = dict(gym_id="seals/Hopper-v0")
+    environment = dict(gym_id="seals/Hopper-v0", num_vec=1)
+    train = dict(
+        policy_cls="MlpPolicy",
+        policy_kwargs=dict(
+            activation_fn=nn.ReLU,
+            net_arch=[dict(pi=[64, 64], vf=[64, 64])],
+        ),
+    )
+
+    total_timesteps = 1e6
+    normalize_reward = False
+
+    rl = dict(
+        batch_size=2048,
+        rl_kwargs=dict(
+            batch_size=512,
+            clip_range=0.1,
+            ent_coef=0.0010159833764878474,
+            gae_lambda=0.98,
+            gamma=0.995,
+            learning_rate=0.0003904770450788824,
+            max_grad_norm=0.9,
+            n_epochs=20,
+            # policy_kwargs are same as the defaults
+            vf_coef=0.20315938606555833,
+        ),
+    )
 
 
 @train_rl_ex.named_config
@@ -104,15 +188,34 @@ def seals_mountain_car():
 
 @train_rl_ex.named_config
 def pendulum():
-    environment = dict(gym_id="Pendulum-v1")
+    environment = dict(gym_id="Pendulum-v1", num_vec=4)
+    total_timesteps = int(1e5)
+
+    train = dict(
+        policy_cls="MlpPolicy",
+        # policy_kwargs=dict(
+        #     activation_fn=nn.Tanh,
+        #     net_arch=[dict(pi=[64, 64], vf=[64, 64])],
+        # ),
+    )
+    normalize_reward = False
+
     rl = dict(
-        batch_size=4096,
+        batch_size=1024 * 4,
         rl_kwargs=dict(
+            gae_lambda=0.95,
             gamma=0.9,
+            n_epochs=10,
+            ent_coef=0.0,
             learning_rate=1e-3,
+            clip_range=0.2,
+            use_sde=True,
+            sde_sample_freq=4,
+            # batch_size=64,
+            # max_grad_norm=0.8,
+            # vf_coef=0.11483689492120866,
         ),
     )
-    total_timesteps = int(2e5)
 
 
 @train_rl_ex.named_config
@@ -122,17 +225,99 @@ def reacher():
 
 @train_rl_ex.named_config
 def seals_ant():
-    environment = dict(gym_id="seals/Ant-v0")
+    environment = dict(
+        gym_id="seals/Ant-v0",
+        num_vec=1,
+    )
+
+    train = dict(
+        policy_cls="MlpPolicy",
+        policy_kwargs=dict(
+            activation_fn=nn.Tanh,
+            net_arch=[dict(pi=[64, 64], vf=[64, 64])],
+        ),
+    )
+
+    total_timesteps = 1e6
+    normalize_reward = False
+
+    rl = dict(
+        batch_size=2048,
+        rl_kwargs=dict(
+            batch_size=16,
+            clip_range=0.3,
+            ent_coef=3.1441389214159857e-06,
+            gae_lambda=0.8,
+            gamma=0.995,
+            learning_rate=0.00017959211641976886,
+            max_grad_norm=0.9,
+            n_epochs=10,
+            # policy_kwargs are same as the defaults
+            vf_coef=0.4351450387648799,
+        ),
+    )
 
 
 @train_rl_ex.named_config
 def seals_swimmer():
-    environment = dict(gym_id="seals/Swimmer-v0")
+    environment = dict(gym_id="seals/Swimmer-v0", num_vec=1)
+    train = dict(
+        policy_cls="MlpPolicy",
+        policy_kwargs=dict(
+            activation_fn=nn.ReLU,
+            net_arch=[dict(pi=[64, 64], vf=[64, 64])],
+        ),
+    )
+
+    total_timesteps = 1e6
+    normalize_reward = False
+
+    rl = dict(
+        batch_size=2048,
+        rl_kwargs=dict(
+            batch_size=64,
+            clip_range=0.1,
+            ent_coef=5.167107294612664e-08,
+            gae_lambda=0.95,
+            gamma=0.999,
+            learning_rate=0.000414936134792374,
+            max_grad_norm=2,
+            n_epochs=5,
+            # policy_kwargs are same as the defaults
+            vf_coef=0.6162112311062333,
+        ),
+    )
 
 
 @train_rl_ex.named_config
 def seals_walker():
-    environment = dict(gym_id="seals/Walker2d-v0")
+    environment = dict(gym_id="seals/Walker2d-v0", num_vec=1)
+    train = dict(
+        policy_cls="MlpPolicy",
+        policy_kwargs=dict(
+            activation_fn=nn.ReLU,
+            net_arch=[dict(pi=[64, 64], vf=[64, 64])],
+        ),
+    )
+
+    total_timesteps = 1e6
+    normalize_reward = False
+
+    rl = dict(
+        batch_size=8192,
+        rl_kwargs=dict(
+            batch_size=128,
+            clip_range=0.4,
+            ent_coef=0.00013057334805552262,
+            gae_lambda=0.92,
+            gamma=0.98,
+            learning_rate=0.000138575372312869,
+            max_grad_norm=0.6,
+            n_epochs=20,
+            # policy_kwargs are same as the defaults
+            vf_coef=0.6167177795726859,
+        ),
+    )
 
 
 # Debug configs
diff --git a/src/imitation/scripts/ingredients/reward.py b/src/imitation/scripts/ingredients/reward.py
index c40d3751f..a4bd98d1f 100644
--- a/src/imitation/scripts/ingredients/reward.py
+++ b/src/imitation/scripts/ingredients/reward.py
@@ -46,6 +46,11 @@ def normalize_output_running():
     normalize_output_layer = networks.RunningNorm  # noqa: F841
 
 
+@reward_ingredient.named_config
+def normalize_output_ema():
+    normalize_output_layer = networks.EMANorm  # noqa: F841
+
+
 @reward_ingredient.named_config
 def reward_ensemble():
     net_cls = reward_nets.RewardEnsemble
diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py
index 6014a08b6..c196954d1 100644
--- a/src/imitation/scripts/parallel.py
+++ b/src/imitation/scripts/parallel.py
@@ -2,12 +2,18 @@
 
 import collections.abc
 import copy
+import glob
 import pathlib
-from typing import Any, Callable, Dict, Mapping, Optional, Sequence
+from typing import Any, Callable, Dict, Mapping, Optional, Sequence, Union
 
+import numpy as np
 import ray
 import ray.tune
 import sacred
+from pandas.api.types import is_object_dtype
+from ray.tune import search
+from ray.tune.registry import register_trainable
+from ray.tune.search import optuna
 from sacred.observers import FileStorageObserver
 
 from imitation.scripts.config.parallel import parallel_ex
@@ -17,6 +23,7 @@
 def parallel(
     sacred_ex_name: str,
     run_name: str,
+    num_samples: int,
     search_space: Mapping[str, Any],
     base_named_configs: Sequence[str],
     base_config_updates: Mapping[str, Any],
@@ -24,6 +31,12 @@ def parallel(
     init_kwargs: Mapping[str, Any],
     local_dir: Optional[str],
     upload_dir: Optional[str],
+    repeat: int = 3,
+    eval_best_trial: bool = False,
+    eval_trial_seeds: int = 5,
+    experiment_checkpoint_path: str = "",
+    syncer=None,
+    resume: Union[str, bool] = False,
 ) -> None:
     """Parallelize multiple runs of another Sacred Experiment using Ray Tune.
 
@@ -40,6 +53,7 @@ def parallel(
             under the 'experiment.name' key. This is equivalent to using the Sacred
             CLI '--name' option on the inner experiment. Offline analysis jobs can use
             this argument to group similar data.
+        num_samples: Number of hyperparameter samples; each one is run `repeat` times.
         search_space: A dictionary which can contain Ray Tune search objects like
             `ray.tune.grid_search` and `ray.tune.sample_from`, and is
             passed as the `config` argument to `ray.tune.run()`. After the
@@ -62,6 +76,19 @@ def parallel(
         init_kwargs: Arguments to pass to `ray.init`.
         local_dir: `local_dir` argument to `ray.tune.run()`.
         upload_dir: `upload_dir` argument to `ray.tune.run()`.
+        repeat: Number of runs to repeat each trial for.
+        eval_best_trial: Whether to evaluate the trial with the best mean return
+            at the end of tuning on a different set of seeds.
+        eval_trial_seeds: Number of distinct seeds to evaluate the best trial on.
+        experiment_checkpoint_path: Path containing the checkpoints of a previous
+            experiment. ran using this script. Useful for resuming cancelled trials
+            of the experiments (using `resume`) or evaluating the best trial of the
+            experiment (using `eval_best_trial`).
+        resume: If truthy and `experiment_checkpoint_path` is given, resumes that
+            experiment by restarting its unfinished trials; the value is forwarded as
+            the `resume` argument of Ray Tune's `TrialRunner`.
+        syncer: `syncer` argument to `ray.tune.syncer.SyncConfig`.
+
 
     Raises:
         TypeError: Named configs not string sequences or config updates not mappings.
@@ -73,8 +100,8 @@ def parallel(
     if not isinstance(base_config_updates, collections.abc.Mapping):
         raise TypeError("base_config_updates must be a Mapping")
 
-    if not isinstance(search_space["named_configs"], collections.abc.Sequence):
-        raise TypeError('search_space["named_configs"] must be a Sequence')
+    # if not isinstance(search_space["named_configs"], collections.abc.Sequence):
+    #     raise TypeError('search_space["named_configs"] must be a Sequence')
 
     if not isinstance(search_space["config_updates"], collections.abc.Mapping):
         raise TypeError('search_space["config_updates"] must be a Mapping')
@@ -95,15 +122,104 @@ def parallel(
     )
 
     ray.init(**init_kwargs)
+    search_alg = optuna.OptunaSearch()
+    search_alg = search.Repeater(search_alg, repeat=repeat)
     try:
-        ray.tune.run(
-            trainable,
-            config=search_space,
-            name=run_name,
-            local_dir=local_dir,
-            resources_per_trial=resources_per_trial,
-            sync_config=ray.tune.syncer.SyncConfig(upload_dir=upload_dir),
+        if experiment_checkpoint_path:
+            if resume:
+                register_trainable("inner", trainable)
+                runner = ray.tune.execution.trial_runner.TrialRunner(
+                    local_checkpoint_dir=experiment_checkpoint_path,
+                    sync_config=ray.tune.syncer.SyncConfig(
+                        upload_dir=upload_dir,
+                        syncer=syncer,
+                    ),
+                    metric="mean_return",
+                    resume=resume,
+                )
+                print(
+                    "Live trials:", len(runner._live_trials), "/", len(runner._trials)
+                )
+                while not runner.is_finished():
+                    runner.step()
+                    print("Debug:", runner.debug_string())
+
+            result = ray.tune.ExperimentAnalysis(experiment_checkpoint_path)
+            result._load_checkpoints_from_latest(
+                glob.glob(experiment_checkpoint_path + "/experiment_state*.json"),
+            )
+            result.trials = None
+            result.fetch_trial_dataframes()
+        else:
+            result = ray.tune.run(
+                trainable,
+                config=search_space,
+                num_samples=num_samples * repeat,
+                name=run_name,
+                local_dir=local_dir,
+                resources_per_trial=resources_per_trial,
+                sync_config=ray.tune.syncer.SyncConfig(
+                    upload_dir=upload_dir,
+                    syncer=syncer,
+                ),
+                search_alg=search_alg,
+                metric="mean_return",
+                mode="max",
+            )
+
+        key = (
+            "rollout/"
+            if sacred_ex_name == "train_preference_comparisons"
+            else ""
+            if sacred_ex_name == "train_rl"
+            else "imit_stats/"
         )
+        key += "monitor_return_mean"
+        if eval_best_trial:
+            df = result.results_df
+            df = df[df["config/named_configs"].notna()]
+            for col in df.columns:
+                if is_object_dtype(df[col]):
+                    df[col] = df[col].astype("str")
+
+            grp_keys = [
+                c for c in df.columns if c.startswith("config") and "seed" not in c
+            ]
+            grps = df.groupby(grp_keys)
+            print(grps[key])
+            df["mean_return"] = grps[key].transform(lambda x: x.mean())
+            best_config_df = df[df["mean_return"] == df["mean_return"].max()]
+            envs_processed = set()
+            for i, row in best_config_df.iterrows():
+                tag = row["experiment_tag"]
+                trial = [t for t in result.trials if tag in t.experiment_tag][0]
+                best_config = trial.config
+                env = tuple(best_config["named_configs"])
+                if env in envs_processed:
+                    continue
+                envs_processed.add(env)
+                print("Named configs:", env)
+                print("Mean return:", row["mean_return"])
+                print("All returns:", df[df["mean_return"] == row["mean_return"]][key])
+                print("Total seeds:", (df["mean_return"] == row["mean_return"]).sum())
+                best_config["config_updates"].update(
+                    seed=ray.tune.grid_search(list(range(100, 100 + eval_trial_seeds))),
+                )
+                resources_per_trial = {k: 2 * v for k, v in resources_per_trial.items()}
+                eval_result = ray.tune.run(
+                    trainable,
+                    config={
+                        "named_configs": best_config["named_configs"],
+                        "config_updates": best_config["config_updates"],
+                        "command_name": best_config.get("command_name", None),
+                    },
+                    name=run_name + "_best_hp_eval",
+                    resources_per_trial=resources_per_trial,
+                )
+                returns = eval_result.results_df["mean_return"].to_numpy()
+                print("Returns:", returns)
+                print(np.mean(returns), np.std(returns))
+
     finally:
         ray.shutdown()
 
@@ -148,7 +264,7 @@ def _ray_tune_sacred_wrapper(
         `ex.run`) and `reporter`. The function returns the run result.
     """
 
-    def inner(config: Mapping[str, Any], reporter) -> Mapping[str, Any]:
+    def inner(config: Dict[str, Any], reporter) -> Mapping[str, Any]:
         """Trainable function with the correct signature for `ray.tune`.
 
         Args:
@@ -169,11 +285,17 @@ def inner(config: Mapping[str, Any], reporter) -> Mapping[str, Any]:
         # Import inside function rather than in module because Sacred experiments
         # are not picklable, and Ray requires this function to be picklable.
         from imitation.scripts.train_adversarial import train_adversarial_ex
+        from imitation.scripts.train_imitation import train_imitation_ex
+        from imitation.scripts.train_preference_comparisons import (
+            train_preference_comparisons_ex,
+        )
         from imitation.scripts.train_rl import train_rl_ex
 
         experiments = {
             "train_rl": train_rl_ex,
             "train_adversarial": train_adversarial_ex,
+            "train_imitation": train_imitation_ex,
+            "train_preference_comparisons": train_preference_comparisons_ex,
         }
         ex = experiments[sacred_ex_name]
 
@@ -181,22 +303,28 @@ def inner(config: Mapping[str, Any], reporter) -> Mapping[str, Any]:
         named_configs = base_named_configs + run_kwargs["named_configs"]
         updated_run_kwargs["named_configs"] = named_configs
 
-        config_updates = {**base_config_updates, **run_kwargs["config_updates"]}
+        config_updates: Mapping[str, Any] = {}
+        config_updates.update(base_config_updates)
+        config_updates.update(run_kwargs["config_updates"])
+        if "__trial_index__" in run_kwargs:
+            config_updates.update(seed=run_kwargs.pop("__trial_index__"))
         updated_run_kwargs["config_updates"] = config_updates
 
         # Add other run_kwargs items to updated_run_kwargs.
         for k, v in run_kwargs.items():
             if k not in updated_run_kwargs:
                 updated_run_kwargs[k] = v
-
-        run = ex.run(
-            **updated_run_kwargs,
-            options={"--run": run_name, "--file_storage": "sacred"},
-        )
-
+        run = ex.run(**updated_run_kwargs, options={"--run": run_name})
         # Ray Tune has a string formatting error if raylet completes without
         # any calls to `reporter`.
-        reporter(done=True)
+        # reporter(done=True)
+        # if sacred_ex_name == "train_preference_comparisons":
+        #     #reporter(mean_return=run.result["rollout"]["monitor_return_mean"])
+        #     #ray.tune.report(mean_return=run.result["rollout"]["monitor_return_mean"])
+        #     ray.tune.report(mean_return=234)
+        # else:
+        #     # reporter(mean_return=run.result["imit_stats"]["monitor_return_mean"])
+        #     ray.tune.report(mean_return=run.result["imit_stats"]["monitor_return_mean"])
 
         assert run.status == "COMPLETED"
         return run.result
diff --git a/src/imitation/scripts/train_adversarial.py b/src/imitation/scripts/train_adversarial.py
index 71fc0c2c9..58f7fb4c4 100644
--- a/src/imitation/scripts/train_adversarial.py
+++ b/src/imitation/scripts/train_adversarial.py
@@ -162,6 +162,7 @@ def callback(round_num: int, /) -> None:
     return {
         "imit_stats": imit_stats,
         "expert_stats": rollout.rollout_stats(expert_trajs),
+        "mean_return": imit_stats["monitor_return_mean"],
     }
 
 
diff --git a/src/imitation/scripts/train_imitation.py b/src/imitation/scripts/train_imitation.py
index 2b4946668..c5673fa3e 100644
--- a/src/imitation/scripts/train_imitation.py
+++ b/src/imitation/scripts/train_imitation.py
@@ -125,10 +125,12 @@ def train_imitation(
                 expert_policy=expert_policy,
                 custom_logger=custom_logger,
                 bc_trainer=bc_trainer,
+                beta_schedule=dagger["beta_schedule"],
                 rng=_rnd,
             )
             model.train(
                 total_timesteps=int(dagger["total_timesteps"]),
+                rollout_round_min_episodes=dagger["rollout_round_min_episodes"],
                 bc_train_kwargs=bc_train_kwargs,
             )
             # TODO(adam): add checkpointing to DAgger?
@@ -141,7 +143,7 @@ def train_imitation(
 
         imit_stats = train.eval_policy(imit_policy, venv)
 
-    stats = {"imit_stats": imit_stats}
+    stats = {"imit_stats": imit_stats, "mean_return": imit_stats["monitor_return_mean"]}
     trajectories = model._all_demos if use_dagger else expert_trajs
     assert trajectories is not None
     if all(isinstance(t, types.TrajectoryWithRew) for t in trajectories):
diff --git a/src/imitation/scripts/train_preference_comparisons.py b/src/imitation/scripts/train_preference_comparisons.py
index e1aab27ff..1daa306af 100644
--- a/src/imitation/scripts/train_preference_comparisons.py
+++ b/src/imitation/scripts/train_preference_comparisons.py
@@ -268,6 +268,7 @@ def save_callback(iteration_num):
         if bool(trajectory_path is None):
             results = dict(results)
             results["rollout"] = train.eval_policy(agent, venv)
+            results["mean_return"] = results["rollout"]["monitor_return_mean"]
 
     if save_preferences:
         main_trainer.dataset.save(log_dir / "preferences.pkl")
diff --git a/src/imitation/scripts/train_rl.py b/src/imitation/scripts/train_rl.py
index fd345ca62..a88e6096a 100644
--- a/src/imitation/scripts/train_rl.py
+++ b/src/imitation/scripts/train_rl.py
@@ -157,7 +157,9 @@ def train_rl(
             serialize.save_stable_model(output_dir, rl_algo)
 
         # Final evaluation of expert policy.
-        return train.eval_policy(rl_algo, venv)
+        eval_stats = train.eval_policy(rl_algo, venv)
+        eval_stats["mean_return"] = eval_stats["monitor_return_mean"]
+        return eval_stats
 
 
 def main_console():
diff --git a/tests/algorithms/test_dagger.py b/tests/algorithms/test_dagger.py
index 549e38fd2..6cc42bc78 100644
--- a/tests/algorithms/test_dagger.py
+++ b/tests/algorithms/test_dagger.py
@@ -33,7 +33,7 @@ def maybe_pendulum_expert_trajectories(
         return None
 
 
-def test_beta_schedule():
+def test_linear_beta_schedule():
     one_step_sched = dagger.LinearBetaSchedule(1)
     three_step_sched = dagger.LinearBetaSchedule(3)
     for i in range(10):
@@ -41,6 +41,29 @@ def test_beta_schedule():
         assert np.allclose(three_step_sched(i), (3 - i) / 3 if i <= 2 else 0)
 
 
+def test_indicator_beta_schedule():
+    one_step_sched = dagger.IndicatorBetaSchedule(1)
+    three_step_sched = dagger.IndicatorBetaSchedule(3)
+    for i in range(10):
+        assert np.allclose(one_step_sched(i), 1 if i == 0 else 0)
+        assert np.allclose(three_step_sched(i), 1 if i <= 2 else 0)
+
+
+def test_exponential_beta_schedule():
+    constant_sched = dagger.ExponentialBetaSchedule(1)
+    decay = 0.5
+    decaying_sched = dagger.ExponentialBetaSchedule(decay)
+    for i in range(10):
+        assert np.allclose(constant_sched(i), 1)
+        assert np.allclose(decaying_sched(i), decay**i)
+
+    with pytest.raises(
+        ValueError,
+        match=r"decay_probability lies outside the range \(0, 1\]\.",
+    ):
+        decaying_sched = dagger.ExponentialBetaSchedule(1.1)
+
+
 def test_traj_collector_seed(tmpdir, pendulum_venv, rng):
     collector = dagger.InteractiveTrajectoryCollector(
         venv=pendulum_venv,
diff --git a/tests/scripts/test_scripts.py b/tests/scripts/test_scripts.py
index 2196b4af1..0a2766dbb 100644
--- a/tests/scripts/test_scripts.py
+++ b/tests/scripts/test_scripts.py
@@ -375,7 +375,10 @@ def bc_config(tmpdir, request):
             policy_type="ppo",
             loader_kwargs=dict(path=CARTPOLE_TEST_POLICY_PATH / "model.zip"),
         ),
-        expert_from_huggingface=dict(policy_type="ppo-huggingface"),
+        expert_from_huggingface=dict(
+            policy_type="ppo-huggingface",
+            loader_kwargs=dict(env_id="seals/CartPole-v0"),
+        ),
         random_expert=dict(policy_type="random"),
         zero_expert=dict(policy_type="zero"),
     )[request.param]
@@ -403,7 +406,10 @@ def test_train_bc_warmstart(tmpdir):
         config_updates=dict(
             logging=dict(log_root=tmpdir),
             demonstrations=dict(rollout_path=CARTPOLE_TEST_ROLLOUT_PATH),
-            expert=dict(policy_type="ppo-huggingface"),
+            expert=dict(
+                policy_type="ppo-huggingface",
+                loader_kwargs=dict(env_id="seals/CartPole-v0"),
+            ),
         ),
     )
     assert run.status == "COMPLETED"
@@ -559,6 +565,27 @@ def test_train_adversarial(tmpdir, named_configs, command):
     _check_train_ex_result(run.result)
 
 
+def test_train_adversarial_debug():
+    """Debug-only AIRL run on seals_ant; requires files under /home/tf/imitation."""
+    named_configs = ["seals_ant", "debug_nans"]
+    config_updates = {
+        "common": dict(log_root="/home/tf/imitation/debug", parallel=False),
+        "demonstrations": dict(
+            rollout_path="/home/tf/imitation/download/final.pkl",
+        ),
+        # TensorBoard logs to get extra coverage
+        # "algorithm_kwargs": dict(init_tensorboard=True),
+        "agent_path": "/home/tf/imitation/download/01124/gen_policy",
+    }
+    run = train_adversarial.train_adversarial_ex.run(
+        command_name="airl",
+        named_configs=named_configs,
+        config_updates=config_updates,
+    )
+    assert run.status == "COMPLETED"
+    _check_train_ex_result(run.result)
+
+
 @pytest.mark.parametrize("command", ("airl", "gail"))
 def test_train_adversarial_warmstart(tmpdir, command):
     named_configs = ["cartpole"] + ALGO_FAST_CONFIGS["adversarial"]

From 97bc063e72e6fc769222351d954f68be28cf761f Mon Sep 17 00:00:00 2001
From: taufeeque9 <9taufeeque9@gmail.com>
Date: Tue, 10 Jan 2023 15:56:14 +0530
Subject: [PATCH 02/47] Clean parallel script

---
 src/imitation/scripts/parallel.py | 54 +++++++++++++++++++------------
 1 file changed, 33 insertions(+), 21 deletions(-)

diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py
index c196954d1..da492804e 100644
--- a/src/imitation/scripts/parallel.py
+++ b/src/imitation/scripts/parallel.py
@@ -27,12 +27,13 @@ def parallel(
     search_space: Mapping[str, Any],
     base_named_configs: Sequence[str],
     base_config_updates: Mapping[str, Any],
-    resources_per_trial: Mapping[str, Any],
+    resources_per_trial: Dict[str, Any],
     init_kwargs: Mapping[str, Any],
     local_dir: Optional[str],
     upload_dir: Optional[str],
     repeat: int = 3,
     eval_best_trial: bool = False,
+    eval_best_trial_resource_multiplier: int = 2,
     eval_trial_seeds: int = 5,
     experiment_checkpoint_path: str = "",
     syncer=None,
@@ -79,6 +80,8 @@ def parallel(
         repeat: Number of runs to repeat each trial for.
         eval_best_trial: Whether to evaluate the trial with the best mean return
             at the end of tuning on a different set of seeds.
+        eval_best_trial_resource_multiplier: factor by which to multiply the
+            number of cpus per trial in `resources_per_trial`.
         eval_trial_seeds: Number of distinct seeds to evaluate the best trial on.
         experiment_checkpoint_path: Path containing the checkpoints of a previous
             experiment. ran using this script. Useful for resuming cancelled trials
@@ -122,11 +125,11 @@ def parallel(
     )
 
     ray.init(**init_kwargs)
-    search_alg = optuna.OptunaSearch()
-    search_alg = search.Repeater(search_alg, repeat=repeat)
+    search_alg = search.Repeater(optuna.OptunaSearch(), repeat=repeat)
     try:
         if experiment_checkpoint_path:
             if resume:
+                # restart failed runs from experiment_checkpoint_path
                 register_trainable("inner", trainable)
                 runner = ray.tune.execution.trial_runner.TrialRunner(
                     local_checkpoint_dir=experiment_checkpoint_path,
@@ -138,16 +141,21 @@ def parallel(
                     resume=resume,
                 )
                 print(
-                    "Live trials:", len(runner._live_trials), "/", len(runner._trials)
+                    "Live trials:",
+                    len(runner._live_trials),
+                    "/",
+                    len(runner._trials),
                 )
                 while not runner.is_finished():
                     runner.step()
                     print("Debug:", runner.debug_string())
 
+            # load experiment analysis results
             result = ray.tune.ExperimentAnalysis(experiment_checkpoint_path)
             result._load_checkpoints_from_latest(
                 glob.glob(experiment_checkpoint_path + "/experiment_state*.json"),
             )
+            # update result.trials using all the experiment_state json files
             result.trials = None
             result.fetch_trial_dataframes()
         else:
@@ -167,45 +175,50 @@ def parallel(
                 mode="max",
             )
 
-        key = (
+        key_prefix = (
             "rollout/"
             if sacred_ex_name == "train_preference_comparisons"
             else ""
             if sacred_ex_name == "train_rl"
             else "imit_stats/"
         )
-        key += "monitor_return_mean"
+        key = key_prefix + "monitor_return_mean"
         if eval_best_trial:
             df = result.results_df
             df = df[df["config/named_configs"].notna()]
+            # convert object dtype to str required by df.groupby
             for col in df.columns:
                 if is_object_dtype(df[col]):
                     df[col] = df[col].astype("str")
-
+            # group into separate HP configs
             grp_keys = [
                 c for c in df.columns if c.startswith("config") and "seed" not in c
             ]
             grps = df.groupby(grp_keys)
-            print(grps[key])
+            # store mean return of runs across all seeds in a group
             df["mean_return"] = grps[key].transform(lambda x: x.mean())
             best_config_df = df[df["mean_return"] == df["mean_return"].max()]
-            envs_processed = set()
-            for i, row in best_config_df.iterrows():
-                tag = row["experiment_tag"]
-                trial = [t for t in result.trials if tag in t.experiment_tag][0]
+            row = best_config_df.loc[0]
+            best_config_tag = row["experiment_tag"]
+            if result.trials is not None:
+                trial = [
+                    t for t in result.trials if best_config_tag in t.experiment_tag
+                ][0]
                 best_config = trial.config
-                env = tuple(best_config["named_configs"])
-                if env in envs_processed:
-                    continue
-                envs_processed.add(env)
-                print("Named configs:", env)
                 print("Mean return:", row["mean_return"])
                 print("All returns:", df[df["mean_return"] == row["mean_return"]][key])
                 print("Total seeds:", (df["mean_return"] == row["mean_return"]).sum())
                 best_config["config_updates"].update(
                     seed=ray.tune.grid_search(list(range(100, 100 + eval_trial_seeds))),
                 )
-                resources_per_trial = {k: 2 * v for k, v in resources_per_trial.items()}
+                # update cpus per trial only if it is provided in `resources_per_trial`
+                # Uses the default values (cpu=1) if it is not provided
+                if "cpu" in resources_per_trial:
+                    resources_per_trial["cpu"] *= eval_best_trial_resource_multiplier
+                    best_config["config_updates"].update(
+                        environment=dict(num_vec=resources_per_trial["cpu"]),
+                    )
+
                 eval_result = ray.tune.run(
                     trainable,
                     config={
@@ -219,7 +232,6 @@ def parallel(
                 returns = eval_result.results_df["mean_return"].to_numpy()
                 print("Returns:", returns)
                 print(np.mean(returns), np.std(returns))
-
     finally:
         ray.shutdown()
 
@@ -229,7 +241,7 @@ def _ray_tune_sacred_wrapper(
     run_name: str,
     base_named_configs: list,
     base_config_updates: Mapping[str, Any],
-) -> Callable[[Mapping[str, Any], Any], Mapping[str, Any]]:
+) -> Callable[[Dict[str, Any], Any], Mapping[str, Any]]:
     """From an Experiment build a wrapped run function suitable for Ray Tune.
 
     `ray.tune.run(...)` expects a trainable function that takes a dict
@@ -303,7 +315,7 @@ def inner(config: Dict[str, Any], reporter) -> Mapping[str, Any]:
         named_configs = base_named_configs + run_kwargs["named_configs"]
         updated_run_kwargs["named_configs"] = named_configs
 
-        config_updates: Mapping[str, Any] = {}
+        config_updates: Dict[str, Any] = {}
         config_updates.update(base_config_updates)
         config_updates.update(run_kwargs["config_updates"])
         if "__trial_index__" in run_kwargs:

From 92912256816e51ce6e4266ac80ed990c6416493d Mon Sep 17 00:00:00 2001
From: Maximilian Ernestus <maximilian@ernestus.de>
Date: Thu, 26 Jan 2023 15:18:04 +0100
Subject: [PATCH 03/47] Undo the changes from #653 to the dagger benchmark
 config files.

The change from #653 only silenced the error messages about the missing imitation.algorithms.dagger.ExponentialBetaSchedule; it did not fix the root cause.
---
 benchmarking/example_dagger_seals_ant_best_hp_eval.json         | 2 +-
 .../example_dagger_seals_half_cheetah_best_hp_eval.json         | 2 +-
 benchmarking/example_dagger_seals_hopper_best_hp_eval.json      | 2 +-
 benchmarking/example_dagger_seals_swimmer_best_hp_eval.json     | 2 +-
 benchmarking/example_dagger_seals_walker_best_hp_eval.json      | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/benchmarking/example_dagger_seals_ant_best_hp_eval.json b/benchmarking/example_dagger_seals_ant_best_hp_eval.json
index 035beab83..38f3f504a 100644
--- a/benchmarking/example_dagger_seals_ant_best_hp_eval.json
+++ b/benchmarking/example_dagger_seals_ant_best_hp_eval.json
@@ -16,7 +16,7 @@
   },
   "dagger": {
     "beta_schedule": {
-      "py/type": "imitation.algorithms.dagger.LinearBetaSchedule",
+      "py/object": "imitation.algorithms.dagger.LinearBetaSchedule",
       "rampdown_rounds": 15
     },
     "rollout_round_min_episodes": 5,
diff --git a/benchmarking/example_dagger_seals_half_cheetah_best_hp_eval.json b/benchmarking/example_dagger_seals_half_cheetah_best_hp_eval.json
index 8961f8c26..708c92547 100644
--- a/benchmarking/example_dagger_seals_half_cheetah_best_hp_eval.json
+++ b/benchmarking/example_dagger_seals_half_cheetah_best_hp_eval.json
@@ -17,7 +17,7 @@
   "dagger": {
     "beta_schedule": {
       "decay_probability": 0.7,
-      "py/type": "imitation.algorithms.dagger.ExponentialBetaSchedule"
+      "py/object": "imitation.algorithms.dagger.ExponentialBetaSchedule"
     },
     "rollout_round_min_episodes": 5,
     "total_timesteps": 60000,
diff --git a/benchmarking/example_dagger_seals_hopper_best_hp_eval.json b/benchmarking/example_dagger_seals_hopper_best_hp_eval.json
index fe47291e0..001479ec3 100644
--- a/benchmarking/example_dagger_seals_hopper_best_hp_eval.json
+++ b/benchmarking/example_dagger_seals_hopper_best_hp_eval.json
@@ -17,7 +17,7 @@
   "dagger": {
     "beta_schedule": {
       "decay_probability": 0.7,
-      "py/type": "imitation.algorithms.dagger.ExponentialBetaSchedule"
+      "py/object": "imitation.algorithms.dagger.ExponentialBetaSchedule"
     },
     "rollout_round_min_episodes": 10,
     "total_timesteps": 100000,
diff --git a/benchmarking/example_dagger_seals_swimmer_best_hp_eval.json b/benchmarking/example_dagger_seals_swimmer_best_hp_eval.json
index 2e6cba2c0..df1606fca 100644
--- a/benchmarking/example_dagger_seals_swimmer_best_hp_eval.json
+++ b/benchmarking/example_dagger_seals_swimmer_best_hp_eval.json
@@ -16,7 +16,7 @@
   },
   "dagger": {
     "beta_schedule": {
-      "py/type": "imitation.algorithms.dagger.LinearBetaSchedule",
+      "py/object": "imitation.algorithms.dagger.LinearBetaSchedule",
       "rampdown_rounds": 15
     },
     "rollout_round_min_episodes": 3,
diff --git a/benchmarking/example_dagger_seals_walker_best_hp_eval.json b/benchmarking/example_dagger_seals_walker_best_hp_eval.json
index e4569321f..ce6baff1c 100644
--- a/benchmarking/example_dagger_seals_walker_best_hp_eval.json
+++ b/benchmarking/example_dagger_seals_walker_best_hp_eval.json
@@ -17,7 +17,7 @@
   "dagger": {
     "beta_schedule": {
       "decay_probability": 0.7,
-      "py/type": "imitation.algorithms.dagger.ExponentialBetaSchedule"
+      "py/object": "imitation.algorithms.dagger.ExponentialBetaSchedule"
     },
     "rollout_round_min_episodes": 5,
     "total_timesteps": 100000,

From 276d863f488512067c38408ecf1386e8199abf50 Mon Sep 17 00:00:00 2001
From: Maximilian Ernestus <maximilian@ernestus.de>
Date: Wed, 25 Jan 2023 17:08:27 +0100
Subject: [PATCH 04/47] Improve readability and interpretability of
 benchmarking tests.

---
 tests/test_benchmarking.py | 51 ++++++++++++++++++++++++--------------
 1 file changed, 32 insertions(+), 19 deletions(-)

diff --git a/tests/test_benchmarking.py b/tests/test_benchmarking.py
index 5c42063c6..67b9eb489 100644
--- a/tests/test_benchmarking.py
+++ b/tests/test_benchmarking.py
@@ -1,6 +1,4 @@
 """Tests for config files in benchmarking/ folder."""
-import glob
-import os
 import pathlib
 
 import pytest
@@ -10,24 +8,39 @@
 THIS_DIR = pathlib.Path(__file__).absolute().parent
 BENCHMARKING_DIR = THIS_DIR.parent / "benchmarking"
 
+ALGORITHMS = ["bc", "dagger", "airl", "gail"]
+ENVIRONMENTS = [
+    "seals_walker",
+    "seals_ant",
+    "seals_half_cheetah",
+    "seals_hopper",
+    "seals_swimmer",
+]
 
-@pytest.mark.parametrize(
-    "command_name",
-    ["bc", "dagger", "airl", "gail"],
-)
-def test_benchmarking_configs(tmpdir, command_name):
+
+@pytest.mark.parametrize("environment", ENVIRONMENTS)
+@pytest.mark.parametrize("algorithm", ALGORITHMS)
+def test_benchmarks_print_config_succeeds(algorithm: str, environment: str):
     # We test the configs using the print_config command,
     # because running the configs requires MuJoCo.
     # Requiring MuJoCo to run the tests adds too much complexity.
-    if command_name in ("bc", "dagger"):
-        ex = train_imitation.train_imitation_ex
-    elif command_name in ("airl", "gail"):
-        ex = train_adversarial.train_adversarial_ex
-    cfg_pattern = os.path.join(BENCHMARKING_DIR, f"example_{command_name}_*.json")
-    cfg_files = glob.glob(cfg_pattern)
-    assert len(cfg_files) == 5, "There should be 1 config file for each of environment."
-    for i, cfg_file in enumerate(cfg_files):
-        cfg_name = f"{tmpdir.basename}_{i}"
-        ex.add_named_config(cfg_name, cfg_file)
-        run = ex.run(command_name="print_config", named_configs=[cfg_name])
-        assert run.status == "COMPLETED"
+
+    # GIVEN
+    if algorithm in ("bc", "dagger"):
+        experiment = train_imitation.train_imitation_ex
+    elif algorithm in ("airl", "gail"):
+        experiment = train_adversarial.train_adversarial_ex
+    else:
+        raise ValueError(f"Unknown algorithm: {algorithm}")
+
+    config_name = f"{algorithm}_{environment}"
+    config_file = str(
+        BENCHMARKING_DIR / f"example_{algorithm}_{environment}_best_hp_eval.json",
+    )
+
+    # WHEN
+    experiment.add_named_config(config_name, config_file)
+    run = experiment.run(command_name="print_config", named_configs=[config_name])
+
+    # THEN
+    assert run.status == "COMPLETED"

From 37eb914cba0aaa416543b763b6f2246eae8f9fa7 Mon Sep 17 00:00:00 2001
From: taufeeque9 <9taufeeque9@gmail.com>
Date: Wed, 1 Mar 2023 21:48:13 +0530
Subject: [PATCH 05/47] Add exponential beta scheduler for dagger

---
 src/imitation/algorithms/dagger.py            | 29 +++++++++++++++++++
 .../scripts/config/train_imitation.py         |  1 +
 src/imitation/scripts/train_imitation.py      |  1 +
 3 files changed, 31 insertions(+)

diff --git a/src/imitation/algorithms/dagger.py b/src/imitation/algorithms/dagger.py
index d43ca5eec..34d8cef7e 100644
--- a/src/imitation/algorithms/dagger.py
+++ b/src/imitation/algorithms/dagger.py
@@ -66,6 +66,35 @@ def __call__(self, round_num: int) -> float:
         return min(1, max(0, (self.rampdown_rounds - round_num) / self.rampdown_rounds))
 
 
+class ExponentialBetaSchedule(BetaSchedule):
+    """Exponentially decaying schedule for beta."""
+
+    def __init__(self, decay_probability: float):
+        """Builds ExponentialBetaSchedule.
+
+        Args:
+            decay_probability: the decay factor for beta.
+
+        Raises:
+            ValueError: if `decay_probability` not within (0, 1].
+        """
+        if not (0 < decay_probability <= 1):
+            raise ValueError("decay_probability lies outside the range (0, 1].")
+        self.decay_probability = decay_probability
+
+    def __call__(self, round_num: int) -> float:
+        """Computes beta value.
+
+        Args:
+            round_num: the current round number.
+
+        Returns:
+            beta as `self.decay_probability ^ round_num`
+        """
+        assert round_num >= 0
+        return self.decay_probability**round_num
+
+
 def reconstruct_trainer(
     scratch_dir: types.AnyPath,
     venv: vec_env.VecEnv,
diff --git a/src/imitation/scripts/config/train_imitation.py b/src/imitation/scripts/config/train_imitation.py
index 16da9c694..2ef2eed44 100644
--- a/src/imitation/scripts/config/train_imitation.py
+++ b/src/imitation/scripts/config/train_imitation.py
@@ -38,6 +38,7 @@ def config():
     dagger = dict(
         use_offline_rollouts=False,  # warm-start policy with BC from offline demos
         total_timesteps=1e5,
+        beta_schedule=None,
     )
     agent_path = None  # Path to load agent from, optional.
 
diff --git a/src/imitation/scripts/train_imitation.py b/src/imitation/scripts/train_imitation.py
index 2b4946668..f8cc992fd 100644
--- a/src/imitation/scripts/train_imitation.py
+++ b/src/imitation/scripts/train_imitation.py
@@ -125,6 +125,7 @@ def train_imitation(
                 expert_policy=expert_policy,
                 custom_logger=custom_logger,
                 bc_trainer=bc_trainer,
+                beta_schedule=dagger["beta_schedule"],
                 rng=_rnd,
             )
             model.train(

From 877383b03d7d3260746997f3cab7b5272125b07b Mon Sep 17 00:00:00 2001
From: Maximilian Ernestus <maximilian@ernestus.de>
Date: Thu, 2 Feb 2023 13:00:06 +0100
Subject: [PATCH 06/47] Ignore coverage for unknown algorithms.

---
 tests/test_benchmarking.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_benchmarking.py b/tests/test_benchmarking.py
index 67b9eb489..ba01b38a2 100644
--- a/tests/test_benchmarking.py
+++ b/tests/test_benchmarking.py
@@ -31,7 +31,7 @@ def test_benchmarks_print_config_succeeds(algorithm: str, environment: str):
     elif algorithm in ("airl", "gail"):
         experiment = train_adversarial.train_adversarial_ex
     else:
-        raise ValueError(f"Unknown algorithm: {algorithm}")
+        raise ValueError(f"Unknown algorithm: {algorithm}")  # pragma: no cover
 
     config_name = f"{algorithm}_{environment}"
     config_file = str(

From c8e55cb1efee3913bf306c23f6a5c361674d7380 Mon Sep 17 00:00:00 2001
From: Maximilian Ernestus <maximilian@ernestus.de>
Date: Thu, 2 Feb 2023 13:04:02 +0100
Subject: [PATCH 07/47] Cleanup and extend tests for beta schedules in dagger.

---
 tests/algorithms/test_dagger.py | 39 ++++++++++++++++++++++++++++-----
 1 file changed, 33 insertions(+), 6 deletions(-)

diff --git a/tests/algorithms/test_dagger.py b/tests/algorithms/test_dagger.py
index 525fc449a..6e5582810 100644
--- a/tests/algorithms/test_dagger.py
+++ b/tests/algorithms/test_dagger.py
@@ -33,12 +33,39 @@ def maybe_pendulum_expert_trajectories(
         return None
 
 
-def test_beta_schedule():
-    one_step_sched = dagger.LinearBetaSchedule(1)
-    three_step_sched = dagger.LinearBetaSchedule(3)
-    for i in range(10):
-        assert np.allclose(one_step_sched(i), 1 if i == 0 else 0)
-        assert np.allclose(three_step_sched(i), (3 - i) / 3 if i <= 2 else 0)
+@pytest.mark.parametrize("num_rampdown_rounds", [1, 2, 3, 10])
+def test_linear_beta_schedule(num_rampdown_rounds):
+    # GIVEN
+    sched = dagger.LinearBetaSchedule(num_rampdown_rounds)
+    idx_after_rampdown = num_rampdown_rounds + 1
+
+    # WHEN
+    betas = [sched(i) for i in range(num_rampdown_rounds + 10)]
+
+    # THEN
+    assert np.allclose(
+        betas[:idx_after_rampdown],
+        np.linspace(1, 0, idx_after_rampdown),
+    )
+    assert np.allclose(betas[idx_after_rampdown:], 0)
+
+
+@pytest.mark.parametrize("decay_probability", [0.1, 0.5, 0.9, 1])
+def test_exponential_beta_schedule(decay_probability):
+    # GIVEN
+    sched = dagger.ExponentialBetaSchedule(decay_probability)
+
+    # WHEN
+    betas = [sched(i) for i in range(10)]
+
+    # THEN
+    assert np.allclose(betas, decay_probability ** np.arange(10))
+
+
+@pytest.mark.parametrize("decay_probability", [-0.1, 0, 1.1, 2])
+def test_forbidden_decay_probability_on_exp_beta_schedule(decay_probability):
+    with pytest.raises(ValueError):
+        dagger.ExponentialBetaSchedule(decay_probability)
 
 
 def test_traj_collector_seed(tmpdir, pendulum_venv, rng):

From d81eb68d2359ebb1927f6ebb2ba573f0c7e5745a Mon Sep 17 00:00:00 2001
From: taufeeque9 <9taufeeque9@gmail.com>
Date: Thu, 9 Feb 2023 02:02:21 +0530
Subject: [PATCH 08/47] Add optuna to dependencies

---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index 557015d91..867c1b775 100644
--- a/setup.py
+++ b/setup.py
@@ -210,6 +210,7 @@ def get_local_version(version: "ScmVersion", time_format="%Y%m%d") -> str:
         "chai-sacred>=0.8.3",
         "tensorboard>=1.14",
         "huggingface_sb3>=2.2.1",
+        "optuna>=3.0.1",
     ],
     tests_require=TESTS_REQUIRE,
     extras_require={

From 27467d38268a2217731f019dc0202ce3a520cf2a Mon Sep 17 00:00:00 2001
From: taufeeque9 <9taufeeque9@gmail.com>
Date: Thu, 9 Feb 2023 02:22:24 +0530
Subject: [PATCH 09/47] Fix test case

---
 tests/scripts/test_scripts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/scripts/test_scripts.py b/tests/scripts/test_scripts.py
index 78bbca9bd..ad559d2d9 100644
--- a/tests/scripts/test_scripts.py
+++ b/tests/scripts/test_scripts.py
@@ -910,7 +910,7 @@ def test_parallel_train_adversarial_custom_env(tmpdir):
             logging=dict(log_root=tmpdir),
             demonstrations=dict(rollout_path=rollout_path),
         ),
-        search_space=dict(command_name="gail"),
+        search_space=dict(command_name=tune.choice(["gail"])),
     )
     config_updates.update(PARALLEL_CONFIG_LOW_RESOURCE)
     run = parallel.parallel_ex.run(config_updates=config_updates)

From 1a3b6b81f70cdfc515dc41a264ae1e81347ac588 Mon Sep 17 00:00:00 2001
From: taufeeque9 <9taufeeque9@gmail.com>
Date: Thu, 9 Feb 2023 12:04:03 +0530
Subject: [PATCH 10/47] Clean up the scripts

---
 src/imitation/scripts/analyze.py              |  12 +-
 src/imitation/scripts/config/parallel.py      | 219 ++----------------
 .../scripts/config/train_adversarial.py       |  40 +---
 src/imitation/scripts/parallel.py             |  39 ++--
 4 files changed, 48 insertions(+), 262 deletions(-)

diff --git a/src/imitation/scripts/analyze.py b/src/imitation/scripts/analyze.py
index a7b52af36..b7b990800 100644
--- a/src/imitation/scripts/analyze.py
+++ b/src/imitation/scripts/analyze.py
@@ -167,6 +167,7 @@ def _get_algo_name(sd: sacred_util.SacredDicts) -> str:
 def _return_summaries(sd: sacred_util.SacredDicts) -> dict:
     imit_stats = get(sd.run, "result.imit_stats")
     if imit_stats is None:
+        # stored in rollout key for preference comparison
         imit_stats = get(sd.run, "result.rollout")
     expert_stats = get(sd.run, "result.expert_stats")
 
@@ -234,7 +235,7 @@ def _return_summaries(sd: sacred_util.SacredDicts) -> dict:
 # verbosity 2
 table_verbosity_mapping.append(
     table_verbosity_mapping[-1]
-    | {"status", "imit_expert_ratio", "exp_command", "run_name", "seed", ""},
+    | {"status", "imit_expert_ratio", "exp_command", "run_name"},
 )
 
 
@@ -264,14 +265,14 @@ def analyze_imitation(
         csv_output_path: If provided, then save a CSV output file to this path.
         tex_output_path: If provided, then save a LaTeX-format table to this path.
         print_table: If True, then print the dataframe to stdout.
-        table_verbosity: Increasing levels of verbosity, from 0 to 2, increase the
-            number of columns in the table.
+        table_verbosity: Increasing levels of verbosity, from 0 to 3, increase the
+            number of columns in the table. Level 3 prints all of the columns available.
 
     Returns:
         The DataFrame generated from the Sacred logs.
     """
-    if table_verbosity == -1:
-        table_entry_fns_subset = _get_table_entry_fns_subset(0)
+    if table_verbosity == 3:
+        table_entry_fns_subset = _get_table_entry_fns_subset(2)
     else:
         table_entry_fns_subset = _get_table_entry_fns_subset(table_verbosity)
 
@@ -279,6 +280,7 @@ def analyze_imitation(
     for sd in _gather_sacred_dicts():
         new_df = pd.DataFrame()
         if table_verbosity == -1:
+            # gets all config columns
             new_df = pd.json_normalize(sd.config)
         else:
             new_df = new_df.append({}, ignore_index=True)
diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py
index 0525641e3..697c5d862 100644
--- a/src/imitation/scripts/config/parallel.py
+++ b/src/imitation/scripts/config/parallel.py
@@ -6,6 +6,11 @@
 
 Adding custom named configs is necessary because the CLI interface can't add
 search spaces to the config like `"seed": tune.choice([0, 1, 2, 3])`.
+
+For tuning hyperparameters of an algorithm on a given environment, override
+the `base_named_configs` argument with the named config of the environment.
+Ex: python -m imitation.scripts.parallel with example_gail \
+    'base_named_configs=["logging.wandb_logging", "seals_half_cheetah"]'
 """
 
 import numpy as np
@@ -13,7 +18,7 @@
 import sacred
 from torch import nn
 
-from imitation.algorithms.dagger import ExponentialBetaSchedule, LinearBetaSchedule
+from imitation.algorithms import dagger
 from imitation.util.util import make_unique_timestamp
 
 parallel_ex = sacred.Experiment("parallel")
@@ -35,44 +40,11 @@ def config():
 
     local_dir = None  # `local_dir` arg for `ray.tune.run`
     upload_dir = None  # `upload_dir` arg for `ray.tune.run`
-    # n_seeds_start = 0
-    # n_seeds = 1  # Number of seeds to search over by default
     experiment_checkpoint_path = ""
     eval_best_trial = False
     eval_trial_seeds = 5  # Number of seeds to search over by default
     num_samples = 1  # Number of samples per grid search configuration
-    repeat = 3
-    env = "seals_half_cheetah"
-    wandb_name_prefix = ""
-
-
-# @parallel_ex.config
-# def seeds(n_seeds_start, n_seeds):
-#     search_space = {
-#         "config_updates": {
-#             "seed": tune.choice(
-#                 list(range(n_seeds_start, n_seeds_start + n_seeds)),
-#             )
-#         }
-#     }
-
-
-# @parallel_ex.config
-# def wandb(run_name):
-#     base_config_updates = {
-#         "logging": {
-#             "wandb": {
-#                 "wandb_name_prefix": run_name,
-#                 "wandb_kwargs": {"project": "algorithm-benchmark"},
-#             },
-#         },
-#     }
-# base_named_configs = ["logging.wandb_logging"]
-
-
-@parallel_ex.named_config
-def s3():
-    upload_dir = "s3://shwang-chai/private"
+    repeat = 1
 
 
 # Debug named configs
@@ -137,11 +109,9 @@ def example_cartpole_rl():
 def example_rl():
     sacred_ex_name = "train_rl"
     run_name = "rl_tuning"
-    # n_seeds = 2
-    base_named_configs = ["logging.wandb_logging", "seals_half_cheetah"]
+    base_named_configs = ["logging.wandb_logging"]
     base_config_updates = {"environment": {"num_vec": 1}}
     search_space = {
-        # "named_configs": tune.choice([[env] for env in EASY_ENVS]),
         "config_updates": {
             "rl": {
                 "batch_size": tune.choice([512, 1024, 2048, 4096, 8192]),
@@ -163,8 +133,8 @@ def example_rl():
 @parallel_ex.named_config
 def example_bc():
     sacred_ex_name = "train_imitation"
-    run_name = "bc_tuning_hc"
-    base_named_configs = ["logging.wandb_logging", "seals_half_cheetah"]
+    run_name = "bc_tuning"
+    base_named_configs = ["logging.wandb_logging"]
     base_config_updates = {"environment": {"num_vec": 1}}
     search_space = {
         "config_updates": {
@@ -191,8 +161,8 @@ def example_bc():
 @parallel_ex.named_config
 def example_dagger():
     sacred_ex_name = "train_imitation"
-    run_name = "dagger_tuning_hc"
-    base_named_configs = ["logging.wandb_logging", "seals_half_cheetah"]
+    run_name = "dagger_tuning"
+    base_named_configs = ["logging.wandb_logging"]
     base_config_updates = {
         "environment": {"num_vec": 1},
         "dagger": {"total_timesteps": 1e5},
@@ -209,8 +179,8 @@ def example_dagger():
             ),
             "dagger": dict(
                 beta_schedule=tune.choice(
-                    [LinearBetaSchedule(i) for i in [1, 5, 15]]
-                    + [ExponentialBetaSchedule(i) for i in [0.3, 0.5, 0.7]],
+                    [dagger.LinearBetaSchedule(i) for i in [1, 5, 15]]
+                    + [dagger.ExponentialBetaSchedule(i) for i in [0.3, 0.5, 0.7]],
                 ),
                 rollout_round_min_episodes=tune.choice([3, 5, 10]),
             ),
@@ -234,14 +204,10 @@ def example_gail():
         "total_timesteps": 1e7,
     }
     search_space = {
-        # "named_configs": tune.choice([[env] for env in MY_ENVS]),
         "config_updates": {
             "algorithm_kwargs": dict(
                 demo_batch_size=tune.choice([32, 128, 512, 2048, 8192]),
                 n_disc_updates_per_round=tune.choice([8, 16]),
-                # both are same as rl.batch_size
-                # gen_replay_buffer_capacity=tune.choice([512, 1024]),
-                # gen_train_timesteps=0,
             ),
             "rl": {
                 "batch_size": tune.choice([4096, 8192, 16384]),
@@ -258,29 +224,23 @@ def example_gail():
     eval_best_trial = True
     eval_trial_seeds = 5
     repeat = 3
-    # experiment_checkpoint_path = f"/home/taufeeque/ray_results/{run_name}"
     resources_per_trial = dict(cpu=1)
 
 
 @parallel_ex.named_config
 def example_airl():
     sacred_ex_name = "train_adversarial"
-    run_name = "airl_tuning_hc"
-    # n_seeds = 1
+    run_name = "airl_tuning"
     base_named_configs = ["logging.wandb_logging"]
     base_config_updates = {
         "environment": {"num_vec": 1},
         "total_timesteps": 1e7,
     }
     search_space = {
-        # "named_configs": tune.choice([[env] for env in MY_ENVS]),
         "config_updates": {
             "algorithm_kwargs": dict(
                 demo_batch_size=tune.choice([32, 128, 512, 2048, 8192]),
                 n_disc_updates_per_round=tune.choice([8, 16]),
-                # both are same as rl.batch_size
-                # gen_replay_buffer_capacity=tune.choice([512, 1024]),
-                # gen_train_timesteps=0,
             ),
             "rl": {
                 "batch_size": tune.choice([4096, 8192, 16384]),
@@ -297,7 +257,6 @@ def example_airl():
     eval_best_trial = True
     eval_trial_seeds = 5
     repeat = 3
-    # experiment_checkpoint_path = f"/home/taufeeque/ray_results/{run_name}"
     resources_per_trial = dict(cpu=1)
 
 
@@ -305,7 +264,7 @@ def example_airl():
 def example_pc():
     sacred_ex_name = "train_preference_comparisons"
     run_name = "pc_tuning"
-    base_named_configs = ["logging.wandb_logging", "seals_half_cheetah"]
+    base_named_configs = ["logging.wandb_logging"]
     base_config_updates = {
         "environment": {"num_vec": 1},
         "total_timesteps": 2e7,
@@ -317,8 +276,6 @@ def example_pc():
         "named_configs": tune.choice(
             [
                 ["reward.normalize_output_disable"],
-                # ["reward.normalize_output_running"],
-                # ["reward.normalize_output_ema"],
             ],
         ),
         "config_updates": {
@@ -327,19 +284,15 @@ def example_pc():
                     "activation_fn": tune.choice(
                         [
                             nn.ReLU,
-                            # nn.Tanh,
                         ],
                     ),
                 },
             },
             "num_iterations": tune.choice([25, 50]),
-            # "initial_comparison_frac": tune.choice([0.1, 0.25]),
-            # "reward_trainer_kwargs": {
-            #     "epochs": tune.choice([1, 3, 6]),
-            # },
-            # "query_schedule": tune.choice(
-            #     ["constant", "hyperbolic", "inverse_quadratic"],
-            # ),
+            "initial_comparison_frac": tune.choice([0.1, 0.25]),
+            "reward_trainer_kwargs": {
+                "epochs": tune.choice([1, 3, 6]),
+            },
             "rl": {
                 "batch_size": tune.choice([512, 2048, 8192]),
                 "rl_kwargs": {
@@ -349,138 +302,8 @@ def example_pc():
             },
         },
     }
-    num_samples = 24
+    num_samples = 100
     eval_best_trial = True
     eval_trial_seeds = 5
     repeat = 3
     resources_per_trial = dict(cpu=1)
-
-
-@parallel_ex.named_config
-def debug_eval():
-    sacred_ex_name = "train_preference_comparisons"
-    run_name = "debug_eval"
-    eval_trial_seeds = 2
-    eval_best_trial = True
-    # base_named_configs = ["seals_half_cheetah"]
-    base_config_updates = {
-        "total_timesteps": 30,
-        "total_comparisons": 10,
-        # "query_schedule": "hyperbolic",
-        "num_iterations": 1,
-        "fragment_length": 2,
-    }
-    search_space = {
-        # "named_configs": tune.choice([[env] for env in MY_ENVS]),
-        "config_updates": {
-            # "num_iterations": tune.choice([5, 20, 50]),
-            "initial_comparison_frac": tune.choice([0.1, 0.2]),
-            # "reward_trainer_kwargs": {
-            #     "epochs": tune.choice([1, 2, 3]),
-            # },
-            # "query_schedule": tune.choice(
-            #     ["constant", "hyperbolic", "inverse_quadratic"],
-            # ),
-        },
-    }
-    resources_per_trial = dict(cpu=1)
-
-
-@parallel_ex.named_config
-def debug_eval_adv():
-    sacred_ex_name = "train_adversarial"
-    run_name = "airl_tuning_debug"
-    # n_seeds = 5
-    base_named_configs = []
-    eval_best_trial = True
-    eval_trial_seeds = 2
-    base_config_updates = {
-        "total_timesteps": 2048,
-    }
-    search_space = {
-        # "named_configs": tune.choice([[env] for env in MY_ENVS]),
-        "config_updates": {
-            "algorithm_kwargs": dict(
-                # demo_batch_size=tune.choice([32, 128, 512, 2048, 8192]),
-                n_disc_updates_per_round=tune.choice([1, 2]),
-                # both are same as rl.batch_size
-                # gen_replay_buffer_capacity=tune.choice([512, 1024]),
-                # gen_train_timesteps=0,
-            ),
-            "rl": {
-                "batch_size": 8,
-                # "rl_kwargs": {
-                #     "ent_coef": tune.choice([0, 1e-3, 1e-1]),
-                #     "learning_rate": tune.loguniform(1e-5, 5e-3),
-                # },
-            },
-            "algorithm_specific": dict(demo_batch_size=1),
-        },
-        "command_name": "airl",
-    }
-    num_samples = 2
-    repeat = 2
-    resources_per_trial = dict(cpu=8)
-
-
-@parallel_ex.named_config
-def debug_airl():
-    sacred_ex_name = "train_adversarial"
-    run_name = "airl_debug"
-    # n_seeds = 1
-    base_named_configs = ["logging.wandb_logging", "seals_walker"]
-    base_config_updates = {
-        "environment": {"num_vec": 8},
-        "total_timesteps": 1e7,
-    }
-    search_space = {
-        # "named_configs": tune.choice([[env] for env in MY_ENVS]),
-        "config_updates": {
-            "train": {
-                "policy_kwargs": {
-                    "activation_fn": tune.choice(
-                        [
-                            nn.ReLU,
-                            # nn.Tanh,
-                        ],
-                    ),
-                },
-            },
-            "algorithm_kwargs": dict(
-                demo_batch_size=tune.choice([32]),
-                n_disc_updates_per_round=tune.choice([10]),
-                # both are same as rl.batch_size
-                # gen_replay_buffer_capacity=tune.choice([512, 1024]),
-                # gen_train_timesteps=0,
-            ),
-            "rl": {
-                "batch_size": tune.choice([10000]),
-                "rl_kwargs": {
-                    "ent_coef": tune.choice([0.1]),
-                    "learning_rate": tune.choice([1e-4]),
-                },
-            },
-            "algorithm_specific": {},
-        },
-        "command_name": "airl",
-    }
-    num_samples = 1
-    eval_best_trial = False
-    # eval_trial_seeds = 5
-    repeat = 5
-    # experiment_checkpoint_path = f"/home/taufeeque/ray_results/{run_name}"
-    resources_per_trial = dict(cpu=8)
-
-
-# @parallel_ex.config_hook
-# def config_hook(config, command_name, logger):
-#     """Sets env."""
-#     del command_name, logger
-#     res = {}
-#     print(config)
-#     if config["env"]:
-#         res["base_named_configs"] = tuple(
-#             config["base_named_configs"] + [config["env"]]
-#         )
-#     print(res)
-#     return res
diff --git a/src/imitation/scripts/config/train_adversarial.py b/src/imitation/scripts/config/train_adversarial.py
index bd9df6287..fb26c99c6 100644
--- a/src/imitation/scripts/config/train_adversarial.py
+++ b/src/imitation/scripts/config/train_adversarial.py
@@ -99,8 +99,8 @@ def pendulum():
 
 @train_adversarial_ex.named_config
 def seals_ant():
-    # locals().update(**MUJOCO_SHARED_LOCALS)
-    # locals().update(**ANT_SHARED_LOCALS)
+    locals().update(**MUJOCO_SHARED_LOCALS)
+    locals().update(**ANT_SHARED_LOCALS)
     environment = dict(gym_id="seals/Ant-v0")
     demonstrations = dict(rollout_type="ppo-huggingface")
     rl = dict(
@@ -173,21 +173,6 @@ def seals_half_cheetah():
             vf_coef=0.11483689492120866,
         ),
     )
-    # algorithm_specific = dict(
-    #     airl=dict(total_timesteps=int(5e6)),
-    #     gail=dict(total_timesteps=int(8e6)),
-    # )
-    # reward = dict(
-    #     algorithm_specific=dict(
-    #         airl=dict(
-    #             net_cls=reward_nets.BasicShapedRewardNet,
-    #             net_kwargs=dict(
-    #                 reward_hid_sizes=(32,),
-    #                 potential_hid_sizes=(32,),
-    #             ),
-    #         ),
-    #     ),
-    # )
     algorithm_kwargs = dict(
         # Number of discriminator updates after each round of generator updates
         n_disc_updates_per_round=16,
@@ -257,7 +242,7 @@ def seals_swimmer():
 
 @train_adversarial_ex.named_config
 def seals_walker():
-    # locals().update(**MUJOCO_SHARED_LOCALS)
+    locals().update(**MUJOCO_SHARED_LOCALS)
     environment = dict(gym_id="seals/Walker2d-v0")
     demonstrations = dict(rollout_type="ppo-huggingface")
     train = dict(
@@ -311,22 +296,3 @@ def fast():
         demo_batch_size=1,
         n_disc_updates_per_round=4,
     )
-
-
-@train_adversarial_ex.named_config
-def debug_nans():
-    environment = {"wandb": {"wandb_kwargs": {"project": "algorithm-benchmark"}}}
-    total_timesteps = 1e7
-    algorithm_kwargs = dict(
-        demo_batch_size=128,
-        n_disc_updates_per_round=8,
-        # both are same as rl.batch_size
-        # gen_replay_buffer_capacity=tune.choice([512, 1024]),
-        # gen_train_timesteps=0,
-    )
-    rl = {
-        "batch_size": 4096,
-        "rl_kwargs": {"ent_coef": 0.1, "learning_rate": 7.316377404994506e-05},
-    }
-    seed = 0
-    checkpoint_interval = 1
diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py
index 3e713777e..9ee8e6ee9 100644
--- a/src/imitation/scripts/parallel.py
+++ b/src/imitation/scripts/parallel.py
@@ -33,7 +33,7 @@ def parallel(
     upload_dir: Optional[str],
     repeat: int = 1,
     eval_best_trial: bool = False,
-    eval_best_trial_resource_multiplier: int = 2,
+    eval_best_trial_resource_multiplier: int = 1,
     eval_trial_seeds: int = 5,
     experiment_checkpoint_path: str = "",
     syncer=None,
@@ -54,7 +54,8 @@ def parallel(
             under the 'experiment.name' key. This is equivalent to using the Sacred
             CLI '--name' option on the inner experiment. Offline analysis jobs can use
             this argument to group similar data.
-        num_samples: Number of times to sample from the hyperparameter space.
+        num_samples: Number of times to sample from the hyperparameter space without
+            considering repetition using `repeat`.
         search_space: A dictionary which can contain Ray Tune search objects like
             `ray.tune.grid_search` and `ray.tune.sample_from`, and is
             passed as the `config` argument to `ray.tune.run()`. After the
@@ -79,12 +80,12 @@ def parallel(
         upload_dir: `upload_dir` argument to `ray.tune.run()`.
         repeat: Number of runs to repeat each trial for.
         eval_best_trial: Whether to evaluate the trial with the best mean return
-            at the end of tuning on a different set of seeds.
+            at the end of tuning on a separate set of seeds.
         eval_best_trial_resource_multiplier: factor by which to multiply the
             number of cpus per trial in `resources_per_trial`.
         eval_trial_seeds: Number of distinct seeds to evaluate the best trial on.
         experiment_checkpoint_path: Path containing the checkpoints of a previous
-            experiment. ran using this script. Useful for resuming cancelled trials
+            experiment ran using this script. Useful for resuming cancelled trials
             of the experiments (using `resume`) or evaluating the best trial of the
             experiment (using `eval_best_trial`).
         resume: If true and `experiment_checkpoint_path` is given, then resumes the
@@ -159,6 +160,7 @@ def parallel(
             result.trials = None
             result.fetch_trial_dataframes()
         else:
+            # run hyperparameter tuning
             result = ray.tune.run(
                 trainable,
                 config=search_space,
@@ -174,15 +176,14 @@ def parallel(
                 metric="mean_return",
                 mode="max",
             )
-
-        key_prefix = (
-            "rollout/"
-            if sacred_ex_name == "train_preference_comparisons"
-            else ""
-            if sacred_ex_name == "train_rl"
-            else "imit_stats/"
-        )
+        if sacred_ex_name == "train_rl":
+            key_prefix = ""
+        elif sacred_ex_name == "train_preference_comparisons":
+            key_prefix = "rollout/"
+        else:
+            key_prefix = "imit_stats/"
         key = key_prefix + "monitor_return_mean"
+
         if eval_best_trial:
             df = result.results_df
             df = df[df["config/named_configs"].notna()]
@@ -230,8 +231,9 @@ def parallel(
                     resources_per_trial=resources_per_trial,
                 )
                 returns = eval_result.results_df["mean_return"].to_numpy()
-                print("Returns:", returns)
-                print(np.mean(returns), np.std(returns))
+                print("All returns:", returns)
+                print("Mean:", np.mean(returns))
+                print("Std:", np.std(returns))
     finally:
         ray.shutdown()
 
@@ -333,14 +335,7 @@ def inner(config: Dict[str, Any], reporter) -> Mapping[str, Any]:
         )
         # Ray Tune has a string formatting error if raylet completes without
         # any calls to `reporter`.
-        # reporter(done=True)
-        # if sacred_ex_name == "train_preference_comparisons":
-        #     #reporter(mean_return=run.result["rollout"]["monitor_return_mean"])
-        #     #ray.tune.report(mean_return=run.result["rollout"]["monitor_return_mean"])
-        #     ray.tune.report(mean_return=234)
-        # else:
-        #     # reporter(mean_return=run.result["imit_stats"]["monitor_return_mean"])
-        #     ray.tune.report(mean_return=run.result["imit_stats"]["monitor_return_mean"])
+        reporter(done=True)
 
         assert run.status == "COMPLETED"
         return run.result

From 7a438da0f5421f0d98fdb4db9747a8af10d26297 Mon Sep 17 00:00:00 2001
From: taufeeque9 <9taufeeque9@gmail.com>
Date: Thu, 9 Feb 2023 19:53:14 +0530
Subject: [PATCH 11/47] Remove reporter(done) since mean_return is reported by
 the runs

---
 src/imitation/scripts/parallel.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py
index 9ee8e6ee9..2dd2254bf 100644
--- a/src/imitation/scripts/parallel.py
+++ b/src/imitation/scripts/parallel.py
@@ -333,9 +333,6 @@ def inner(config: Dict[str, Any], reporter) -> Mapping[str, Any]:
             **updated_run_kwargs,
             options={"--run": run_name, "--file_storage": "sacred"},
         )
-        # Ray Tune has a string formatting error if raylet completes without
-        # any calls to `reporter`.
-        reporter(done=True)
 
         assert run.status == "COMPLETED"
         return run.result

From 2e56de8eb97713b88ada09564369214f5e4fa661 Mon Sep 17 00:00:00 2001
From: taufeeque9 <9taufeeque9@gmail.com>
Date: Thu, 23 Feb 2023 23:53:12 +0530
Subject: [PATCH 12/47] Add beta_schedule parameter to dagger script

---
 src/imitation/scripts/train_imitation.py              | 1 +
 src/imitation/scripts/train_preference_comparisons.py | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/imitation/scripts/train_imitation.py b/src/imitation/scripts/train_imitation.py
index e607339b4..56633e33a 100644
--- a/src/imitation/scripts/train_imitation.py
+++ b/src/imitation/scripts/train_imitation.py
@@ -119,6 +119,7 @@ def dagger(
             expert_policy=expert_policy,
             custom_logger=custom_logger,
             bc_trainer=bc_trainer,
+            beta_schedule=dagger["beta_schedule"],
             rng=_rnd,
         )
 
diff --git a/src/imitation/scripts/train_preference_comparisons.py b/src/imitation/scripts/train_preference_comparisons.py
index 3d4fb4e33..4030317c4 100644
--- a/src/imitation/scripts/train_preference_comparisons.py
+++ b/src/imitation/scripts/train_preference_comparisons.py
@@ -280,7 +280,6 @@ def save_callback(iteration_num):
             results = dict(results)
             results["rollout"] = policy_evaluation.eval_policy(agent, venv)
             results["mean_return"] = results["rollout"]["monitor_return_mean"]
-            
 
     if save_preferences:
         main_trainer.dataset.save(log_dir / "preferences.pkl")

From 73d8576fc893868c68442b657bd25aaffb7df9bf Mon Sep 17 00:00:00 2001
From: taufeeque9 <9taufeeque9@gmail.com>
Date: Fri, 17 Mar 2023 03:37:15 +0530
Subject: [PATCH 13/47] Update config policy kwargs

---
 src/imitation/scripts/config/train_adversarial.py   |  6 +++---
 .../scripts/config/train_preference_comparisons.py  | 13 +++----------
 src/imitation/scripts/config/train_rl.py            | 12 ++++++------
 3 files changed, 12 insertions(+), 19 deletions(-)

diff --git a/src/imitation/scripts/config/train_adversarial.py b/src/imitation/scripts/config/train_adversarial.py
index 08b92fe9c..7989f3eab 100644
--- a/src/imitation/scripts/config/train_adversarial.py
+++ b/src/imitation/scripts/config/train_adversarial.py
@@ -187,7 +187,7 @@ def seals_hopper():
     # locals().update(**MUJOCO_SHARED_LOCALS)
     environment = dict(gym_id="seals/Hopper-v0")
     demonstrations = dict(rollout_type="ppo-huggingface")
-    train = dict(
+    policy = dict(
         policy_cls="MlpPolicy",
         policy_kwargs=dict(
             activation_fn=nn.ReLU,
@@ -216,7 +216,7 @@ def seals_swimmer():
     environment = dict(gym_id="seals/Swimmer-v0")
     total_timesteps = int(2e6)
     demonstrations = dict(rollout_type="ppo-huggingface")
-    train = dict(
+    policy = dict(
         policy_cls="MlpPolicy",
         policy_kwargs=dict(
             activation_fn=nn.ReLU,
@@ -245,7 +245,7 @@ def seals_walker():
     locals().update(**MUJOCO_SHARED_LOCALS)
     environment = dict(gym_id="seals/Walker2d-v0")
     demonstrations = dict(rollout_type="ppo-huggingface")
-    train = dict(
+    policy = dict(
         policy_cls="MlpPolicy",
         policy_kwargs=dict(
             activation_fn=nn.ReLU,
diff --git a/src/imitation/scripts/config/train_preference_comparisons.py b/src/imitation/scripts/config/train_preference_comparisons.py
index 236edad47..1a039c762 100644
--- a/src/imitation/scripts/config/train_preference_comparisons.py
+++ b/src/imitation/scripts/config/train_preference_comparisons.py
@@ -120,20 +120,13 @@ def seals_half_cheetah():
     )
     num_iterations = 50
     total_timesteps = 20000000
-    # train = dict(
-    #     policy_cls="MlpPolicy",
-    #     policy_kwargs=dict(
-    #         activation_fn=nn.ReLU,
-    #         # net_arch=[dict(pi=[64, 64], vf=[64, 64])],
-    #     ),
-    # )
 
 
 @train_preference_comparisons_ex.named_config
 def seals_hopper():
     # locals().update(**MUJOCO_SHARED_LOCALS)
     environment = dict(gym_id="seals/Hopper-v0")
-    train = dict(
+    policy = dict(
         policy_cls="MlpPolicy",
         policy_kwargs=dict(
             activation_fn=nn.ReLU,
@@ -160,7 +153,7 @@ def seals_hopper():
 def seals_swimmer():
     # locals().update(**MUJOCO_SHARED_LOCALS)
     environment = dict(gym_id="seals/Swimmer-v0")
-    train = dict(
+    policy = dict(
         policy_cls="MlpPolicy",
         policy_kwargs=dict(
             activation_fn=nn.ReLU,
@@ -188,7 +181,7 @@ def seals_swimmer():
 def seals_walker():
     # locals().update(**MUJOCO_SHARED_LOCALS)
     environment = dict(gym_id="seals/Walker2d-v0")
-    train = dict(
+    policy = dict(
         policy_cls="MlpPolicy",
         policy_kwargs=dict(
             activation_fn=nn.ReLU,
diff --git a/src/imitation/scripts/config/train_rl.py b/src/imitation/scripts/config/train_rl.py
index 34b45250c..a5475540d 100644
--- a/src/imitation/scripts/config/train_rl.py
+++ b/src/imitation/scripts/config/train_rl.py
@@ -74,7 +74,7 @@ def cartpole():
 def seals_cartpole():
     environment = dict(gym_id="seals/CartPole-v0", num_vec=8)
     total_timesteps = int(1e5)
-    train = dict(
+    policy = dict(
         policy_cls="MlpPolicy",
         policy_kwargs=dict(
             activation_fn=nn.ReLU,
@@ -111,7 +111,7 @@ def seals_half_cheetah():
         num_vec=1,
     )
 
-    train = dict(
+    policy = dict(
         policy_cls="MlpPolicy",
         policy_kwargs=dict(
             activation_fn=nn.Tanh,
@@ -141,7 +141,7 @@ def seals_half_cheetah():
 @train_rl_ex.named_config
 def seals_hopper():
     environment = dict(gym_id="seals/Hopper-v0", num_vec=1)
-    train = dict(
+    policy = dict(
         policy_cls="MlpPolicy",
         policy_kwargs=dict(
             activation_fn=nn.ReLU,
@@ -211,7 +211,7 @@ def seals_ant():
         num_vec=1,
     )
 
-    train = dict(
+    policy = dict(
         policy_cls="MlpPolicy",
         policy_kwargs=dict(
             activation_fn=nn.Tanh,
@@ -242,7 +242,7 @@ def seals_ant():
 @train_rl_ex.named_config
 def seals_swimmer():
     environment = dict(gym_id="seals/Swimmer-v0", num_vec=1)
-    train = dict(
+    policy = dict(
         policy_cls="MlpPolicy",
         policy_kwargs=dict(
             activation_fn=nn.ReLU,
@@ -273,7 +273,7 @@ def seals_swimmer():
 @train_rl_ex.named_config
 def seals_walker():
     environment = dict(gym_id="seals/Walker2d-v0", num_vec=1)
-    train = dict(
+    policy = dict(
         policy_cls="MlpPolicy",
         policy_kwargs=dict(
             activation_fn=nn.ReLU,

From 9fdf8786663473334f94b24a841a832b29da435f Mon Sep 17 00:00:00 2001
From: taufeeque9 <9taufeeque9@gmail.com>
Date: Tue, 16 May 2023 19:00:32 +0530
Subject: [PATCH 14/47] Changes from review

---
 src/imitation/scripts/config/parallel.py       | 16 ++++++++--------
 .../scripts/config/train_adversarial.py        |  4 ----
 .../config/train_preference_comparisons.py     |  6 ------
 src/imitation/scripts/parallel.py              | 18 +++++++-----------
 src/imitation/scripts/train_imitation.py       |  1 +
 5 files changed, 16 insertions(+), 29 deletions(-)

diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py
index ea90f11b8..b52446154 100644
--- a/src/imitation/scripts/config/parallel.py
+++ b/src/imitation/scripts/config/parallel.py
@@ -102,9 +102,6 @@ def example_cartpole_rl():
     resources_per_trial = dict(cpu=4)
 
 
-EASY_ENVS = ["cartpole", "pendulum", "mountain_car"]
-
-
 @parallel_ex.named_config
 def example_rl():
     sacred_ex_name = "train_rl"
@@ -135,18 +132,21 @@ def example_bc():
     sacred_ex_name = "train_imitation"
     run_name = "bc_tuning"
     base_named_configs = ["logging.wandb_logging"]
-    base_config_updates = {"environment": {"num_vec": 1}}
+    base_config_updates = {
+        "environment": {"num_vec": 1},
+        "demonstrations": {"rollout_type": "ppo-huggingface"},
+    }
     search_space = {
         "config_updates": {
-            "bc_kwargs": dict(
+            "bc": dict(
                 batch_size=tune.choice([8, 16, 32, 64]),
                 l2_weight=tune.loguniform(1e-6, 1e-2),  # L2 regularization weight
                 optimizer_kwargs=dict(
                     lr=tune.loguniform(1e-5, 1e-2),
                 ),
-            ),
-            "bc_train_kwargs": dict(
-                n_epochs=tune.choice([1, 5, 10, 20]),
+                train_kwargs=dict(
+                    n_epochs=tune.choice([1, 5, 10, 20]),
+                ),
             ),
         },
         "command_name": "bc",
diff --git a/src/imitation/scripts/config/train_adversarial.py b/src/imitation/scripts/config/train_adversarial.py
index 7989f3eab..ef675eab6 100644
--- a/src/imitation/scripts/config/train_adversarial.py
+++ b/src/imitation/scripts/config/train_adversarial.py
@@ -156,7 +156,6 @@ def half_cheetah():
 
 @train_adversarial_ex.named_config
 def seals_half_cheetah():
-    # locals().update(**MUJOCO_SHARED_LOCALS)
     environment = dict(gym_id="seals/HalfCheetah-v0")
     demonstrations = dict(rollout_type="ppo-huggingface")
     rl = dict(
@@ -184,7 +183,6 @@ def seals_half_cheetah():
 
 @train_adversarial_ex.named_config
 def seals_hopper():
-    # locals().update(**MUJOCO_SHARED_LOCALS)
     environment = dict(gym_id="seals/Hopper-v0")
     demonstrations = dict(rollout_type="ppo-huggingface")
     policy = dict(
@@ -212,7 +210,6 @@ def seals_hopper():
 
 @train_adversarial_ex.named_config
 def seals_swimmer():
-    # locals().update(**MUJOCO_SHARED_LOCALS)
     environment = dict(gym_id="seals/Swimmer-v0")
     total_timesteps = int(2e6)
     demonstrations = dict(rollout_type="ppo-huggingface")
@@ -242,7 +239,6 @@ def seals_swimmer():
 
 @train_adversarial_ex.named_config
 def seals_walker():
-    locals().update(**MUJOCO_SHARED_LOCALS)
     environment = dict(gym_id="seals/Walker2d-v0")
     demonstrations = dict(rollout_type="ppo-huggingface")
     policy = dict(
diff --git a/src/imitation/scripts/config/train_preference_comparisons.py b/src/imitation/scripts/config/train_preference_comparisons.py
index 1a039c762..4fe9c793e 100644
--- a/src/imitation/scripts/config/train_preference_comparisons.py
+++ b/src/imitation/scripts/config/train_preference_comparisons.py
@@ -73,8 +73,6 @@ def cartpole():
 
 @train_preference_comparisons_ex.named_config
 def seals_ant():
-    # locals().update(**MUJOCO_SHARED_LOCALS)
-    # locals().update(**ANT_SHARED_LOCALS)
     environment = dict(gym_id="seals/Ant-v0")
     rl = dict(
         batch_size=2048,
@@ -102,7 +100,6 @@ def half_cheetah():
 
 @train_preference_comparisons_ex.named_config
 def seals_half_cheetah():
-    # locals().update(**MUJOCO_SHARED_LOCALS)
     environment = dict(gym_id="seals/HalfCheetah-v0")
     rl = dict(
         batch_size=512,
@@ -124,7 +121,6 @@ def seals_half_cheetah():
 
 @train_preference_comparisons_ex.named_config
 def seals_hopper():
-    # locals().update(**MUJOCO_SHARED_LOCALS)
     environment = dict(gym_id="seals/Hopper-v0")
     policy = dict(
         policy_cls="MlpPolicy",
@@ -151,7 +147,6 @@ def seals_hopper():
 
 @train_preference_comparisons_ex.named_config
 def seals_swimmer():
-    # locals().update(**MUJOCO_SHARED_LOCALS)
     environment = dict(gym_id="seals/Swimmer-v0")
     policy = dict(
         policy_cls="MlpPolicy",
@@ -179,7 +174,6 @@ def seals_swimmer():
 
 @train_preference_comparisons_ex.named_config
 def seals_walker():
-    # locals().update(**MUJOCO_SHARED_LOCALS)
     environment = dict(gym_id="seals/Walker2d-v0")
     policy = dict(
         policy_cls="MlpPolicy",
diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py
index 2dd2254bf..53b4c2b32 100644
--- a/src/imitation/scripts/parallel.py
+++ b/src/imitation/scripts/parallel.py
@@ -139,7 +139,7 @@ def parallel(
                         syncer=syncer,
                     ),
                     metric="mean_return",
-                    resume=resume,
+                    resume=True,
                 )
                 print(
                     "Live trials:",
@@ -176,14 +176,7 @@ def parallel(
                 metric="mean_return",
                 mode="max",
             )
-        if sacred_ex_name == "train_rl":
-            key_prefix = ""
-        elif sacred_ex_name == "train_preference_comparisons":
-            key_prefix = "rollout/"
-        else:
-            key_prefix = "imit_stats/"
-        key = key_prefix + "monitor_return_mean"
-
+        key = "mean_return"
         if eval_best_trial:
             df = result.results_df
             df = df[df["config/named_configs"].notna()]
@@ -199,7 +192,7 @@ def parallel(
             # store mean return of runs across all seeds in a group
             df["mean_return"] = grps[key].transform(lambda x: x.mean())
             best_config_df = df[df["mean_return"] == df["mean_return"].max()]
-            row = best_config_df.loc[0]
+            row = best_config_df.iloc[0]
             best_config_tag = row["experiment_tag"]
             if result.trials is not None:
                 trial = [
@@ -215,7 +208,10 @@ def parallel(
                 # update cpus per trial only if it is provided in `resources_per_trial`
                 # Uses the default values (cpu=1) if it is not provided
                 if "cpu" in resources_per_trial:
-                    resources_per_trial["cpu"] *= eval_best_trial_resource_multiplier
+                    resources_per_trial_eval = copy.deepcopy(resources_per_trial)
+                    resources_per_trial_eval[
+                        "cpu"
+                    ] *= eval_best_trial_resource_multiplier
                     best_config["config_updates"].update(
                         environment=dict(num_vec=resources_per_trial["cpu"]),
                     )
diff --git a/src/imitation/scripts/train_imitation.py b/src/imitation/scripts/train_imitation.py
index 56633e33a..5a6925eb3 100644
--- a/src/imitation/scripts/train_imitation.py
+++ b/src/imitation/scripts/train_imitation.py
@@ -76,6 +76,7 @@ def bc(
     expert_stats = _try_computing_expert_stats(expert_trajs)
     if expert_stats is not None:
         stats["expert_stats"] = expert_stats
+    stats["mean_return"] = imit_stats["monitor_return_mean"]
     return stats
 
 

From 1c1dbc44970016fd5ef6bb965cf69afbf33590a1 Mon Sep 17 00:00:00 2001
From: taufeeque9 <9taufeeque9@gmail.com>
Date: Tue, 16 May 2023 21:43:43 +0530
Subject: [PATCH 15/47] Fix errors with some configs

---
 src/imitation/scripts/config/parallel.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py
index b52446154..095c67107 100644
--- a/src/imitation/scripts/config/parallel.py
+++ b/src/imitation/scripts/config/parallel.py
@@ -165,8 +165,9 @@ def example_dagger():
     base_named_configs = ["logging.wandb_logging"]
     base_config_updates = {
         "environment": {"num_vec": 1},
+        "demonstrations": {"rollout_type": "ppo-huggingface"},
         "dagger": {"total_timesteps": 1e5},
-        "bc_kwargs": {
+        "bc": {
             "batch_size": 16,
             "l2_weight": 1e-4,
             "optimizer_kwargs": {"lr": 1e-3},
@@ -174,8 +175,10 @@ def example_dagger():
     }
     search_space = {
         "config_updates": {
-            "bc_train_kwargs": dict(
-                n_epochs=tune.choice([1, 5, 10]),
+            "bc": dict(
+                train_kwargs=dict(
+                    n_epochs=tune.choice([1, 5, 10]),
+                ),
             ),
             "dagger": dict(
                 beta_schedule=tune.choice(
@@ -201,6 +204,7 @@ def example_gail():
     base_named_configs = ["logging.wandb_logging"]
     base_config_updates = {
         "environment": {"num_vec": 1},
+        "demonstrations": {"rollout_type": "ppo-huggingface"},
         "total_timesteps": 1e7,
     }
     search_space = {
@@ -234,6 +238,7 @@ def example_airl():
     base_named_configs = ["logging.wandb_logging"]
     base_config_updates = {
         "environment": {"num_vec": 1},
+        "demonstrations": {"rollout_type": "ppo-huggingface"},
         "total_timesteps": 1e7,
     }
     search_space = {
@@ -273,11 +278,9 @@ def example_pc():
         "gatherer_kwargs": {"sample": True},
     }
     search_space = {
-        "named_configs": tune.choice(
-            [
-                ["reward.normalize_output_disable"],
-            ],
-        ),
+        "named_configs": [
+            ["reward.normalize_output_disable"],
+        ],
         "config_updates": {
             "train": {
                 "policy_kwargs": {

From 44c4e97d64980118b3a07f06f7c15edb273a16a1 Mon Sep 17 00:00:00 2001
From: taufeeque9 <9taufeeque9@gmail.com>
Date: Wed, 14 Jun 2023 06:38:42 +0530
Subject: [PATCH 16/47] Updates based on review

---
 src/imitation/scripts/analyze.py              | 29 ++++++++++---------
 src/imitation/scripts/parallel.py             | 26 ++++++++++++-----
 src/imitation/scripts/train_adversarial.py    |  1 -
 src/imitation/scripts/train_imitation.py      |  1 -
 .../scripts/train_preference_comparisons.py   |  3 +-
 src/imitation/scripts/train_rl.py             |  1 -
 6 files changed, 35 insertions(+), 26 deletions(-)

diff --git a/src/imitation/scripts/analyze.py b/src/imitation/scripts/analyze.py
index f036efe40..8977fed47 100644
--- a/src/imitation/scripts/analyze.py
+++ b/src/imitation/scripts/analyze.py
@@ -272,40 +272,43 @@ def analyze_imitation(
         The DataFrame generated from the Sacred logs.
     """
     if table_verbosity == 3:
+        # Get column names for which we have get value using make_entry_fn
+        # These are same across Level 2 & 3. In Level 3, we additionally add remaining
+        #  config columns.
         table_entry_fns_subset = _get_table_entry_fns_subset(2)
     else:
         table_entry_fns_subset = _get_table_entry_fns_subset(table_verbosity)
 
-    df = pd.DataFrame()
+    output_table = pd.DataFrame()
     for sd in _gather_sacred_dicts():
-        new_df = pd.DataFrame()
-        if table_verbosity == -1:
+        if table_verbosity == 3:
             # gets all config columns
-            new_df = pd.json_normalize(sd.config)
+            row = pd.json_normalize(sd.config)
         else:
-            new_df = new_df.append({}, ignore_index=True)
+            # create an empty dataframe with a single row
+            row = pd.DataFrame(index=[0])
 
         for col_name, make_entry_fn in table_entry_fns_subset.items():
-            new_df[col_name] = make_entry_fn(sd)
+            row[col_name] = make_entry_fn(sd)
 
-        df = pd.concat([df, new_df])
+        output_table = pd.concat([output_table, row])
 
-    if len(df) > 0:
-        df.sort_values(by=["algo", "env_name"], inplace=True)
+    if len(output_table) > 0:
+        output_table.sort_values(by=["algo", "env_name"], inplace=True)
 
     display_options: Mapping[str, Any] = dict(index=False)
     if csv_output_path is not None:
-        df.to_csv(csv_output_path, **display_options)
+        output_table.to_csv(csv_output_path, **display_options)
         print(f"Wrote CSV file to {csv_output_path}")
     if tex_output_path is not None:
-        s: str = df.to_latex(**display_options)
+        s: str = output_table.to_latex(**display_options)
         with open(tex_output_path, "w") as f:
             f.write(s)
         print(f"Wrote TeX file to {tex_output_path}")
 
     if print_table:
-        print(df.to_string(**display_options))
-    return df
+        print(output_table.to_string(**display_options))
+    return output_table
 
 
 def _make_return_summary(stats: dict, prefix="") -> str:
diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py
index 53b4c2b32..2bb0129cb 100644
--- a/src/imitation/scripts/parallel.py
+++ b/src/imitation/scripts/parallel.py
@@ -127,6 +127,12 @@ def parallel(
 
     ray.init(**init_kwargs)
     search_alg = search.Repeater(optuna.OptunaSearch(), repeat=repeat)
+
+    if sacred_ex_name == "train_rl":
+        return_key = "monitor_return_mean"
+    else:
+        return_key = "imit_stats/monitor_return_mean"
+
     try:
         if experiment_checkpoint_path:
             if resume:
@@ -173,10 +179,9 @@ def parallel(
                     syncer=syncer,
                 ),
                 search_alg=search_alg,
-                metric="mean_return",
+                metric=return_key,
                 mode="max",
             )
-        key = "mean_return"
         if eval_best_trial:
             df = result.results_df
             df = df[df["config/named_configs"].notna()]
@@ -190,7 +195,7 @@ def parallel(
             ]
             grps = df.groupby(grp_keys)
             # store mean return of runs across all seeds in a group
-            df["mean_return"] = grps[key].transform(lambda x: x.mean())
+            df["mean_return"] = grps[return_key].transform(lambda x: x.mean())
             best_config_df = df[df["mean_return"] == df["mean_return"].max()]
             row = best_config_df.iloc[0]
             best_config_tag = row["experiment_tag"]
@@ -200,20 +205,25 @@ def parallel(
                 ][0]
                 best_config = trial.config
                 print("Mean return:", row["mean_return"])
-                print("All returns:", df[df["mean_return"] == row["mean_return"]][key])
+                print(
+                    "All returns:",
+                    df[df["mean_return"] == row["mean_return"]][return_key],
+                )
                 print("Total seeds:", (df["mean_return"] == row["mean_return"]).sum())
                 best_config["config_updates"].update(
                     seed=ray.tune.grid_search(list(range(100, 100 + eval_trial_seeds))),
                 )
+
+                resources_per_trial_eval = copy.deepcopy(resources_per_trial)
                 # update cpus per trial only if it is provided in `resources_per_trial`
                 # Uses the default values (cpu=1) if it is not provided
                 if "cpu" in resources_per_trial:
-                    resources_per_trial_eval = copy.deepcopy(resources_per_trial)
+
                     resources_per_trial_eval[
                         "cpu"
                     ] *= eval_best_trial_resource_multiplier
                     best_config["config_updates"].update(
-                        environment=dict(num_vec=resources_per_trial["cpu"]),
+                        environment=dict(num_vec=resources_per_trial_eval["cpu"]),
                     )
 
                 eval_result = ray.tune.run(
@@ -224,9 +234,9 @@ def parallel(
                         "command_name": best_config.get("command_name", None),
                     },
                     name=run_name + "_best_hp_eval",
-                    resources_per_trial=resources_per_trial,
+                    resources_per_trial=resources_per_trial_eval,
                 )
-                returns = eval_result.results_df["mean_return"].to_numpy()
+                returns = eval_result.results_df[return_key].to_numpy()
                 print("All returns:", returns)
                 print("Mean:", np.mean(returns))
                 print("Std:", np.std(returns))
diff --git a/src/imitation/scripts/train_adversarial.py b/src/imitation/scripts/train_adversarial.py
index d1f99a54b..26c8d7bcf 100644
--- a/src/imitation/scripts/train_adversarial.py
+++ b/src/imitation/scripts/train_adversarial.py
@@ -167,7 +167,6 @@ def callback(round_num: int, /) -> None:
     return {
         "imit_stats": imit_stats,
         "expert_stats": rollout.rollout_stats(expert_trajs),
-        "mean_return": imit_stats["monitor_return_mean"],
     }
 
 
diff --git a/src/imitation/scripts/train_imitation.py b/src/imitation/scripts/train_imitation.py
index 5a6925eb3..56633e33a 100644
--- a/src/imitation/scripts/train_imitation.py
+++ b/src/imitation/scripts/train_imitation.py
@@ -76,7 +76,6 @@ def bc(
     expert_stats = _try_computing_expert_stats(expert_trajs)
     if expert_stats is not None:
         stats["expert_stats"] = expert_stats
-    stats["mean_return"] = imit_stats["monitor_return_mean"]
     return stats
 
 
diff --git a/src/imitation/scripts/train_preference_comparisons.py b/src/imitation/scripts/train_preference_comparisons.py
index b054a5a6c..867a666a4 100644
--- a/src/imitation/scripts/train_preference_comparisons.py
+++ b/src/imitation/scripts/train_preference_comparisons.py
@@ -280,8 +280,7 @@ def save_callback(iteration_num):
         # Storing and evaluating policy only useful if we generated trajectory data
         if bool(trajectory_path is None):
             results = dict(results)
-            results["rollout"] = policy_evaluation.eval_policy(agent, venv)
-            results["mean_return"] = results["rollout"]["monitor_return_mean"]
+            results["imit_stats"] = policy_evaluation.eval_policy(agent, venv)
 
     if save_preferences:
         main_trainer.dataset.save(log_dir / "preferences.pkl")
diff --git a/src/imitation/scripts/train_rl.py b/src/imitation/scripts/train_rl.py
index 20a7b263c..6780a557b 100644
--- a/src/imitation/scripts/train_rl.py
+++ b/src/imitation/scripts/train_rl.py
@@ -159,7 +159,6 @@ def train_rl(
 
         # Final evaluation of expert policy.
         eval_stats = policy_evaluation.eval_policy(rl_algo, venv)
-        eval_stats["mean_return"] = eval_stats["monitor_return_mean"]
         return eval_stats
 
 

From ab0126998a4f8beb44e93eb11d6c2b17e68038a8 Mon Sep 17 00:00:00 2001
From: taufeeque9 <9taufeeque9@gmail.com>
Date: Wed, 14 Jun 2023 07:40:52 +0530
Subject: [PATCH 17/47] Change metric everywhere

---
 src/imitation/scripts/parallel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py
index 2bb0129cb..6f77330df 100644
--- a/src/imitation/scripts/parallel.py
+++ b/src/imitation/scripts/parallel.py
@@ -144,7 +144,7 @@ def parallel(
                         upload_dir=upload_dir,
                         syncer=syncer,
                     ),
-                    metric="mean_return",
+                    metric=return_key,
                     resume=True,
                 )
                 print(

From e896d7db127f9025d89387cc10e513409fd973b1 Mon Sep 17 00:00:00 2001
From: taufeeque9 <9taufeeque9@gmail.com>
Date: Tue, 11 Jul 2023 16:03:02 +0530
Subject: [PATCH 18/47] Separate tuning code from parallel.py

---
 benchmarking/tuning.py                   | 102 ++++++++++
 benchmarking/tuning_config.py            | 237 +++++++++++++++++++++++
 setup.cfg                                |   1 +
 src/imitation/scripts/config/parallel.py | 216 +--------------------
 src/imitation/scripts/parallel.py        | 101 ++--------
 5 files changed, 363 insertions(+), 294 deletions(-)
 create mode 100644 benchmarking/tuning.py
 create mode 100644 benchmarking/tuning_config.py

diff --git a/benchmarking/tuning.py b/benchmarking/tuning.py
new file mode 100644
index 000000000..b4e62a84a
--- /dev/null
+++ b/benchmarking/tuning.py
@@ -0,0 +1,102 @@
+"""Tunes the hyperparameters of the algorithms."""
+
+import copy
+import pathlib
+from typing import Any, Dict
+
+import numpy as np
+import ray
+from pandas.api import types as pd_types
+from sacred.observers import FileStorageObserver
+from tuning_config import parallel_ex, tuning_ex
+
+
+@tuning_ex.main
+def tune(
+    parallel: Dict[str, Any],
+    eval_best_trial: bool = False,
+    eval_best_trial_resource_multiplier: int = 1,
+    eval_trial_seeds: int = 5,
+) -> None:
+    """Tune hyperparameters of imitation algorithms using parallel script.
+
+    Args:
+        parallel: A dictionary of arguments from the parallel script.
+        eval_best_trial: Whether to evaluate the trial with the best mean return
+            at the end of tuning on a separate set of seeds.
+        eval_best_trial_resource_multiplier: factor by which to multiply the
+            number of cpus per trial in `resources_per_trial`.
+        eval_trial_seeds: Number of distinct seeds to evaluate the best trial on.
+    """
+    run = parallel_ex.run(config_updates=parallel)
+    result = run.result
+
+    if eval_best_trial:
+        if parallel["sacred_ex_name"] == "train_rl":
+            return_key = "monitor_return_mean"
+        else:
+            return_key = "imit_stats/monitor_return_mean"
+        df = result.results_df
+        df = df[df["config/named_configs"].notna()]
+        # convert object dtype to str required by df.groupby
+        for col in df.columns:
+            if pd_types.is_object_dtype(df[col]):
+                df[col] = df[col].astype("str")
+        # group into separate HP configs
+        grp_keys = [c for c in df.columns if c.startswith("config") and "seed" not in c]
+        grps = df.groupby(grp_keys)
+        # store mean return of runs across all seeds in a group
+        df["mean_return"] = grps[return_key].transform(lambda x: x.mean())
+        best_config_df = df[df["mean_return"] == df["mean_return"].max()]
+        row = best_config_df.iloc[0]
+        best_config_tag = row["experiment_tag"]
+        if result.trials is not None:
+            trial = [t for t in result.trials if best_config_tag in t.experiment_tag][0]
+            best_config = trial.config
+            print("Mean return:", row["mean_return"])
+            print(
+                "All returns:",
+                df[df["mean_return"] == row["mean_return"]][return_key],
+            )
+            print("Total seeds:", (df["mean_return"] == row["mean_return"]).sum())
+
+            best_config["config_updates"].update(
+                seed=ray.tune.grid_search(list(range(100, 100 + eval_trial_seeds))),
+            )
+
+            resources_per_trial_eval = copy.deepcopy(parallel["resources_per_trial"])
+            # update cpus per trial only if it is provided in `resources_per_trial`
+            # Uses the default values (cpu=1) if it is not provided
+            if "cpu" in parallel["resources_per_trial"]:
+                resources_per_trial_eval["cpu"] *= eval_best_trial_resource_multiplier
+
+            eval_config_updates = parallel.copy()
+            eval_config_updates.update(
+                run_name=parallel["run_name"] + "_best_hp_eval",
+                num_samples=1,
+                search_space=best_config,
+                base_named_configs=parallel["base_named_configs"],
+                base_config_updates=parallel["base_config_updates"],
+                resources_per_trial=resources_per_trial_eval,
+                search_alg=None,
+                repeat=1,
+                experiment_checkpoint_path="",
+                resume=False,
+            )
+            eval_run = parallel_ex.run(config_updates=eval_config_updates)
+            eval_result = eval_run.result
+            returns = eval_result.results_df[return_key].to_numpy()
+            print("All returns:", returns)
+            print("Mean:", np.mean(returns))
+            print("Std:", np.std(returns))
+
+
+def main_console():
+    observer_path = pathlib.Path.cwd() / "output" / "sacred" / "tuning"
+    observer = FileStorageObserver(observer_path)
+    tuning_ex.observers.append(observer)
+    tuning_ex.run_commandline()
+
+
+if __name__ == "__main__":  # pragma: no cover
+    main_console()
diff --git a/benchmarking/tuning_config.py b/benchmarking/tuning_config.py
new file mode 100644
index 000000000..79c8d0347
--- /dev/null
+++ b/benchmarking/tuning_config.py
@@ -0,0 +1,237 @@
+"""Config files for tuning experiments."""
+
+import ray.tune as tune
+import sacred
+from torch import nn
+
+from imitation.algorithms import dagger
+from imitation.scripts.parallel import parallel_ex
+
+tuning_ex = sacred.Experiment("tuning", ingredients=[parallel_ex])
+
+
+@tuning_ex.named_config
+def example_rl():
+    parallel = dict(
+        sacred_ex_name="train_rl",
+        run_name="rl_tuning",
+        base_named_configs=["logging.wandb_logging"],
+        base_config_updates={"environment": {"num_vec": 1}},
+        search_space={
+            "config_updates": {
+                "rl": {
+                    "batch_size": tune.choice([512, 1024, 2048, 4096, 8192]),
+                    "rl_kwargs": {
+                        "learning_rate": tune.loguniform(1e-5, 1e-2),
+                        "batch_size": tune.choice([64, 128, 256, 512]),
+                        "n_epochs": tune.choice([5, 10, 20]),
+                    },
+                },
+            },
+        },
+        num_samples=100,
+        repeat=1,
+        resources_per_trial=dict(cpu=1),
+    )
+    eval_best_trial = True
+    eval_trial_seeds = 5
+
+
+@tuning_ex.named_config
+def example_bc():
+    parallel = dict(
+        sacred_ex_name="train_imitation",
+        run_name="bc_tuning",
+        base_named_configs=["logging.wandb_logging"],
+        base_config_updates={
+            "environment": {"num_vec": 1},
+            "demonstrations": {"source": "huggingface"},
+        },
+        search_space={
+            "config_updates": {
+                "bc": dict(
+                    batch_size=tune.choice([8, 16, 32, 64]),
+                    l2_weight=tune.loguniform(1e-6, 1e-2),  # L2 regularization weight
+                    optimizer_kwargs=dict(
+                        lr=tune.loguniform(1e-5, 1e-2),
+                    ),
+                    train_kwargs=dict(
+                        n_epochs=tune.choice([1, 5, 10, 20]),
+                    ),
+                ),
+            },
+            "command_name": "bc",
+        },
+        num_samples=2,
+        repeat=1,
+        resources_per_trial=dict(cpu=1),
+    )
+
+    eval_best_trial = True
+    eval_trial_seeds = 5
+    eval_best_trial_resource_multiplier = 1
+
+
+@tuning_ex.named_config
+def example_dagger():
+    parallel = dict(
+        sacred_ex_name="train_imitation",
+        run_name="dagger_tuning",
+        base_named_configs=["logging.wandb_logging"],
+        base_config_updates={
+            "environment": {"num_vec": 1},
+            "demonstrations": {"source": "huggingface"},
+            "dagger": {"total_timesteps": 1e5},
+            "bc": {
+                "batch_size": 16,
+                "l2_weight": 1e-4,
+                "optimizer_kwargs": {"lr": 1e-3},
+            },
+        },
+        search_space={
+            "config_updates": {
+                "bc": dict(
+                    train_kwargs=dict(
+                        n_epochs=tune.choice([1, 5, 10]),
+                    ),
+                ),
+                "dagger": dict(
+                    beta_schedule=tune.choice(
+                        [dagger.LinearBetaSchedule(i) for i in [1, 5, 15]]
+                        + [dagger.ExponentialBetaSchedule(i) for i in [0.3, 0.5, 0.7]],
+                    ),
+                    rollout_round_min_episodes=tune.choice([3, 5, 10]),
+                ),
+            },
+            "command_name": "dagger",
+        },
+        num_samples=50,
+        repeat=3,
+        resources_per_trial=dict(cpu=1),
+    )
+    eval_best_trial = True
+    eval_trial_seeds = 5
+
+
+@tuning_ex.named_config
+def example_gail():
+    parallel = dict(
+        sacred_ex_name="train_adversarial",
+        run_name="gail_tuning_hc",
+        base_named_configs=["logging.wandb_logging"],
+        base_config_updates={
+            "environment": {"num_vec": 1},
+            "demonstrations": {"source": "huggingface"},
+            "total_timesteps": 1e7,
+        },
+        search_space={
+            "config_updates": {
+                "algorithm_kwargs": dict(
+                    demo_batch_size=tune.choice([32, 128, 512, 2048, 8192]),
+                    n_disc_updates_per_round=tune.choice([8, 16]),
+                ),
+                "rl": {
+                    "batch_size": tune.choice([4096, 8192, 16384]),
+                    "rl_kwargs": {
+                        "ent_coef": tune.loguniform(1e-7, 1e-3),
+                        "learning_rate": tune.loguniform(1e-5, 1e-2),
+                    },
+                },
+                "algorithm_specific": {},
+            },
+            "command_name": "gail",
+        },
+        num_samples=100,
+        repeat=3,
+        resources_per_trial=dict(cpu=1),
+    )
+    eval_best_trial = True
+    eval_trial_seeds = 5
+
+
+@tuning_ex.named_config
+def example_airl():
+    parallel = dict(
+        sacred_ex_name="train_adversarial",
+        run_name="airl_tuning",
+        base_named_configs=["logging.wandb_logging"],
+        base_config_updates={
+            "environment": {"num_vec": 1},
+            "demonstrations": {"source": "huggingface"},
+            "total_timesteps": 1e7,
+        },
+        search_space={
+            "config_updates": {
+                "algorithm_kwargs": dict(
+                    demo_batch_size=tune.choice([32, 128, 512, 2048, 8192]),
+                    n_disc_updates_per_round=tune.choice([8, 16]),
+                ),
+                "rl": {
+                    "batch_size": tune.choice([4096, 8192, 16384]),
+                    "rl_kwargs": {
+                        "ent_coef": tune.loguniform(1e-7, 1e-3),
+                        "learning_rate": tune.loguniform(1e-5, 1e-2),
+                    },
+                },
+                "algorithm_specific": {},
+            },
+            "command_name": "airl",
+        },
+        num_samples=100,
+        repeat=3,
+        resources_per_trial=dict(cpu=1),
+    )
+
+    eval_best_trial = True
+    eval_trial_seeds = 5
+
+
+@tuning_ex.named_config
+def example_pc():
+    parallel = dict(
+        sacred_ex_name="train_preference_comparisons",
+        run_name="pc_tuning",
+        base_named_configs=["logging.wandb_logging"],
+        base_config_updates={
+            "environment": {"num_vec": 1},
+            "demonstrations": {"source": "huggingface"},
+            "total_timesteps": 2e7,
+            "total_comparisons": 5000,
+            "query_schedule": "hyperbolic",
+            "gatherer_kwargs": {"sample": True},
+        },
+        search_space={
+            "named_configs": [
+                ["reward.normalize_output_disable"],
+            ],
+            "config_updates": {
+                "train": {
+                    "policy_kwargs": {
+                        "activation_fn": tune.choice(
+                            [
+                                nn.ReLU,
+                            ],
+                        ),
+                    },
+                },
+                "num_iterations": tune.choice([25, 50]),
+                "initial_comparison_frac": tune.choice([0.1, 0.25]),
+                "reward_trainer_kwargs": {
+                    "epochs": tune.choice([1, 3, 6]),
+                },
+                "rl": {
+                    "batch_size": tune.choice([512, 2048, 8192]),
+                    "rl_kwargs": {
+                        "learning_rate": tune.loguniform(1e-5, 1e-2),
+                        "ent_coef": tune.loguniform(1e-7, 1e-3),
+                    },
+                },
+            },
+        },
+        num_samples=100,
+        repeat=3,
+        resources_per_trial=dict(cpu=1),
+    )
+
+    eval_best_trial = True
+    eval_trial_seeds = 5
diff --git a/setup.cfg b/setup.cfg
index 979c3ca46..f39db322f 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -7,6 +7,7 @@ per-file-ignores =
 # F841 local variable unused [for Sacred config scopes]
   src/imitation/scripts/config/*.py:F841
   ../src/imitation/scripts/config/*.py:F841
+  benchmarking/tuning_config.py:F841
   src/imitation/envs/examples/airl_envs/*.py:D
 
 [darglint]
diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py
index 095c67107..e9c5b8245 100644
--- a/src/imitation/scripts/config/parallel.py
+++ b/src/imitation/scripts/config/parallel.py
@@ -16,9 +16,7 @@
 import numpy as np
 import ray.tune as tune
 import sacred
-from torch import nn
 
-from imitation.algorithms import dagger
 from imitation.util.util import make_unique_timestamp
 
 parallel_ex = sacred.Experiment("parallel")
@@ -45,6 +43,10 @@ def config():
     eval_trial_seeds = 5  # Number of seeds to search over by default
     num_samples = 1  # Number of samples per grid search configuration
     repeat = 1
+    search_alg = "optuna"  # search algorithm to use
+    experiment_checkpoint_path = ""  # Path to checkpoint of experiment to resume
+    syncer = None  # Sacred syncer to use
+    resume = False  # Whether to resume experiment from checkpoint
 
 
 # Debug named configs
@@ -100,213 +102,3 @@ def example_cartpole_rl():
     }
     base_named_configs = ["cartpole"]
     resources_per_trial = dict(cpu=4)
-
-
-@parallel_ex.named_config
-def example_rl():
-    sacred_ex_name = "train_rl"
-    run_name = "rl_tuning"
-    base_named_configs = ["logging.wandb_logging"]
-    base_config_updates = {"environment": {"num_vec": 1}}
-    search_space = {
-        "config_updates": {
-            "rl": {
-                "batch_size": tune.choice([512, 1024, 2048, 4096, 8192]),
-                "rl_kwargs": {
-                    "learning_rate": tune.loguniform(1e-5, 1e-2),
-                    "batch_size": tune.choice([64, 128, 256, 512]),
-                    "n_epochs": tune.choice([5, 10, 20]),
-                },
-            },
-        },
-    }
-    num_samples = 100
-    eval_best_trial = True
-    eval_trial_seeds = 5
-    repeat = 1
-    resources_per_trial = dict(cpu=1)
-
-
-@parallel_ex.named_config
-def example_bc():
-    sacred_ex_name = "train_imitation"
-    run_name = "bc_tuning"
-    base_named_configs = ["logging.wandb_logging"]
-    base_config_updates = {
-        "environment": {"num_vec": 1},
-        "demonstrations": {"rollout_type": "ppo-huggingface"},
-    }
-    search_space = {
-        "config_updates": {
-            "bc": dict(
-                batch_size=tune.choice([8, 16, 32, 64]),
-                l2_weight=tune.loguniform(1e-6, 1e-2),  # L2 regularization weight
-                optimizer_kwargs=dict(
-                    lr=tune.loguniform(1e-5, 1e-2),
-                ),
-                train_kwargs=dict(
-                    n_epochs=tune.choice([1, 5, 10, 20]),
-                ),
-            ),
-        },
-        "command_name": "bc",
-    }
-    num_samples = 64
-    eval_best_trial = True
-    eval_trial_seeds = 5
-    repeat = 3
-    resources_per_trial = dict(cpu=1)
-
-
-@parallel_ex.named_config
-def example_dagger():
-    sacred_ex_name = "train_imitation"
-    run_name = "dagger_tuning"
-    base_named_configs = ["logging.wandb_logging"]
-    base_config_updates = {
-        "environment": {"num_vec": 1},
-        "demonstrations": {"rollout_type": "ppo-huggingface"},
-        "dagger": {"total_timesteps": 1e5},
-        "bc": {
-            "batch_size": 16,
-            "l2_weight": 1e-4,
-            "optimizer_kwargs": {"lr": 1e-3},
-        },
-    }
-    search_space = {
-        "config_updates": {
-            "bc": dict(
-                train_kwargs=dict(
-                    n_epochs=tune.choice([1, 5, 10]),
-                ),
-            ),
-            "dagger": dict(
-                beta_schedule=tune.choice(
-                    [dagger.LinearBetaSchedule(i) for i in [1, 5, 15]]
-                    + [dagger.ExponentialBetaSchedule(i) for i in [0.3, 0.5, 0.7]],
-                ),
-                rollout_round_min_episodes=tune.choice([3, 5, 10]),
-            ),
-        },
-        "command_name": "dagger",
-    }
-    num_samples = 50
-    repeat = 3
-    eval_best_trial = True
-    eval_trial_seeds = 5
-    resources_per_trial = dict(cpu=1)
-
-
-@parallel_ex.named_config
-def example_gail():
-    sacred_ex_name = "train_adversarial"
-    run_name = "gail_tuning_hc"
-    base_named_configs = ["logging.wandb_logging"]
-    base_config_updates = {
-        "environment": {"num_vec": 1},
-        "demonstrations": {"rollout_type": "ppo-huggingface"},
-        "total_timesteps": 1e7,
-    }
-    search_space = {
-        "config_updates": {
-            "algorithm_kwargs": dict(
-                demo_batch_size=tune.choice([32, 128, 512, 2048, 8192]),
-                n_disc_updates_per_round=tune.choice([8, 16]),
-            ),
-            "rl": {
-                "batch_size": tune.choice([4096, 8192, 16384]),
-                "rl_kwargs": {
-                    "ent_coef": tune.loguniform(1e-7, 1e-3),
-                    "learning_rate": tune.loguniform(1e-5, 1e-2),
-                },
-            },
-            "algorithm_specific": {},
-        },
-        "command_name": "gail",
-    }
-    num_samples = 100
-    eval_best_trial = True
-    eval_trial_seeds = 5
-    repeat = 3
-    resources_per_trial = dict(cpu=1)
-
-
-@parallel_ex.named_config
-def example_airl():
-    sacred_ex_name = "train_adversarial"
-    run_name = "airl_tuning"
-    base_named_configs = ["logging.wandb_logging"]
-    base_config_updates = {
-        "environment": {"num_vec": 1},
-        "demonstrations": {"rollout_type": "ppo-huggingface"},
-        "total_timesteps": 1e7,
-    }
-    search_space = {
-        "config_updates": {
-            "algorithm_kwargs": dict(
-                demo_batch_size=tune.choice([32, 128, 512, 2048, 8192]),
-                n_disc_updates_per_round=tune.choice([8, 16]),
-            ),
-            "rl": {
-                "batch_size": tune.choice([4096, 8192, 16384]),
-                "rl_kwargs": {
-                    "ent_coef": tune.loguniform(1e-7, 1e-3),
-                    "learning_rate": tune.loguniform(1e-5, 1e-2),
-                },
-            },
-            "algorithm_specific": {},
-        },
-        "command_name": "airl",
-    }
-    num_samples = 100
-    eval_best_trial = True
-    eval_trial_seeds = 5
-    repeat = 3
-    resources_per_trial = dict(cpu=1)
-
-
-@parallel_ex.named_config
-def example_pc():
-    sacred_ex_name = "train_preference_comparisons"
-    run_name = "pc_tuning"
-    base_named_configs = ["logging.wandb_logging"]
-    base_config_updates = {
-        "environment": {"num_vec": 1},
-        "total_timesteps": 2e7,
-        "total_comparisons": 5000,
-        "query_schedule": "hyperbolic",
-        "gatherer_kwargs": {"sample": True},
-    }
-    search_space = {
-        "named_configs": [
-            ["reward.normalize_output_disable"],
-        ],
-        "config_updates": {
-            "train": {
-                "policy_kwargs": {
-                    "activation_fn": tune.choice(
-                        [
-                            nn.ReLU,
-                        ],
-                    ),
-                },
-            },
-            "num_iterations": tune.choice([25, 50]),
-            "initial_comparison_frac": tune.choice([0.1, 0.25]),
-            "reward_trainer_kwargs": {
-                "epochs": tune.choice([1, 3, 6]),
-            },
-            "rl": {
-                "batch_size": tune.choice([512, 2048, 8192]),
-                "rl_kwargs": {
-                    "learning_rate": tune.loguniform(1e-5, 1e-2),
-                    "ent_coef": tune.loguniform(1e-7, 1e-3),
-                },
-            },
-        },
-    }
-    num_samples = 100
-    eval_best_trial = True
-    eval_trial_seeds = 5
-    repeat = 3
-    resources_per_trial = dict(cpu=1)
diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py
index 6f77330df..2417414cb 100644
--- a/src/imitation/scripts/parallel.py
+++ b/src/imitation/scripts/parallel.py
@@ -6,11 +6,9 @@
 import pathlib
 from typing import Any, Callable, Dict, Mapping, Optional, Sequence, Union
 
-import numpy as np
 import ray
 import ray.tune
 import sacred
-from pandas.api.types import is_object_dtype
 from ray.tune import search
 from ray.tune.registry import register_trainable
 from ray.tune.search import optuna
@@ -31,14 +29,12 @@ def parallel(
     init_kwargs: Mapping[str, Any],
     local_dir: Optional[str],
     upload_dir: Optional[str],
-    repeat: int = 1,
-    eval_best_trial: bool = False,
-    eval_best_trial_resource_multiplier: int = 1,
-    eval_trial_seeds: int = 5,
-    experiment_checkpoint_path: str = "",
-    syncer=None,
-    resume: Union[str, bool] = False,
-) -> None:
+    repeat: int,
+    search_alg: Optional[str],
+    experiment_checkpoint_path: str,
+    syncer,
+    resume: Union[str, bool],
+) -> ray.tune.ExperimentAnalysis:
     """Parallelize multiple runs of another Sacred Experiment using Ray Tune.
 
     A Sacred FileObserver is attached to the inner experiment and writes Sacred
@@ -47,7 +43,7 @@ def parallel(
 
     Args:
         sacred_ex_name: The Sacred experiment to tune. Either "train_rl" or
-            "train_adversarial".
+            "train_imitation" or "train_adversarial" or "train_preference_comparisons".
         run_name: A name describing this parallelizing experiment.
             This argument is also passed to `ray.tune.run` as the `name` argument.
             It is also saved in 'sacred/run.json' of each inner Sacred experiment
@@ -78,24 +74,19 @@ def parallel(
         init_kwargs: Arguments to pass to `ray.init`.
         local_dir: `local_dir` argument to `ray.tune.run()`.
         upload_dir: `upload_dir` argument to `ray.tune.run()`.
+        search_alg: can be either "optuna" or None.
         repeat: Number of runs to repeat each trial for.
-        eval_best_trial: Whether to evaluate the trial with the best mean return
-            at the end of tuning on a separate set of seeds.
-        eval_best_trial_resource_multiplier: factor by which to multiply the
-            number of cpus per trial in `resources_per_trial`.
-        eval_trial_seeds: Number of distinct seeds to evaluate the best trial on.
-        experiment_checkpoint_path: Path containing the checkpoints of a previous
-            experiment ran using this script. Useful for resuming cancelled trials
-            of the experiments (using `resume`) or evaluating the best trial of the
-            experiment (using `eval_best_trial`).
+            Not used if `search_alg` is None.
         resume: If true and `experiment_checkpoint_path` is given, then resumes the
             experiment by restarting the trials that did not finish in the experiment
             checkpoint path.
         syncer: `syncer` argument to `ray.tune.syncer.SyncConfig`.
 
-
     Raises:
         TypeError: Named configs not string sequences or config updates not mappings.
+
+    Returns:
+        The result of `ray.tune.run()`.
     """
     # Basic validation for config options before we enter parallel jobs.
     if not isinstance(base_named_configs, collections.abc.Sequence):
@@ -126,7 +117,11 @@ def parallel(
     )
 
     ray.init(**init_kwargs)
-    search_alg = search.Repeater(optuna.OptunaSearch(), repeat=repeat)
+    if search_alg == "optuna":
+        algo = search.Repeater(optuna.OptunaSearch(), repeat=repeat)
+    else:
+        assert repeat == 1  # repeat should not be used if search_alg is None
+        algo = None
 
     if sacred_ex_name == "train_rl":
         return_key = "monitor_return_mean"
@@ -166,7 +161,6 @@ def parallel(
             result.trials = None
             result.fetch_trial_dataframes()
         else:
-            # run hyperparameter tuning
             result = ray.tune.run(
                 trainable,
                 config=search_space,
@@ -178,68 +172,11 @@ def parallel(
                     upload_dir=upload_dir,
                     syncer=syncer,
                 ),
-                search_alg=search_alg,
+                search_alg=algo,
                 metric=return_key,
                 mode="max",
             )
-        if eval_best_trial:
-            df = result.results_df
-            df = df[df["config/named_configs"].notna()]
-            # convert object dtype to str required by df.groupby
-            for col in df.columns:
-                if is_object_dtype(df[col]):
-                    df[col] = df[col].astype("str")
-            # group into separate HP configs
-            grp_keys = [
-                c for c in df.columns if c.startswith("config") and "seed" not in c
-            ]
-            grps = df.groupby(grp_keys)
-            # store mean return of runs across all seeds in a group
-            df["mean_return"] = grps[return_key].transform(lambda x: x.mean())
-            best_config_df = df[df["mean_return"] == df["mean_return"].max()]
-            row = best_config_df.iloc[0]
-            best_config_tag = row["experiment_tag"]
-            if result.trials is not None:
-                trial = [
-                    t for t in result.trials if best_config_tag in t.experiment_tag
-                ][0]
-                best_config = trial.config
-                print("Mean return:", row["mean_return"])
-                print(
-                    "All returns:",
-                    df[df["mean_return"] == row["mean_return"]][return_key],
-                )
-                print("Total seeds:", (df["mean_return"] == row["mean_return"]).sum())
-                best_config["config_updates"].update(
-                    seed=ray.tune.grid_search(list(range(100, 100 + eval_trial_seeds))),
-                )
-
-                resources_per_trial_eval = copy.deepcopy(resources_per_trial)
-                # update cpus per trial only if it is provided in `resources_per_trial`
-                # Uses the default values (cpu=1) if it is not provided
-                if "cpu" in resources_per_trial:
-
-                    resources_per_trial_eval[
-                        "cpu"
-                    ] *= eval_best_trial_resource_multiplier
-                    best_config["config_updates"].update(
-                        environment=dict(num_vec=resources_per_trial_eval["cpu"]),
-                    )
-
-                eval_result = ray.tune.run(
-                    trainable,
-                    config={
-                        "named_configs": best_config["named_configs"],
-                        "config_updates": best_config["config_updates"],
-                        "command_name": best_config.get("command_name", None),
-                    },
-                    name=run_name + "_best_hp_eval",
-                    resources_per_trial=resources_per_trial_eval,
-                )
-                returns = eval_result.results_df[return_key].to_numpy()
-                print("All returns:", returns)
-                print("Mean:", np.mean(returns))
-                print("Std:", np.std(returns))
+        return result
     finally:
         ray.shutdown()
 

From 64c3a8d0deb8748eba2a69be20d7f9a464639523 Mon Sep 17 00:00:00 2001
From: taufeeque9 <9taufeeque9@gmail.com>
Date: Tue, 11 Jul 2023 16:07:13 +0530
Subject: [PATCH 19/47] Fix docstring

---
 src/imitation/scripts/parallel.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py
index 2417414cb..10ae9f924 100644
--- a/src/imitation/scripts/parallel.py
+++ b/src/imitation/scripts/parallel.py
@@ -77,6 +77,10 @@ def parallel(
         search_alg: can be either "optuna" or None.
         repeat: Number of runs to repeat each trial for.
             Not used if `search_alg` is None.
+        experiment_checkpoint_path: Path containing the checkpoints of a previous
+            experiment ran using this script. Useful for resuming cancelled trials
+            of the experiments (using `resume`) or evaluating the best trial of the
+            experiment (using `eval_best_trial`).
         resume: If true and `experiment_checkpoint_path` is given, then resumes the
             experiment by restarting the trials that did not finish in the experiment
             checkpoint path.

From 8fba0d3ac9b690613b7526b68bd1c68b3ac6efa7 Mon Sep 17 00:00:00 2001
From: taufeeque9 <9taufeeque9@gmail.com>
Date: Tue, 11 Jul 2023 17:42:08 +0530
Subject: [PATCH 20/47] Removing resume option as it is getting tricky to
 correctly implement

---
 src/imitation/scripts/config/parallel.py |  5 +---
 src/imitation/scripts/parallel.py        | 31 ++----------------------
 tests/scripts/test_scripts.py            |  1 +
 3 files changed, 4 insertions(+), 33 deletions(-)

diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py
index e9c5b8245..3416f9442 100644
--- a/src/imitation/scripts/config/parallel.py
+++ b/src/imitation/scripts/config/parallel.py
@@ -39,14 +39,11 @@ def config():
     local_dir = None  # `local_dir` arg for `ray.tune.run`
     upload_dir = None  # `upload_dir` arg for `ray.tune.run`
     experiment_checkpoint_path = ""
-    eval_best_trial = False
-    eval_trial_seeds = 5  # Number of seeds to search over by default
     num_samples = 1  # Number of samples per grid search configuration
     repeat = 1
     search_alg = "optuna"  # search algorithm to use
-    experiment_checkpoint_path = ""  # Path to checkpoint of experiment to resume
+    experiment_checkpoint_path = ""  # Path to checkpoint of experiment 
     syncer = None  # Sacred syncer to use
-    resume = False  # Whether to resume experiment from checkpoint
 
 
 # Debug named configs
diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py
index 10ae9f924..bf73c1c72 100644
--- a/src/imitation/scripts/parallel.py
+++ b/src/imitation/scripts/parallel.py
@@ -33,7 +33,6 @@ def parallel(
     search_alg: Optional[str],
     experiment_checkpoint_path: str,
     syncer,
-    resume: Union[str, bool],
 ) -> ray.tune.ExperimentAnalysis:
     """Parallelize multiple runs of another Sacred Experiment using Ray Tune.
 
@@ -78,12 +77,8 @@ def parallel(
         repeat: Number of runs to repeat each trial for.
             Not used if `search_alg` is None.
         experiment_checkpoint_path: Path containing the checkpoints of a previous
-            experiment ran using this script. Useful for resuming cancelled trials
-            of the experiments (using `resume`) or evaluating the best trial of the
-            experiment (using `eval_best_trial`).
-        resume: If true and `experiment_checkpoint_path` is given, then resumes the
-            experiment by restarting the trials that did not finish in the experiment
-            checkpoint path.
+            experiment ran using this script. Useful for  evaluating the best trial
+             of the experiment.
         syncer: `syncer` argument to `ray.tune.syncer.SyncConfig`.
 
     Raises:
@@ -134,28 +129,6 @@ def parallel(
 
     try:
         if experiment_checkpoint_path:
-            if resume:
-                # restart failed runs from experiment_checkpoint_path
-                register_trainable("inner", trainable)
-                runner = ray.tune.execution.trial_runner.TrialRunner(
-                    local_checkpoint_dir=experiment_checkpoint_path,
-                    sync_config=ray.tune.syncer.SyncConfig(
-                        upload_dir=upload_dir,
-                        syncer=syncer,
-                    ),
-                    metric=return_key,
-                    resume=True,
-                )
-                print(
-                    "Live trials:",
-                    len(runner._live_trials),
-                    "/",
-                    len(runner._trials),
-                )
-                while not runner.is_finished():
-                    runner.step()
-                    print("Debug:", runner.debug_string())
-
             # load experiment analysis results
             result = ray.tune.ExperimentAnalysis(experiment_checkpoint_path)
             result._load_checkpoints_from_latest(
diff --git a/tests/scripts/test_scripts.py b/tests/scripts/test_scripts.py
index 4435155cd..586fa91ba 100644
--- a/tests/scripts/test_scripts.py
+++ b/tests/scripts/test_scripts.py
@@ -802,6 +802,7 @@ def test_train_rl_cnn_policy(tmpdir: str, rng):
             # Need absolute path because raylet runs in different working directory.
             "demonstrations.path": CARTPOLE_TEST_ROLLOUT_PATH.absolute(),
         },
+        search_alg=None, # Use default search algorithm of ray.
         search_space={
             "command_name": "airl",
             "config_updates": {"total_timesteps": tune.choice([5, 10])},

From 12ab31c1641b6b99abb6823cf037a3f9340cb86c Mon Sep 17 00:00:00 2001
From: taufeeque9 <9taufeeque9@gmail.com>
Date: Wed, 12 Jul 2023 04:26:17 +0530
Subject: [PATCH 21/47] Minor fixes

---
 src/imitation/scripts/config/analyze.py  | 2 +-
 src/imitation/scripts/config/parallel.py | 2 +-
 src/imitation/scripts/parallel.py        | 5 ++---
 tests/scripts/test_scripts.py            | 7 ++++---
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/imitation/scripts/config/analyze.py b/src/imitation/scripts/config/analyze.py
index 5213a875d..01cc2d035 100644
--- a/src/imitation/scripts/config/analyze.py
+++ b/src/imitation/scripts/config/analyze.py
@@ -18,7 +18,7 @@ def config():
     tex_output_path = None  # Write LaTex output to this path
     print_table = True  # Set to True to print analysis to stdout
     split_str = ","  # str used to split source_dir_str into multiple source dirs
-    table_verbosity = 1  # Choose from 0, 1, or 2
+    table_verbosity = 1  # Choose from 0, 1, 2 or 3
     source_dirs = None
 
 
diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py
index 3416f9442..b09f9fc4a 100644
--- a/src/imitation/scripts/config/parallel.py
+++ b/src/imitation/scripts/config/parallel.py
@@ -42,7 +42,7 @@ def config():
     num_samples = 1  # Number of samples per grid search configuration
     repeat = 1
     search_alg = "optuna"  # search algorithm to use
-    experiment_checkpoint_path = ""  # Path to checkpoint of experiment 
+    experiment_checkpoint_path = ""  # Path to checkpoint of experiment
     syncer = None  # Sacred syncer to use
 
 
diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py
index bf73c1c72..ebda17c82 100644
--- a/src/imitation/scripts/parallel.py
+++ b/src/imitation/scripts/parallel.py
@@ -4,13 +4,12 @@
 import copy
 import glob
 import pathlib
-from typing import Any, Callable, Dict, Mapping, Optional, Sequence, Union
+from typing import Any, Callable, Dict, Mapping, Optional, Sequence
 
 import ray
 import ray.tune
 import sacred
 from ray.tune import search
-from ray.tune.registry import register_trainable
 from ray.tune.search import optuna
 from sacred.observers import FileStorageObserver
 
@@ -78,7 +77,7 @@ def parallel(
             Not used if `search_alg` is None.
         experiment_checkpoint_path: Path containing the checkpoints of a previous
             experiment ran using this script. Useful for  evaluating the best trial
-             of the experiment.
+            of the experiment.
         syncer: `syncer` argument to `ray.tune.syncer.SyncConfig`.
 
     Raises:
diff --git a/tests/scripts/test_scripts.py b/tests/scripts/test_scripts.py
index 586fa91ba..e17765471 100644
--- a/tests/scripts/test_scripts.py
+++ b/tests/scripts/test_scripts.py
@@ -802,7 +802,7 @@ def test_train_rl_cnn_policy(tmpdir: str, rng):
             # Need absolute path because raylet runs in different working directory.
             "demonstrations.path": CARTPOLE_TEST_ROLLOUT_PATH.absolute(),
         },
-        search_alg=None, # Use default search algorithm of ray.
+        search_alg=None,  # Use default search algorithm of ray.
         search_space={
             "command_name": "airl",
             "config_updates": {"total_timesteps": tune.choice([5, 10])},
@@ -942,7 +942,7 @@ def test_analyze_imitation(tmpdir: str, run_names: List[str], run_sacred_fn):
             assert run.status == "COMPLETED"
 
     # Check that analyze script finds the correct number of logs.
-    def check(run_name: Optional[str], count: int) -> None:
+    def check(run_name: Optional[str], count: int, table_verbosity=1) -> None:
         run = analyze.analysis_ex.run(
             command_name="analyze_imitation",
             config_updates=dict(
@@ -952,6 +952,7 @@ def check(run_name: Optional[str], count: int) -> None:
                 csv_output_path=tmpdir_path / "analysis.csv",
                 tex_output_path=tmpdir_path / "analysis.tex",
                 print_table=True,
+                table_verbosity=table_verbosity,
             ),
         )
         assert run.status == "COMPLETED"
@@ -961,7 +962,7 @@ def check(run_name: Optional[str], count: int) -> None:
     for run_name, count in Counter(run_names).items():
         check(run_name, count)
 
-    check(None, len(run_names))  # Check total number of logs.
+    check(None, len(run_names), table_verbosity=3)  # Check total number of logs.
 
 
 def test_analyze_gather_tb(tmpdir: str):

From 19b0f2c3ed8d7d2ef10aaabab21739d31b51261c Mon Sep 17 00:00:00 2001
From: taufeeque9 <9taufeeque9@gmail.com>
Date: Sun, 16 Jul 2023 10:39:12 +0530
Subject: [PATCH 22/47] Updates from review

---
 benchmarking/tuning.py                   | 202 +++++++++++++++--------
 benchmarking/tuning_config.py            |  36 ++--
 src/imitation/scripts/config/parallel.py |   3 +-
 src/imitation/scripts/parallel.py        |   9 +-
 tests/test_benchmarking.py               |  27 +++
 5 files changed, 180 insertions(+), 97 deletions(-)

diff --git a/benchmarking/tuning.py b/benchmarking/tuning.py
index b4e62a84a..0c18b1256 100644
--- a/benchmarking/tuning.py
+++ b/benchmarking/tuning.py
@@ -13,82 +13,144 @@
 
 @tuning_ex.main
 def tune(
-    parallel: Dict[str, Any],
-    eval_best_trial: bool = False,
+    parallel_run_config: Dict[str, Any],
     eval_best_trial_resource_multiplier: int = 1,
-    eval_trial_seeds: int = 5,
+    num_eval_seeds: int = 5,
 ) -> None:
     """Tune hyperparameters of imitation algorithms using parallel script.
 
     Args:
-        parallel: A dictionary of arguments from the parallel script.
-        eval_best_trial: Whether to evaluate the trial with the best mean return
-            at the end of tuning on a separate set of seeds.
-        eval_best_trial_resource_multiplier: factor by which to multiply the
-            number of cpus per trial in `resources_per_trial`.
-        eval_trial_seeds: Number of distinct seeds to evaluate the best trial on.
+        parallel_run_config: Dictionary of arguments to pass to the parallel script.
+        eval_best_trial_resource_multiplier: Factor by which to multiply the
+            number of cpus per trial in `resources_per_trial`. This is useful for
+            allocating more resources per trial to the evaluation trials than the
+            resources for hyperparameter tuning since number of evaluation trials
+            is usually much smaller than the number of tuning trials.
+        num_eval_seeds: Number of distinct seeds to evaluate the best trial on.
+            Set to 0 to disable evaluation.
+
+    Raises:
+        ValueError: If no trials are returned by the parallel run.
+    """
+    run = parallel_ex.run(config_updates=parallel_run_config)
+    experiment_analysis = run.result
+    if not experiment_analysis.trials:
+        raise ValueError(
+            "No trials found. Please ensure that the `experiment_checkpoint_path` "
+            "in `parallel_run_config` is passed correctly "
+            "or that the tuning run finished properly.",
+        )
+
+    return_key = "imit_stats/monitor_return_mean"
+    if parallel_run_config["sacred_ex_name"] == "train_rl":
+        return_key = "monitor_return_mean"
+    best_trial = find_best_trial(experiment_analysis, return_key, print_return=True)
+
+    if num_eval_seeds > 0:  # evaluate the best trial
+        resources_per_trial_eval = copy.deepcopy(
+            parallel_run_config["resources_per_trial"],
+        )
+        # update cpus per trial only if it is provided in `resources_per_trial`
+        # Uses the default values (cpu=1) if it is not provided
+        if "cpu" in parallel_run_config["resources_per_trial"]:
+            resources_per_trial_eval["cpu"] *= eval_best_trial_resource_multiplier
+        evaluate_best_trial(
+            best_trial,
+            num_eval_seeds,
+            parallel_run_config,
+            resources_per_trial_eval,
+            return_key,
+        )
+
+
+def find_best_trial(
+    experiment_analysis: ray.tune.analysis.ExperimentAnalysis,
+    return_key: str,
+    print_return: bool = False,
+) -> ray.tune.experiment.Trial:
+    """Find the trial with the best mean return across all seeds.
+
+    Args:
+        experiment_analysis: The result of a parallel/tuning experiment.
+        return_key: The key of the return metric in the results dataframe.
+        print_return: Whether to print the mean and std of the returns
+            of the best trial.
+
+    Returns:
+        best_trial: The trial with the best mean return across all seeds.
+    """
+    df = experiment_analysis.results_df
+    # convert object dtype to str required by df.groupby
+    for col in df.columns:
+        if pd_types.is_object_dtype(df[col]):
+            df[col] = df[col].astype("str")
+    # group into separate HP configs
+    grp_keys = [c for c in df.columns if c.startswith("config") and "seed" not in c]
+    grps = df.groupby(grp_keys)
+    # store mean return of runs across all seeds in a group
+    df["mean_return"] = grps[return_key].transform(lambda x: x.mean())
+    best_config_df = df[df["mean_return"] == df["mean_return"].max()]
+    row = best_config_df.iloc[0]
+    best_config_tag = row["experiment_tag"]
+    assert experiment_analysis.trials is not None  # for mypy
+    best_trial = [
+        t for t in experiment_analysis.trials if best_config_tag in t.experiment_tag
+    ][0]
+
+    if print_return:
+        all_returns = df[df["mean_return"] == row["mean_return"]][return_key]
+        all_returns = all_returns.to_numpy()
+        print("All returns:", all_returns)
+        print("Mean return:", row["mean_return"])
+        print("Std return:", np.std(all_returns))
+        print("Total seeds:", len(all_returns))
+    return best_trial
+
+
+def evaluate_best_trial(
+    best_trial: ray.tune.experiment.Trial,
+    num_eval_seeds: int,
+    parallel_run_config: Dict[str, Any],
+    resources_per_trial: Dict[str, int],
+    return_key: str,
+    print_return: bool = False,
+):
+    """Evaluate the best trial of a parallel run on a separate set of seeds.
+
+    Args:
+        best_trial: The trial with the best mean return across all seeds.
+        num_eval_seeds: Number of distinct seeds to evaluate the best trial on.
+        parallel_run_config: Dictionary of arguments passed to the parallel
+            script to get best_trial.
+        resources_per_trial: Resources to be used for each evaluation trial.
+        return_key: The key of the return metric in the results dataframe.
+        print_return: Whether to print the mean and std of the evaluation returns.
+
+    Returns:
+        eval_run: The result of the evaluation run.
     """
-    run = parallel_ex.run(config_updates=parallel)
-    result = run.result
-
-    if eval_best_trial:
-        if parallel["sacred_ex_name"] == "train_rl":
-            return_key = "monitor_return_mean"
-        else:
-            return_key = "imit_stats/monitor_return_mean"
-        df = result.results_df
-        df = df[df["config/named_configs"].notna()]
-        # convert object dtype to str required by df.groupby
-        for col in df.columns:
-            if pd_types.is_object_dtype(df[col]):
-                df[col] = df[col].astype("str")
-        # group into separate HP configs
-        grp_keys = [c for c in df.columns if c.startswith("config") and "seed" not in c]
-        grps = df.groupby(grp_keys)
-        # store mean return of runs across all seeds in a group
-        df["mean_return"] = grps[return_key].transform(lambda x: x.mean())
-        best_config_df = df[df["mean_return"] == df["mean_return"].max()]
-        row = best_config_df.iloc[0]
-        best_config_tag = row["experiment_tag"]
-        if result.trials is not None:
-            trial = [t for t in result.trials if best_config_tag in t.experiment_tag][0]
-            best_config = trial.config
-            print("Mean return:", row["mean_return"])
-            print(
-                "All returns:",
-                df[df["mean_return"] == row["mean_return"]][return_key],
-            )
-            print("Total seeds:", (df["mean_return"] == row["mean_return"]).sum())
-
-            best_config["config_updates"].update(
-                seed=ray.tune.grid_search(list(range(100, 100 + eval_trial_seeds))),
-            )
-
-            resources_per_trial_eval = copy.deepcopy(parallel["resources_per_trial"])
-            # update cpus per trial only if it is provided in `resources_per_trial`
-            # Uses the default values (cpu=1) if it is not provided
-            if "cpu" in parallel["resources_per_trial"]:
-                resources_per_trial_eval["cpu"] *= eval_best_trial_resource_multiplier
-
-            eval_config_updates = parallel.copy()
-            eval_config_updates.update(
-                run_name=parallel["run_name"] + "_best_hp_eval",
-                num_samples=1,
-                search_space=best_config,
-                base_named_configs=parallel["base_named_configs"],
-                base_config_updates=parallel["base_config_updates"],
-                resources_per_trial=resources_per_trial_eval,
-                search_alg=None,
-                repeat=1,
-                experiment_checkpoint_path="",
-                resume=False,
-            )
-            eval_run = parallel_ex.run(config_updates=eval_config_updates)
-            eval_result = eval_run.result
-            returns = eval_result.results_df[return_key].to_numpy()
-            print("All returns:", returns)
-            print("Mean:", np.mean(returns))
-            print("Std:", np.std(returns))
+    best_config = best_trial.config
+    best_config["config_updates"].update(
+        seed=ray.tune.grid_search(list(range(100, 100 + num_eval_seeds))),
+    )
+    eval_config_updates = parallel_run_config.copy()
+    eval_config_updates.update(
+        run_name=parallel_run_config["run_name"] + "_best_hp_eval",
+        num_samples=1,
+        search_space=best_config,
+        resources_per_trial=resources_per_trial,
+        search_alg=None,
+        repeat=1,
+        experiment_checkpoint_path="",
+    )
+    eval_run = parallel_ex.run(config_updates=eval_config_updates)
+    eval_result = eval_run.result
+    returns = eval_result.results_df[return_key].to_numpy()
+    if print_return:
+        print("All returns:", returns)
+        print("Mean:", np.mean(returns))
+        print("Std:", np.std(returns))
+    return eval_run
 
 
 def main_console():
diff --git a/benchmarking/tuning_config.py b/benchmarking/tuning_config.py
index 79c8d0347..187963d02 100644
--- a/benchmarking/tuning_config.py
+++ b/benchmarking/tuning_config.py
@@ -12,7 +12,7 @@
 
 @tuning_ex.named_config
 def example_rl():
-    parallel = dict(
+    parallel_run_config = dict(
         sacred_ex_name="train_rl",
         run_name="rl_tuning",
         base_named_configs=["logging.wandb_logging"],
@@ -33,13 +33,12 @@ def example_rl():
         repeat=1,
         resources_per_trial=dict(cpu=1),
     )
-    eval_best_trial = True
-    eval_trial_seeds = 5
+    num_eval_seeds = 5
 
 
 @tuning_ex.named_config
 def example_bc():
-    parallel = dict(
+    parallel_run_config = dict(
         sacred_ex_name="train_imitation",
         run_name="bc_tuning",
         base_named_configs=["logging.wandb_logging"],
@@ -62,19 +61,18 @@ def example_bc():
             },
             "command_name": "bc",
         },
-        num_samples=2,
-        repeat=1,
+        num_samples=64,
+        repeat=3,
         resources_per_trial=dict(cpu=1),
     )
 
-    eval_best_trial = True
-    eval_trial_seeds = 5
+    num_eval_seeds = 5
     eval_best_trial_resource_multiplier = 1
 
 
 @tuning_ex.named_config
 def example_dagger():
-    parallel = dict(
+    parallel_run_config = dict(
         sacred_ex_name="train_imitation",
         run_name="dagger_tuning",
         base_named_configs=["logging.wandb_logging"],
@@ -109,13 +107,12 @@ def example_dagger():
         repeat=3,
         resources_per_trial=dict(cpu=1),
     )
-    eval_best_trial = True
-    eval_trial_seeds = 5
+    num_eval_seeds = 5
 
 
 @tuning_ex.named_config
 def example_gail():
-    parallel = dict(
+    parallel_run_config = dict(
         sacred_ex_name="train_adversarial",
         run_name="gail_tuning_hc",
         base_named_configs=["logging.wandb_logging"],
@@ -145,13 +142,12 @@ def example_gail():
         repeat=3,
         resources_per_trial=dict(cpu=1),
     )
-    eval_best_trial = True
-    eval_trial_seeds = 5
+    num_eval_seeds = 5
 
 
 @tuning_ex.named_config
 def example_airl():
-    parallel = dict(
+    parallel_run_config = dict(
         sacred_ex_name="train_adversarial",
         run_name="airl_tuning",
         base_named_configs=["logging.wandb_logging"],
@@ -181,14 +177,12 @@ def example_airl():
         repeat=3,
         resources_per_trial=dict(cpu=1),
     )
-
-    eval_best_trial = True
-    eval_trial_seeds = 5
+    num_eval_seeds = 5
 
 
 @tuning_ex.named_config
 def example_pc():
-    parallel = dict(
+    parallel_run_config = dict(
         sacred_ex_name="train_preference_comparisons",
         run_name="pc_tuning",
         base_named_configs=["logging.wandb_logging"],
@@ -232,6 +226,4 @@ def example_pc():
         repeat=3,
         resources_per_trial=dict(cpu=1),
     )
-
-    eval_best_trial = True
-    eval_trial_seeds = 5
+    num_eval_seeds = 5
diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py
index b09f9fc4a..b38b6f28c 100644
--- a/src/imitation/scripts/config/parallel.py
+++ b/src/imitation/scripts/config/parallel.py
@@ -38,9 +38,8 @@ def config():
 
     local_dir = None  # `local_dir` arg for `ray.tune.run`
     upload_dir = None  # `upload_dir` arg for `ray.tune.run`
-    experiment_checkpoint_path = ""
     num_samples = 1  # Number of samples per grid search configuration
-    repeat = 1
+    repeat = 1 # Number of times to repeat a sampled configuration
     search_alg = "optuna"  # search algorithm to use
     experiment_checkpoint_path = ""  # Path to checkpoint of experiment
     syncer = None  # Sacred syncer to use
diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py
index ebda17c82..93aa932b9 100644
--- a/src/imitation/scripts/parallel.py
+++ b/src/imitation/scripts/parallel.py
@@ -72,11 +72,13 @@ def parallel(
         init_kwargs: Arguments to pass to `ray.init`.
         local_dir: `local_dir` argument to `ray.tune.run()`.
         upload_dir: `upload_dir` argument to `ray.tune.run()`.
-        search_alg: can be either "optuna" or None.
+        search_alg: can be either "optuna" or None. Setting `None` allows for
+            adding grid_search to the `search_space` hyperparameters but doesn't allow
+            for trials to be repeated.
         repeat: Number of runs to repeat each trial for.
             Not used if `search_alg` is None.
         experiment_checkpoint_path: Path containing the checkpoints of a previous
-            experiment ran using this script. Useful for  evaluating the best trial
+            experiment ran using this script. Useful for evaluating the best trial
             of the experiment.
         syncer: `syncer` argument to `ray.tune.syncer.SyncConfig`.
 
@@ -84,7 +86,8 @@ def parallel(
         TypeError: Named configs not string sequences or config updates not mappings.
 
     Returns:
-        The result of `ray.tune.run()`.
+        The result of running the parallel experiment with `ray.tune.run()`.
+        Useful for fetching the configs and results dataframe of all the trials.
     """
     # Basic validation for config options before we enter parallel jobs.
     if not isinstance(base_named_configs, collections.abc.Sequence):
diff --git a/tests/test_benchmarking.py b/tests/test_benchmarking.py
index ba01b38a2..4a8f6ea6f 100644
--- a/tests/test_benchmarking.py
+++ b/tests/test_benchmarking.py
@@ -1,5 +1,7 @@
 """Tests for config files in benchmarking/ folder."""
 import pathlib
+import subprocess
+import sys
 
 import pytest
 
@@ -44,3 +46,28 @@ def test_benchmarks_print_config_succeeds(algorithm: str, environment: str):
 
     # THEN
     assert run.status == "COMPLETED"
+
+
+@pytest.mark.parametrize("algorithm", ALGORITHMS)
+def test_tuning_print_config_succeeds(algorithm: str):
+    # We test the configs using the print_config command,
+    # because running the configs requires MuJoCo.
+    # Requiring MuJoCo to run the tests adds too much complexity.
+
+    # We need to use sys.executable, not just "python", on Windows as
+    # subprocess.call ignores PATH (unless shell=True) so runs a
+    # system-wide Python interpreter outside of our venv. See:
+    # https://stackoverflow.com/questions/5658622/
+    tuning_path = str(BENCHMARKING_DIR / "tuning.py")
+    env = 'parallel_run_config.base_named_configs=["seals_cartpole"]'
+    exit_code = subprocess.call(
+        [
+            sys.executable,
+            tuning_path,
+            "print_config",
+            "with",
+            f"example_{algorithm}",
+            env,
+        ],
+    )
+    assert exit_code == 0

From 046b8d9987e13a8d87f2bd52fe75be562e80db04 Mon Sep 17 00:00:00 2001
From: taufeeque9 <9taufeeque9@gmail.com>
Date: Sun, 16 Jul 2023 13:04:14 +0530
Subject: [PATCH 23/47] fix lint error

---
 src/imitation/scripts/config/parallel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py
index b38b6f28c..e81a617db 100644
--- a/src/imitation/scripts/config/parallel.py
+++ b/src/imitation/scripts/config/parallel.py
@@ -39,7 +39,7 @@ def config():
     local_dir = None  # `local_dir` arg for `ray.tune.run`
     upload_dir = None  # `upload_dir` arg for `ray.tune.run`
     num_samples = 1  # Number of samples per grid search configuration
-    repeat = 1 # Number of times to repeat a sampled configuration
+    repeat = 1  # Number of times to repeat a sampled configuration
     search_alg = "optuna"  # search algorithm to use
     experiment_checkpoint_path = ""  # Path to checkpoint of experiment
     syncer = None  # Sacred syncer to use

From 8eee0822d3fb4686d5801a6e955fdde0c9a90ce7 Mon Sep 17 00:00:00 2001
From: taufeeque9 <9taufeeque9@gmail.com>
Date: Sun, 16 Jul 2023 13:52:43 +0530
Subject: [PATCH 24/47] Add documentation for using the tuning script

---
 benchmarking/README.md | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/benchmarking/README.md b/benchmarking/README.md
index 3f5114545..95e67f1d3 100644
--- a/benchmarking/README.md
+++ b/benchmarking/README.md
@@ -17,3 +17,24 @@ python -m imitation.scripts.<train_script> <algo> with benchmarking/<config_name
 ...
 ex.add_config('benchmarking/<config_name>.json')
 ```
+
+# Tuning Hyperparameters
+
+The hyperparameters of any algorithm in imitation can be tuned using the `tuning.py` script.
+The benchmarking hyperparameter configs were generated by tuning the hyperparameters using
+the search space defined in the `tuning_config.py` script. The tuning script proceeds in two
+phases: 1) The hyperparameters are tuned using the search space provided, and 2) the best 
+hyperparameter config found in the first phase based on the maximum mean return is
+re-evaluated on a separate set of seeds, and the mean and standard deviation of these trials
+are reported.
+
+To tune the hyperparameters of an algorithm using the default search space provided:
+```bash
+python tuning.py with example_{algo} 'parallel_run_config.base_named_configs=["{env}"]'
+```
+
+In this command, `example_{algo}` provides the default search space and settings to be used for
+the specific algorithm, which is defined in the `tuning_config.py` script and
+`'parallel_run_config.base_named_configs=["{env}"]'` sets the environment to tune the algorithm in.
+See the documentation of `tuning.py` and `parallel.py` scripts for many other arguments that can be
+provided through the command line to change the tuning behavior.

From 5ce765859f7cd295ae607cab2709d0f626c65de7 Mon Sep 17 00:00:00 2001
From: taufeeque9 <9taufeeque9@gmail.com>
Date: Mon, 17 Jul 2023 09:08:04 +0530
Subject: [PATCH 25/47] Fix lint error

---
 benchmarking/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarking/README.md b/benchmarking/README.md
index 95e67f1d3..892908ac8 100644
--- a/benchmarking/README.md
+++ b/benchmarking/README.md
@@ -23,7 +23,7 @@ ex.add_config('benchmarking/<config_name>.json')
 The hyperparameters of any algorithm in imitation can be tuned using the `tuning.py` script.
 The benchmarking hyperparameter configs were generated by tuning the hyperparameters using
 the search space defined in the `tuning_config.py` script. The tuning script proceeds in two
-phases: 1) The hyperparameters are tuned using the search space provided, and 2) the best 
+phases: 1) The hyperparameters are tuned using the search space provided, and 2) the best
 hyperparameter config found in the first phase based on the maximum mean return is
 re-evaluated on a separate set of seeds, and the mean and standard deviation of these trials
 are reported.

From a8be3316b653451ce8366379cf413627dd22e1ec Mon Sep 17 00:00:00 2001
From: taufeeque9 <9taufeeque9@gmail.com>
Date: Tue, 18 Jul 2023 11:09:05 +0530
Subject: [PATCH 26/47] Updates from the review

---
 benchmarking/README.md                        |  4 ++--
 ....json => airl_seals_ant_best_hp_eval.json} |  0
 ...airl_seals_half_cheetah_best_hp_eval.json} |  0
 ...on => airl_seals_hopper_best_hp_eval.json} |  0
 ...n => airl_seals_swimmer_best_hp_eval.json} |  0
 ...on => airl_seals_walker_best_hp_eval.json} |  0
 ...al.json => bc_seals_ant_best_hp_eval.json} |  0
 ...> bc_seals_half_cheetah_best_hp_eval.json} |  0
 ...json => bc_seals_hopper_best_hp_eval.json} |  0
 ...son => bc_seals_swimmer_best_hp_eval.json} |  0
 ...json => bc_seals_walker_best_hp_eval.json} |  0
 ...son => dagger_seals_ant_best_hp_eval.json} |  0
 ...gger_seals_half_cheetah_best_hp_eval.json} |  0
 ... => dagger_seals_hopper_best_hp_eval.json} |  0
 ...=> dagger_seals_swimmer_best_hp_eval.json} |  0
 ... => dagger_seals_walker_best_hp_eval.json} |  0
 ....json => gail_seals_ant_best_hp_eval.json} |  0
 ...gail_seals_half_cheetah_best_hp_eval.json} |  0
 ...on => gail_seals_hopper_best_hp_eval.json} |  0
 ...n => gail_seals_swimmer_best_hp_eval.json} |  0
 ...on => gail_seals_walker_best_hp_eval.json} |  0
 benchmarking/tuning.py                        | 23 +++++++++++--------
 benchmarking/tuning_config.py                 | 21 +++++++++--------
 benchmarking/util.py                          |  2 +-
 experiments/commands.py                       | 18 +++++++--------
 src/imitation/scripts/config/parallel.py      |  6 ++---
 tests/test_benchmarking.py                    |  4 ++--
 tests/test_experiments.py                     | 16 ++++++-------
 28 files changed, 49 insertions(+), 45 deletions(-)
 rename benchmarking/{example_airl_seals_ant_best_hp_eval.json => airl_seals_ant_best_hp_eval.json} (100%)
 rename benchmarking/{example_airl_seals_half_cheetah_best_hp_eval.json => airl_seals_half_cheetah_best_hp_eval.json} (100%)
 rename benchmarking/{example_airl_seals_hopper_best_hp_eval.json => airl_seals_hopper_best_hp_eval.json} (100%)
 rename benchmarking/{example_airl_seals_swimmer_best_hp_eval.json => airl_seals_swimmer_best_hp_eval.json} (100%)
 rename benchmarking/{example_airl_seals_walker_best_hp_eval.json => airl_seals_walker_best_hp_eval.json} (100%)
 rename benchmarking/{example_bc_seals_ant_best_hp_eval.json => bc_seals_ant_best_hp_eval.json} (100%)
 rename benchmarking/{example_bc_seals_half_cheetah_best_hp_eval.json => bc_seals_half_cheetah_best_hp_eval.json} (100%)
 rename benchmarking/{example_bc_seals_hopper_best_hp_eval.json => bc_seals_hopper_best_hp_eval.json} (100%)
 rename benchmarking/{example_bc_seals_swimmer_best_hp_eval.json => bc_seals_swimmer_best_hp_eval.json} (100%)
 rename benchmarking/{example_bc_seals_walker_best_hp_eval.json => bc_seals_walker_best_hp_eval.json} (100%)
 rename benchmarking/{example_dagger_seals_ant_best_hp_eval.json => dagger_seals_ant_best_hp_eval.json} (100%)
 rename benchmarking/{example_dagger_seals_half_cheetah_best_hp_eval.json => dagger_seals_half_cheetah_best_hp_eval.json} (100%)
 rename benchmarking/{example_dagger_seals_hopper_best_hp_eval.json => dagger_seals_hopper_best_hp_eval.json} (100%)
 rename benchmarking/{example_dagger_seals_swimmer_best_hp_eval.json => dagger_seals_swimmer_best_hp_eval.json} (100%)
 rename benchmarking/{example_dagger_seals_walker_best_hp_eval.json => dagger_seals_walker_best_hp_eval.json} (100%)
 rename benchmarking/{example_gail_seals_ant_best_hp_eval.json => gail_seals_ant_best_hp_eval.json} (100%)
 rename benchmarking/{example_gail_seals_half_cheetah_best_hp_eval.json => gail_seals_half_cheetah_best_hp_eval.json} (100%)
 rename benchmarking/{example_gail_seals_hopper_best_hp_eval.json => gail_seals_hopper_best_hp_eval.json} (100%)
 rename benchmarking/{example_gail_seals_swimmer_best_hp_eval.json => gail_seals_swimmer_best_hp_eval.json} (100%)
 rename benchmarking/{example_gail_seals_walker_best_hp_eval.json => gail_seals_walker_best_hp_eval.json} (100%)

diff --git a/benchmarking/README.md b/benchmarking/README.md
index 892908ac8..3973c6181 100644
--- a/benchmarking/README.md
+++ b/benchmarking/README.md
@@ -30,10 +30,10 @@ are reported.
 
 To tune the hyperparameters of an algorithm using the default search space provided:
 ```bash
-python tuning.py with example_{algo} 'parallel_run_config.base_named_configs=["{env}"]'
+python tuning.py with {algo} 'parallel_run_config.base_named_configs=["{env}"]'
 ```
 
-In this command, `example_{algo}` provides the default search space and settings to be used for
+In this command, `{algo}` provides the default search space and settings to be used for
 the specific algorithm, which is defined in the `tuning_config.py` script and
 `'parallel_run_config.base_named_configs=["{env}"]'` sets the environment to tune the algorithm in.
 See the documentation of `tuning.py` and `parallel.py` scripts for many other arguments that can be
diff --git a/benchmarking/example_airl_seals_ant_best_hp_eval.json b/benchmarking/airl_seals_ant_best_hp_eval.json
similarity index 100%
rename from benchmarking/example_airl_seals_ant_best_hp_eval.json
rename to benchmarking/airl_seals_ant_best_hp_eval.json
diff --git a/benchmarking/example_airl_seals_half_cheetah_best_hp_eval.json b/benchmarking/airl_seals_half_cheetah_best_hp_eval.json
similarity index 100%
rename from benchmarking/example_airl_seals_half_cheetah_best_hp_eval.json
rename to benchmarking/airl_seals_half_cheetah_best_hp_eval.json
diff --git a/benchmarking/example_airl_seals_hopper_best_hp_eval.json b/benchmarking/airl_seals_hopper_best_hp_eval.json
similarity index 100%
rename from benchmarking/example_airl_seals_hopper_best_hp_eval.json
rename to benchmarking/airl_seals_hopper_best_hp_eval.json
diff --git a/benchmarking/example_airl_seals_swimmer_best_hp_eval.json b/benchmarking/airl_seals_swimmer_best_hp_eval.json
similarity index 100%
rename from benchmarking/example_airl_seals_swimmer_best_hp_eval.json
rename to benchmarking/airl_seals_swimmer_best_hp_eval.json
diff --git a/benchmarking/example_airl_seals_walker_best_hp_eval.json b/benchmarking/airl_seals_walker_best_hp_eval.json
similarity index 100%
rename from benchmarking/example_airl_seals_walker_best_hp_eval.json
rename to benchmarking/airl_seals_walker_best_hp_eval.json
diff --git a/benchmarking/example_bc_seals_ant_best_hp_eval.json b/benchmarking/bc_seals_ant_best_hp_eval.json
similarity index 100%
rename from benchmarking/example_bc_seals_ant_best_hp_eval.json
rename to benchmarking/bc_seals_ant_best_hp_eval.json
diff --git a/benchmarking/example_bc_seals_half_cheetah_best_hp_eval.json b/benchmarking/bc_seals_half_cheetah_best_hp_eval.json
similarity index 100%
rename from benchmarking/example_bc_seals_half_cheetah_best_hp_eval.json
rename to benchmarking/bc_seals_half_cheetah_best_hp_eval.json
diff --git a/benchmarking/example_bc_seals_hopper_best_hp_eval.json b/benchmarking/bc_seals_hopper_best_hp_eval.json
similarity index 100%
rename from benchmarking/example_bc_seals_hopper_best_hp_eval.json
rename to benchmarking/bc_seals_hopper_best_hp_eval.json
diff --git a/benchmarking/example_bc_seals_swimmer_best_hp_eval.json b/benchmarking/bc_seals_swimmer_best_hp_eval.json
similarity index 100%
rename from benchmarking/example_bc_seals_swimmer_best_hp_eval.json
rename to benchmarking/bc_seals_swimmer_best_hp_eval.json
diff --git a/benchmarking/example_bc_seals_walker_best_hp_eval.json b/benchmarking/bc_seals_walker_best_hp_eval.json
similarity index 100%
rename from benchmarking/example_bc_seals_walker_best_hp_eval.json
rename to benchmarking/bc_seals_walker_best_hp_eval.json
diff --git a/benchmarking/example_dagger_seals_ant_best_hp_eval.json b/benchmarking/dagger_seals_ant_best_hp_eval.json
similarity index 100%
rename from benchmarking/example_dagger_seals_ant_best_hp_eval.json
rename to benchmarking/dagger_seals_ant_best_hp_eval.json
diff --git a/benchmarking/example_dagger_seals_half_cheetah_best_hp_eval.json b/benchmarking/dagger_seals_half_cheetah_best_hp_eval.json
similarity index 100%
rename from benchmarking/example_dagger_seals_half_cheetah_best_hp_eval.json
rename to benchmarking/dagger_seals_half_cheetah_best_hp_eval.json
diff --git a/benchmarking/example_dagger_seals_hopper_best_hp_eval.json b/benchmarking/dagger_seals_hopper_best_hp_eval.json
similarity index 100%
rename from benchmarking/example_dagger_seals_hopper_best_hp_eval.json
rename to benchmarking/dagger_seals_hopper_best_hp_eval.json
diff --git a/benchmarking/example_dagger_seals_swimmer_best_hp_eval.json b/benchmarking/dagger_seals_swimmer_best_hp_eval.json
similarity index 100%
rename from benchmarking/example_dagger_seals_swimmer_best_hp_eval.json
rename to benchmarking/dagger_seals_swimmer_best_hp_eval.json
diff --git a/benchmarking/example_dagger_seals_walker_best_hp_eval.json b/benchmarking/dagger_seals_walker_best_hp_eval.json
similarity index 100%
rename from benchmarking/example_dagger_seals_walker_best_hp_eval.json
rename to benchmarking/dagger_seals_walker_best_hp_eval.json
diff --git a/benchmarking/example_gail_seals_ant_best_hp_eval.json b/benchmarking/gail_seals_ant_best_hp_eval.json
similarity index 100%
rename from benchmarking/example_gail_seals_ant_best_hp_eval.json
rename to benchmarking/gail_seals_ant_best_hp_eval.json
diff --git a/benchmarking/example_gail_seals_half_cheetah_best_hp_eval.json b/benchmarking/gail_seals_half_cheetah_best_hp_eval.json
similarity index 100%
rename from benchmarking/example_gail_seals_half_cheetah_best_hp_eval.json
rename to benchmarking/gail_seals_half_cheetah_best_hp_eval.json
diff --git a/benchmarking/example_gail_seals_hopper_best_hp_eval.json b/benchmarking/gail_seals_hopper_best_hp_eval.json
similarity index 100%
rename from benchmarking/example_gail_seals_hopper_best_hp_eval.json
rename to benchmarking/gail_seals_hopper_best_hp_eval.json
diff --git a/benchmarking/example_gail_seals_swimmer_best_hp_eval.json b/benchmarking/gail_seals_swimmer_best_hp_eval.json
similarity index 100%
rename from benchmarking/example_gail_seals_swimmer_best_hp_eval.json
rename to benchmarking/gail_seals_swimmer_best_hp_eval.json
diff --git a/benchmarking/example_gail_seals_walker_best_hp_eval.json b/benchmarking/gail_seals_walker_best_hp_eval.json
similarity index 100%
rename from benchmarking/example_gail_seals_walker_best_hp_eval.json
rename to benchmarking/gail_seals_walker_best_hp_eval.json
diff --git a/benchmarking/tuning.py b/benchmarking/tuning.py
index 0c18b1256..324032088 100644
--- a/benchmarking/tuning.py
+++ b/benchmarking/tuning.py
@@ -30,7 +30,7 @@ def tune(
             Set to 0 to disable evaluation.
 
     Raises:
-        ValueError: If no trials are returned by.
+        ValueError: If no trials are returned by the parallel run of tuning.
     """
     run = parallel_ex.run(config_updates=parallel_run_config)
     experiment_analysis = run.result
@@ -54,9 +54,10 @@ def tune(
         # Uses the default values (cpu=1) if it is not provided
         if "cpu" in parallel_run_config["resources_per_trial"]:
             resources_per_trial_eval["cpu"] *= eval_best_trial_resource_multiplier
-        evaluate_best_trial(
+        evaluate_trial(
             best_trial,
             num_eval_seeds,
+            parallel_run_config["run_name"] + "_best_hp_eval",
             parallel_run_config,
             resources_per_trial_eval,
             return_key,
@@ -107,19 +108,21 @@ def find_best_trial(
     return best_trial
 
 
-def evaluate_best_trial(
-    best_trial: ray.tune.experiment.Trial,
+def evaluate_trial(
+    trial: ray.tune.experiment.Trial,
     num_eval_seeds: int,
+    run_name: str,
     parallel_run_config: Dict[str, Any],
     resources_per_trial: Dict[str, int],
     return_key: str,
     print_return: bool = False,
 ):
-    """Evaluate the best trial of a parallel run on a separate set of seeds.
+    """Evaluate a given trial of a parallel run on a separate set of seeds.
 
     Args:
-        best_trial: The trial with the best mean return across all seeds.
+        trial: The trial to evaluate.
         num_eval_seeds: Number of distinct seeds to evaluate the best trial on.
+        run_name: The name of the evaluation run.
         parallel_run_config: Dictionary of arguments passed to the parallel
             script to get best_trial.
         resources_per_trial: Resources to be used for each evaluation trial.
@@ -129,15 +132,15 @@ def evaluate_best_trial(
     Returns:
         eval_run: The result of the evaluation run.
     """
-    best_config = best_trial.config
-    best_config["config_updates"].update(
+    config = trial.config
+    config["config_updates"].update(
         seed=ray.tune.grid_search(list(range(100, 100 + num_eval_seeds))),
     )
     eval_config_updates = parallel_run_config.copy()
     eval_config_updates.update(
-        run_name=parallel_run_config["run_name"] + "_best_hp_eval",
+        run_name=run_name,
         num_samples=1,
-        search_space=best_config,
+        search_space=config,
         resources_per_trial=resources_per_trial,
         search_alg=None,
         repeat=1,
diff --git a/benchmarking/tuning_config.py b/benchmarking/tuning_config.py
index 187963d02..239537406 100644
--- a/benchmarking/tuning_config.py
+++ b/benchmarking/tuning_config.py
@@ -4,14 +4,14 @@
 import sacred
 from torch import nn
 
-from imitation.algorithms import dagger
+from imitation.algorithms import dagger as dagger_alg
 from imitation.scripts.parallel import parallel_ex
 
 tuning_ex = sacred.Experiment("tuning", ingredients=[parallel_ex])
 
 
 @tuning_ex.named_config
-def example_rl():
+def rl():
     parallel_run_config = dict(
         sacred_ex_name="train_rl",
         run_name="rl_tuning",
@@ -37,7 +37,7 @@ def example_rl():
 
 
 @tuning_ex.named_config
-def example_bc():
+def bc():
     parallel_run_config = dict(
         sacred_ex_name="train_imitation",
         run_name="bc_tuning",
@@ -71,7 +71,7 @@ def example_bc():
 
 
 @tuning_ex.named_config
-def example_dagger():
+def dagger():
     parallel_run_config = dict(
         sacred_ex_name="train_imitation",
         run_name="dagger_tuning",
@@ -95,8 +95,11 @@ def example_dagger():
                 ),
                 "dagger": dict(
                     beta_schedule=tune.choice(
-                        [dagger.LinearBetaSchedule(i) for i in [1, 5, 15]]
-                        + [dagger.ExponentialBetaSchedule(i) for i in [0.3, 0.5, 0.7]],
+                        [dagger_alg.LinearBetaSchedule(i) for i in [1, 5, 15]]
+                        + [
+                            dagger_alg.ExponentialBetaSchedule(i)
+                            for i in [0.3, 0.5, 0.7]
+                        ],
                     ),
                     rollout_round_min_episodes=tune.choice([3, 5, 10]),
                 ),
@@ -111,7 +114,7 @@ def example_dagger():
 
 
 @tuning_ex.named_config
-def example_gail():
+def gail():
     parallel_run_config = dict(
         sacred_ex_name="train_adversarial",
         run_name="gail_tuning_hc",
@@ -146,7 +149,7 @@ def example_gail():
 
 
 @tuning_ex.named_config
-def example_airl():
+def airl():
     parallel_run_config = dict(
         sacred_ex_name="train_adversarial",
         run_name="airl_tuning",
@@ -181,7 +184,7 @@ def example_airl():
 
 
 @tuning_ex.named_config
-def example_pc():
+def pc():
     parallel_run_config = dict(
         sacred_ex_name="train_preference_comparisons",
         run_name="pc_tuning",
diff --git a/benchmarking/util.py b/benchmarking/util.py
index 408f0d812..88416344d 100644
--- a/benchmarking/util.py
+++ b/benchmarking/util.py
@@ -79,7 +79,7 @@ def clean_config_file(file: pathlib.Path, write_path: pathlib.Path, /) -> None:
 
     remove_empty_dicts(config)
     # files are of the format
-    # /path/to/file/example_<algo>_<env>_best_hp_eval/<other_info>/sacred/1/config.json
+    # /path/to/file/<algo>_<env>_best_hp_eval/<other_info>/sacred/1/config.json
     # we want to write to /<write_path>/<algo>_<env>.json
     with open(write_path / f"{file.parents[3].name}.json", "w") as f:
         json.dump(config, f, indent=4)
diff --git a/experiments/commands.py b/experiments/commands.py
index 2ac737e06..9021d3738 100644
--- a/experiments/commands.py
+++ b/experiments/commands.py
@@ -22,13 +22,13 @@
 python -m imitation.scripts.train_adversarial airl \
     --capture=sys --name=run0 \
     --file_storage=output/sacred/$USER-cmd-run0-airl-0-a3531726 \
-    with ../benchmarking/example_airl_seals_walker_best_hp_eval.json \
+    with ../benchmarking/airl_seals_walker_best_hp_eval.json \
     seed=0 logging.log_root=output
 
 python -m imitation.scripts.train_adversarial gail \
     --capture=sys --name=run0 \
     --file_storage=output/sacred/$USER-cmd-run0-gail-0-a1ec171b \
-    with ../benchmarking/example_gail_seals_walker_best_hp_eval.json \
+    with ../benchmarking/gail_seals_walker_best_hp_eval.json \
     seed=0 logging.log_root=output
 
 We can execute commands in parallel by piping them to GNU parallel:
@@ -42,7 +42,7 @@
 
 python commands.py \
     --name=run0 \
-    --cfg_pattern=../benchmarking/example_bc_seals_half_cheetah_best_hp_eval.json \
+    --cfg_pattern=../benchmarking/bc_seals_half_cheetah_best_hp_eval.json \
     --output_dir=/data/output \
     --remote
 
@@ -52,7 +52,7 @@
     --command "python -m imitation.scripts.train_imitation bc \
     --capture=sys --name=run0 \
     --file_storage=/data/output/sacred/$USER-cmd-run0-bc-0-72cb1df3 \
-    with /data/imitation/benchmarking/example_bc_seals_half_cheetah_best_hp_eval.json \
+    with /data/imitation/benchmarking/bc_seals_half_cheetah_best_hp_eval.json \
     seed=0 logging.log_root=/data/output" \
     --container hacobe/devbox:imitation \
     --login --force-pull --never-restart --gpu 0 --shared-host-dir-mount /data
@@ -177,19 +177,19 @@ def parse() -> argparse.Namespace:
     parser.add_argument(
         "--cfg_pattern",
         type=str,
-        default="example_bc_seals_half_cheetah_best_hp_eval.json",
+        default="bc_seals_half_cheetah_best_hp_eval.json",
         help="""Generate a command for every file that matches this glob pattern. \
 Each matching file should be a config file that has its algorithm name \
 (bc, dagger, airl or gail) bookended by underscores in the filename. \
 If the --remote flag is enabled, then generate a command for every file in the \
 --remote_cfg_dir directory that has the same filename as a file that matches this \
 glob pattern. E.g., suppose the current, local working directory is 'foo' and \
-the subdirectory 'foo/bar' contains the config files 'example_bc_best.json' and \
-'example_dagger_best.json'. If the pattern 'bar/*.json' is supplied, then globbing \
-will return ['bar/example_bc_best.json', 'bar/example_dagger_best.json']. \
+the subdirectory 'foo/bar' contains the config files 'bc_best.json' and \
+'dagger_best.json'. If the pattern 'bar/*.json' is supplied, then globbing \
+will return ['bar/bc_best.json', 'bar/dagger_best.json']. \
 If the --remote flag is enabled, 'bar' will be replaced with `remote_cfg_dir` and \
 commands will be created for the following configs: \
-[`remote_cfg_dir`/example_bc_best.json, `remote_cfg_dir`/example_dagger_best.json] \
+[`remote_cfg_dir`/bc_best.json, `remote_cfg_dir`/dagger_best.json] \
 Why not just supply the pattern '`remote_cfg_dir`/*.json' directly? \
 Because the `remote_cfg_dir` directory may not exist on the local machine.""",
     )
diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py
index e81a617db..a591f3d9a 100644
--- a/src/imitation/scripts/config/parallel.py
+++ b/src/imitation/scripts/config/parallel.py
@@ -7,10 +7,8 @@
 Adding custom named configs is necessary because the CLI interface can't add
 search spaces to the config like `"seed": tune.choice([0, 1, 2, 3])`.
 
-For tuning hyperparameters of an algorithm on a given environment, override
-the `base_named_configs` argument with the named config of the environment.
-Ex: python -m imitation.scripts.parallel with example_gail \
-    'base_named_configs=["logging.wandb_logging", "seals_half_cheetah"]'
+For tuning hyperparameters of an algorithm on a given environment,
+check out the benchmarking/tuning.py script.
 """
 
 import numpy as np
diff --git a/tests/test_benchmarking.py b/tests/test_benchmarking.py
index 4a8f6ea6f..18d4f12cf 100644
--- a/tests/test_benchmarking.py
+++ b/tests/test_benchmarking.py
@@ -37,7 +37,7 @@ def test_benchmarks_print_config_succeeds(algorithm: str, environment: str):
 
     config_name = f"{algorithm}_{environment}"
     config_file = str(
-        BENCHMARKING_DIR / f"example_{algorithm}_{environment}_best_hp_eval.json",
+        BENCHMARKING_DIR / f"{algorithm}_{environment}_best_hp_eval.json",
     )
 
     # WHEN
@@ -66,7 +66,7 @@ def test_tuning_print_config_succeeds(algorithm: str):
             tuning_path,
             "print_config",
             "with",
-            f"example_{algorithm}",
+            f"{algorithm}",
             env,
         ],
     )
diff --git a/tests/test_experiments.py b/tests/test_experiments.py
index 0f6d314fe..0d431d0e9 100644
--- a/tests/test_experiments.py
+++ b/tests/test_experiments.py
@@ -245,13 +245,13 @@ def test_commands_hofvarpnir_config_with_special_characters_in_flags(tmpdir):
 def test_commands_bc_config():
     if os.name == "nt":  # pragma: no cover
         pytest.skip("commands.py not ported to Windows.")
-    cfg_pattern = _get_benchmarking_path("example_bc_seals_ant_best_hp_eval.json")
+    cfg_pattern = _get_benchmarking_path("bc_seals_ant_best_hp_eval.json")
     commands = _run_commands_from_flags(cfg_pattern=cfg_pattern)
     assert len(commands) == 1
     expected = """python -m imitation.scripts.train_imitation bc \
 --capture=sys --name=run0 --file_storage=output/sacred/\
 $USER-cmd-run0-bc-0-138a1475 \
-with benchmarking/example_bc_seals_ant_best_hp_eval.json \
+with benchmarking/bc_seals_ant_best_hp_eval.json \
 seed=0 logging.log_root=output"""
     assert commands[0] == expected
 
@@ -259,13 +259,13 @@ def test_commands_bc_config():
 def test_commands_dagger_config():
     if os.name == "nt":  # pragma: no cover
         pytest.skip("commands.py not ported to Windows.")
-    cfg_pattern = _get_benchmarking_path("example_dagger_seals_ant_best_hp_eval.json")
+    cfg_pattern = _get_benchmarking_path("dagger_seals_ant_best_hp_eval.json")
     commands = _run_commands_from_flags(cfg_pattern=cfg_pattern)
     assert len(commands) == 1
     expected = """python -m imitation.scripts.train_imitation dagger \
 --capture=sys --name=run0 --file_storage=output/sacred/\
 $USER-cmd-run0-dagger-0-6a49161a \
-with benchmarking/example_dagger_seals_ant_best_hp_eval.json \
+with benchmarking/dagger_seals_ant_best_hp_eval.json \
 seed=0 logging.log_root=output"""
     assert commands[0] == expected
 
@@ -273,13 +273,13 @@ def test_commands_dagger_config():
 def test_commands_gail_config():
     if os.name == "nt":  # pragma: no cover
         pytest.skip("commands.py not ported to Windows.")
-    cfg_pattern = _get_benchmarking_path("example_gail_seals_ant_best_hp_eval.json")
+    cfg_pattern = _get_benchmarking_path("gail_seals_ant_best_hp_eval.json")
     commands = _run_commands_from_flags(cfg_pattern=cfg_pattern)
     assert len(commands) == 1
     expected = """python -m imitation.scripts.train_adversarial gail \
 --capture=sys --name=run0 --file_storage=output/sacred/\
 $USER-cmd-run0-gail-0-3ec8154d \
-with benchmarking/example_gail_seals_ant_best_hp_eval.json \
+with benchmarking/gail_seals_ant_best_hp_eval.json \
 seed=0 logging.log_root=output"""
     assert commands[0] == expected
 
@@ -287,13 +287,13 @@ def test_commands_gail_config():
 def test_commands_airl_config():
     if os.name == "nt":  # pragma: no cover
         pytest.skip("commands.py not ported to Windows.")
-    cfg_pattern = _get_benchmarking_path("example_airl_seals_ant_best_hp_eval.json")
+    cfg_pattern = _get_benchmarking_path("airl_seals_ant_best_hp_eval.json")
     commands = _run_commands_from_flags(cfg_pattern=cfg_pattern)
     assert len(commands) == 1
     expected = """python -m imitation.scripts.train_adversarial airl \
 --capture=sys --name=run0 \
 --file_storage=output/sacred/$USER-cmd-run0-airl-0-400e1558 \
-with benchmarking/example_airl_seals_ant_best_hp_eval.json \
+with benchmarking/airl_seals_ant_best_hp_eval.json \
 seed=0 logging.log_root=output"""
     assert commands[0] == expected
 

From 4ff006d1f2162c8f5085c1f824a19090846dd23c Mon Sep 17 00:00:00 2001
From: taufeeque9 <9taufeeque9@gmail.com>
Date: Tue, 18 Jul 2023 12:06:30 +0530
Subject: [PATCH 27/47] Fix file name test errors

---
 experiments/commands.py   | 2 +-
 tests/test_experiments.py | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/experiments/commands.py b/experiments/commands.py
index 9021d3738..738a55011 100644
--- a/experiments/commands.py
+++ b/experiments/commands.py
@@ -85,7 +85,7 @@ def _get_algo_name(cfg_file: str) -> str:
     """Get the algorithm name from the given config filename."""
     algo_names = set()
     for key in _ALGO_NAME_TO_SCRIPT_NAME:
-        if cfg_file.find("_" + key + "_") != -1:
+        if cfg_file.find(key + "_") != -1:
             algo_names.add(key)
 
     if len(algo_names) == 0:
diff --git a/tests/test_experiments.py b/tests/test_experiments.py
index 0d431d0e9..b2417a9f9 100644
--- a/tests/test_experiments.py
+++ b/tests/test_experiments.py
@@ -250,7 +250,7 @@ def test_commands_bc_config():
     assert len(commands) == 1
     expected = """python -m imitation.scripts.train_imitation bc \
 --capture=sys --name=run0 --file_storage=output/sacred/\
-$USER-cmd-run0-bc-0-138a1475 \
+$USER-cmd-run0-bc-0-78e5112a \
 with benchmarking/bc_seals_ant_best_hp_eval.json \
 seed=0 logging.log_root=output"""
     assert commands[0] == expected
@@ -264,7 +264,7 @@ def test_commands_dagger_config():
     assert len(commands) == 1
     expected = """python -m imitation.scripts.train_imitation dagger \
 --capture=sys --name=run0 --file_storage=output/sacred/\
-$USER-cmd-run0-dagger-0-6a49161a \
+$USER-cmd-run0-dagger-0-c27812cf \
 with benchmarking/dagger_seals_ant_best_hp_eval.json \
 seed=0 logging.log_root=output"""
     assert commands[0] == expected
@@ -278,7 +278,7 @@ def test_commands_gail_config():
     assert len(commands) == 1
     expected = """python -m imitation.scripts.train_adversarial gail \
 --capture=sys --name=run0 --file_storage=output/sacred/\
-$USER-cmd-run0-gail-0-3ec8154d \
+$USER-cmd-run0-gail-0-9d8d1202 \
 with benchmarking/gail_seals_ant_best_hp_eval.json \
 seed=0 logging.log_root=output"""
     assert commands[0] == expected
@@ -292,7 +292,7 @@ def test_commands_airl_config():
     assert len(commands) == 1
     expected = """python -m imitation.scripts.train_adversarial airl \
 --capture=sys --name=run0 \
---file_storage=output/sacred/$USER-cmd-run0-airl-0-400e1558 \
+--file_storage=output/sacred/$USER-cmd-run0-airl-0-9ed3120d \
 with benchmarking/airl_seals_ant_best_hp_eval.json \
 seed=0 logging.log_root=output"""
     assert commands[0] == expected

From 6933afacb22c555fcd70a833041bd716d2d78807 Mon Sep 17 00:00:00 2001
From: taufeeque9 <9taufeeque9@gmail.com>
Date: Wed, 19 Jul 2023 14:41:39 +0530
Subject: [PATCH 28/47] Add tune_run_kwargs in parallel script

---
 src/imitation/scripts/config/parallel.py |  3 --
 src/imitation/scripts/parallel.py        | 39 +++++++++++-------------
 2 files changed, 17 insertions(+), 25 deletions(-)

diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py
index a591f3d9a..4773b713e 100644
--- a/src/imitation/scripts/config/parallel.py
+++ b/src/imitation/scripts/config/parallel.py
@@ -34,13 +34,10 @@ def config():
         "config_updates": {},
     }  # `config` argument to `ray.tune.run(trainable, config)`
 
-    local_dir = None  # `local_dir` arg for `ray.tune.run`
-    upload_dir = None  # `upload_dir` arg for `ray.tune.run`
     num_samples = 1  # Number of samples per grid search configuration
     repeat = 1  # Number of times to repeat a sampled configuration
     search_alg = "optuna"  # search algorithm to use
     experiment_checkpoint_path = ""  # Path to checkpoint of experiment
-    syncer = None  # Sacred syncer to use
 
 
 # Debug named configs
diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py
index 93aa932b9..7bf3db16f 100644
--- a/src/imitation/scripts/parallel.py
+++ b/src/imitation/scripts/parallel.py
@@ -26,12 +26,9 @@ def parallel(
     base_config_updates: Mapping[str, Any],
     resources_per_trial: Dict[str, Any],
     init_kwargs: Mapping[str, Any],
-    local_dir: Optional[str],
-    upload_dir: Optional[str],
     repeat: int,
-    search_alg: Optional[str],
     experiment_checkpoint_path: str,
-    syncer,
+    tune_run_kwargs: Dict[str, Any],
 ) -> ray.tune.ExperimentAnalysis:
     """Parallelize multiple runs of another Sacred Experiment using Ray Tune.
 
@@ -70,17 +67,13 @@ def parallel(
             generated Ray directory name, unlike config updates from `search_space`.
         resources_per_trial: Argument to `ray.tune.run()`.
         init_kwargs: Arguments to pass to `ray.init`.
-        local_dir: `local_dir` argument to `ray.tune.run()`.
-        upload_dir: `upload_dir` argument to `ray.tune.run()`.
-        search_alg: can be either "optuna" or None. Setting `None` allows for
-            adding grid_search to the `search_space` hyperparameters but doesn't allow
-            for trials to be repeated.
         repeat: Number of runs to repeat each trial for.
-            Not used if `search_alg` is None.
+            If `repeat` > 1, then optuna is used as the default search algorithm
+            unless specified otherwise in `tune_run_kwargs`.
         experiment_checkpoint_path: Path containing the checkpoints of a previous
             experiment ran using this script. Useful for evaluating the best trial
             of the experiment.
-        syncer: `syncer` argument to `ray.tune.syncer.SyncConfig`.
+        tune_run_kwargs: Other arguments to pass to `ray.tune.run()`.
 
     Raises:
         TypeError: Named configs not string sequences or config updates not mappings.
@@ -118,11 +111,18 @@ def parallel(
     )
 
     ray.init(**init_kwargs)
-    if search_alg == "optuna":
-        algo = search.Repeater(optuna.OptunaSearch(), repeat=repeat)
-    else:
-        assert repeat == 1  # repeat should not be used if search_alg is None
-        algo = None
+    if repeat > 1:
+        if "search_alg" not in tune_run_kwargs:
+            tune_run_kwargs["search_alg"] = optuna.OptunaSearch()
+        try:
+            algo = tune_run_kwargs["search_alg"]
+            algo = search.Repeater(algo, repeat)
+            tune_run_kwargs["search_alg"] = algo
+        except AttributeError:
+            raise ValueError(
+                "repeat > 1 but search_alg is not an instance of "
+                "ray.tune.search.SearchAlgorithm",
+            )
 
     if sacred_ex_name == "train_rl":
         return_key = "monitor_return_mean"
@@ -145,15 +145,10 @@ def parallel(
                 config=search_space,
                 num_samples=num_samples * repeat,
                 name=run_name,
-                local_dir=local_dir,
                 resources_per_trial=resources_per_trial,
-                sync_config=ray.tune.syncer.SyncConfig(
-                    upload_dir=upload_dir,
-                    syncer=syncer,
-                ),
-                search_alg=algo,
                 metric=return_key,
                 mode="max",
+                **tune_run_kwargs,
             )
         return result
     finally:

From 77f9d9b74ddcb42e9181f9f493ca2f144b6a443f Mon Sep 17 00:00:00 2001
From: taufeeque9 <9taufeeque9@gmail.com>
Date: Wed, 19 Jul 2023 16:10:15 +0530
Subject: [PATCH 29/47] Add tune_run_kwargs default and deep-copy it before updating

---
 src/imitation/scripts/config/parallel.py |  1 +
 src/imitation/scripts/parallel.py        | 11 ++++++-----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py
index 4773b713e..bdc591422 100644
--- a/src/imitation/scripts/config/parallel.py
+++ b/src/imitation/scripts/config/parallel.py
@@ -38,6 +38,7 @@ def config():
     repeat = 1  # Number of times to repeat a sampled configuration
     search_alg = "optuna"  # search algorithm to use
     experiment_checkpoint_path = ""  # Path to checkpoint of experiment
+    tune_run_kwargs = {}  # Additional kwargs to pass to `tune.run`
 
 
 # Debug named configs
diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py
index 7bf3db16f..65a72eae3 100644
--- a/src/imitation/scripts/parallel.py
+++ b/src/imitation/scripts/parallel.py
@@ -111,13 +111,14 @@ def parallel(
     )
 
     ray.init(**init_kwargs)
+    updated_tune_run_kwargs = copy.deepcopy(tune_run_kwargs)
     if repeat > 1:
-        if "search_alg" not in tune_run_kwargs:
-            tune_run_kwargs["search_alg"] = optuna.OptunaSearch()
+        if "search_alg" not in updated_tune_run_kwargs:
+            updated_tune_run_kwargs["search_alg"] = optuna.OptunaSearch()
         try:
-            algo = tune_run_kwargs["search_alg"]
+            algo = updated_tune_run_kwargs["search_alg"]
             algo = search.Repeater(algo, repeat)
-            tune_run_kwargs["search_alg"] = algo
+            updated_tune_run_kwargs["search_alg"] = algo
         except AttributeError:
             raise ValueError(
                 "repeat > 1 but search_alg is not an instance of "
@@ -148,7 +149,7 @@ def parallel(
                 resources_per_trial=resources_per_trial,
                 metric=return_key,
                 mode="max",
-                **tune_run_kwargs,
+                **updated_tune_run_kwargs,
             )
         return result
     finally:

From 54eb8a6f44ea599236b6165fa5de9079df7ca49a Mon Sep 17 00:00:00 2001
From: taufeeque9 <9taufeeque9@gmail.com>
Date: Wed, 19 Jul 2023 16:31:49 +0530
Subject: [PATCH 30/47] Pass local_dir via tune_run_kwargs in test_analyze_gather_tb

---
 tests/scripts/test_scripts.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/scripts/test_scripts.py b/tests/scripts/test_scripts.py
index e17765471..146048c42 100644
--- a/tests/scripts/test_scripts.py
+++ b/tests/scripts/test_scripts.py
@@ -969,7 +969,10 @@ def test_analyze_gather_tb(tmpdir: str):
     if os.name == "nt":  # pragma: no cover
         pytest.skip("gather_tb uses symlinks: not supported by Windows")
     num_runs = 2
-    config_updates: Dict[str, Any] = dict(local_dir=tmpdir, run_name="test")
+    config_updates: Dict[str, Any] = dict(
+        tune_run_kwargs=dict(local_dir=tmpdir),
+        run_name="test",
+    )
     config_updates.update(PARALLEL_CONFIG_LOW_RESOURCE)
     config_updates.update(num_samples=num_runs)
     parallel_run = parallel.parallel_ex.run(

From d50238f1b900b05296d081954624cac9e2bcf6ab Mon Sep 17 00:00:00 2001
From: taufeeque9 <9taufeeque9@gmail.com>
Date: Wed, 19 Jul 2023 17:02:37 +0530
Subject: [PATCH 31/47] Fix lint: drop unused Optional import, document ValueError

---
 src/imitation/scripts/parallel.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py
index 65a72eae3..a7a08064b 100644
--- a/src/imitation/scripts/parallel.py
+++ b/src/imitation/scripts/parallel.py
@@ -4,7 +4,7 @@
 import copy
 import glob
 import pathlib
-from typing import Any, Callable, Dict, Mapping, Optional, Sequence
+from typing import Any, Callable, Dict, Mapping, Sequence
 
 import ray
 import ray.tune
@@ -77,6 +77,8 @@ def parallel(
 
     Raises:
         TypeError: Named configs not string sequences or config updates not mappings.
+        ValueError: `repeat` > 1 but `search_alg` is not an instance of
+            `ray.tune.search.SearchAlgorithm`.
 
     Returns:
         The result of running the parallel experiment with `ray.tune.run()`.

From 3fe22d4e6904c60c581a69004788b08b0184c8ed Mon Sep 17 00:00:00 2001
From: taufeeque9 <9taufeeque9@gmail.com>
Date: Wed, 19 Jul 2023 21:37:18 +0530
Subject: [PATCH 32/47] Updates from review

---
 benchmarking/tuning.py                   | 21 +++++++++++++++------
 src/imitation/scripts/config/parallel.py |  1 -
 src/imitation/scripts/parallel.py        |  2 +-
 tests/scripts/test_scripts.py            |  1 -
 4 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/benchmarking/tuning.py b/benchmarking/tuning.py
index 324032088..409d0b5af 100644
--- a/benchmarking/tuning.py
+++ b/benchmarking/tuning.py
@@ -7,6 +7,7 @@
 import numpy as np
 import ray
 from pandas.api import types as pd_types
+from ray.tune.search import optuna
 from sacred.observers import FileStorageObserver
 from tuning_config import parallel_ex, tuning_ex
 
@@ -32,7 +33,15 @@ def tune(
     Raises:
         ValueError: If no trials are returned by the parallel run of tuning.
     """
-    run = parallel_ex.run(config_updates=parallel_run_config)
+    search_alg = optuna.OptunaSearch()
+    updated_parallel_run_config = copy.deepcopy(parallel_run_config)
+    if "tune_run_kwargs" not in updated_parallel_run_config:
+        tune_run_kwargs = {}
+    else:
+        tune_run_kwargs = updated_parallel_run_config["tune_run_kwargs"]
+    tune_run_kwargs.update(search_alg=search_alg)
+    updated_parallel_run_config.update(tune_run_kwargs=tune_run_kwargs)
+    run = parallel_ex.run(config_updates=updated_parallel_run_config)
     experiment_analysis = run.result
     if not experiment_analysis.trials:
         raise ValueError(
@@ -42,23 +51,23 @@ def tune(
         )
 
     return_key = "imit_stats/monitor_return_mean"
-    if parallel_run_config["sacred_ex_name"] == "train_rl":
+    if updated_parallel_run_config["sacred_ex_name"] == "train_rl":
         return_key = "monitor_return_mean"
     best_trial = find_best_trial(experiment_analysis, return_key, print_return=True)
 
     if num_eval_seeds > 0:  # evaluate the best trial
         resources_per_trial_eval = copy.deepcopy(
-            parallel_run_config["resources_per_trial"],
+            updated_parallel_run_config["resources_per_trial"],
         )
         # update cpus per trial only if it is provided in `resources_per_trial`
         # Uses the default values (cpu=1) if it is not provided
-        if "cpu" in parallel_run_config["resources_per_trial"]:
+        if "cpu" in updated_parallel_run_config["resources_per_trial"]:
             resources_per_trial_eval["cpu"] *= eval_best_trial_resource_multiplier
         evaluate_trial(
             best_trial,
             num_eval_seeds,
-            parallel_run_config["run_name"] + "_best_hp_eval",
-            parallel_run_config,
+            updated_parallel_run_config["run_name"] + "_best_hp_eval",
+            updated_parallel_run_config,
             resources_per_trial_eval,
             return_key,
         )
diff --git a/src/imitation/scripts/config/parallel.py b/src/imitation/scripts/config/parallel.py
index bdc591422..c9c898feb 100644
--- a/src/imitation/scripts/config/parallel.py
+++ b/src/imitation/scripts/config/parallel.py
@@ -36,7 +36,6 @@ def config():
 
     num_samples = 1  # Number of samples per grid search configuration
     repeat = 1  # Number of times to repeat a sampled configuration
-    search_alg = "optuna"  # search algorithm to use
     experiment_checkpoint_path = ""  # Path to checkpoint of experiment
     tune_run_kwargs = {}  # Additional kwargs to pass to `tune.run`
 
diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py
index a7a08064b..57503d6e0 100644
--- a/src/imitation/scripts/parallel.py
+++ b/src/imitation/scripts/parallel.py
@@ -34,7 +34,7 @@ def parallel(
 
     A Sacred FileObserver is attached to the inner experiment and writes Sacred
     logs to "{RAY_LOCAL_DIR}/sacred/". These files are automatically copied over
-    to `upload_dir` if that argument is provided.
+    to `upload_dir` if that argument is provided in `tune_run_kwargs`.
 
     Args:
         sacred_ex_name: The Sacred experiment to tune. Either "train_rl" or
diff --git a/tests/scripts/test_scripts.py b/tests/scripts/test_scripts.py
index 146048c42..7ff241323 100644
--- a/tests/scripts/test_scripts.py
+++ b/tests/scripts/test_scripts.py
@@ -802,7 +802,6 @@ def test_train_rl_cnn_policy(tmpdir: str, rng):
             # Need absolute path because raylet runs in different working directory.
             "demonstrations.path": CARTPOLE_TEST_ROLLOUT_PATH.absolute(),
         },
-        search_alg=None,  # Use default search algorithm of ray.
         search_space={
             "command_name": "airl",
             "config_updates": {"total_timesteps": tune.choice([5, 10])},

From c50aa20ddfa9f7ce5987a3fd08083d22757925a7 Mon Sep 17 00:00:00 2001
From: taufeeque9 <9taufeeque9@gmail.com>
Date: Thu, 20 Jul 2023 16:19:04 +0530
Subject: [PATCH 33/47] Simplify few lines of code

---
 benchmarking/tuning.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/benchmarking/tuning.py b/benchmarking/tuning.py
index 409d0b5af..9c3f52498 100644
--- a/benchmarking/tuning.py
+++ b/benchmarking/tuning.py
@@ -33,14 +33,12 @@ def tune(
     Raises:
         ValueError: If no trials are returned by the parallel run of tuning.
     """
-    search_alg = optuna.OptunaSearch()
     updated_parallel_run_config = copy.deepcopy(parallel_run_config)
-    if "tune_run_kwargs" not in updated_parallel_run_config:
-        tune_run_kwargs = {}
+    search_alg = optuna.OptunaSearch()
+    if "tune_run_kwargs" in updated_parallel_run_config:
+        updated_parallel_run_config["tune_run_kwargs"]["search_alg"] = search_alg
     else:
-        tune_run_kwargs = updated_parallel_run_config["tune_run_kwargs"]
-    tune_run_kwargs.update(search_alg=search_alg)
-    updated_parallel_run_config.update(tune_run_kwargs=tune_run_kwargs)
+        updated_parallel_run_config["tune_run_kwargs"] = dict(search_alg=search_alg)
     run = parallel_ex.run(config_updates=updated_parallel_run_config)
     experiment_analysis = run.result
     if not experiment_analysis.trials:

From 000af616fb159c165f4806df11d865ee2a6b3663 Mon Sep 17 00:00:00 2001
From: taufeeque9 <9taufeeque9@gmail.com>
Date: Fri, 4 Aug 2023 21:54:48 +0530
Subject: [PATCH 34/47] Updates from review

---
 benchmarking/README.md                           |  3 ++-
 src/imitation/scripts/analyze.py                 |  3 ---
 .../scripts/config/train_adversarial.py          |  4 ++++
 .../config/train_preference_comparisons.py       |  4 ++++
 src/imitation/scripts/config/train_rl.py         |  5 +++++
 src/imitation/scripts/parallel.py                | 16 +++++++---------
 tests/scripts/test_scripts.py                    |  3 +++
 7 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/benchmarking/README.md b/benchmarking/README.md
index 3973c6181..ba89da69d 100644
--- a/benchmarking/README.md
+++ b/benchmarking/README.md
@@ -15,7 +15,8 @@ python -m imitation.scripts.<train_script> <algo> with benchmarking/<config_name
 
 ```python
 ...
-ex.add_config('benchmarking/<config_name>.json')
+from imitation.scripts.<train_script> import <train_ex>
+<train_ex>.run(command_name="<algo>", named_configs=["benchmarking/<config_name>.json"])
 ```
 
 # Tuning Hyperparameters
diff --git a/src/imitation/scripts/analyze.py b/src/imitation/scripts/analyze.py
index 8977fed47..96b34bd6e 100644
--- a/src/imitation/scripts/analyze.py
+++ b/src/imitation/scripts/analyze.py
@@ -166,9 +166,6 @@ def _get_algo_name(sd: sacred_util.SacredDicts) -> str:
 
 def _return_summaries(sd: sacred_util.SacredDicts) -> dict:
     imit_stats = get(sd.run, "result.imit_stats")
-    if imit_stats is None:
-        # stored in rollout key for preference comparison
-        imit_stats = get(sd.run, "result.rollout")
     expert_stats = get(sd.run, "result.expert_stats")
 
     expert_return_summary = None
diff --git a/src/imitation/scripts/config/train_adversarial.py b/src/imitation/scripts/config/train_adversarial.py
index ef675eab6..acc842095 100644
--- a/src/imitation/scripts/config/train_adversarial.py
+++ b/src/imitation/scripts/config/train_adversarial.py
@@ -8,6 +8,10 @@
 from imitation.scripts.ingredients import logging as logging_ingredient
 from imitation.scripts.ingredients import policy_evaluation, reward, rl
 
+# Note: All the hyperparameter configs in this file use the tuned
+# hyperparameters of the RL algorithm for the respective environment,
+# taken from imitation/scripts/config/train_rl.py
+
 train_adversarial_ex = sacred.Experiment(
     "train_adversarial",
     ingredients=[
diff --git a/src/imitation/scripts/config/train_preference_comparisons.py b/src/imitation/scripts/config/train_preference_comparisons.py
index 4fe9c793e..4d8531732 100644
--- a/src/imitation/scripts/config/train_preference_comparisons.py
+++ b/src/imitation/scripts/config/train_preference_comparisons.py
@@ -8,6 +8,10 @@
 from imitation.scripts.ingredients import logging as logging_ingredient
 from imitation.scripts.ingredients import policy_evaluation, reward, rl
 
+# Note: All the hyperparameter configs in this file use the tuned
+# hyperparameters of the RL algorithm for the respective environment,
+# taken from imitation/scripts/config/train_rl.py
+
 train_preference_comparisons_ex = sacred.Experiment(
     "train_preference_comparisons",
     ingredients=[
diff --git a/src/imitation/scripts/config/train_rl.py b/src/imitation/scripts/config/train_rl.py
index a5475540d..e4ab71da1 100644
--- a/src/imitation/scripts/config/train_rl.py
+++ b/src/imitation/scripts/config/train_rl.py
@@ -8,6 +8,11 @@
 from imitation.scripts.ingredients import logging as logging_ingredient
 from imitation.scripts.ingredients import policy_evaluation, rl
 
+# Note: All the hyperparameter configs in the file are tuned
+# for the PPO algorithm on the respective environment using the
+# RL Baselines Zoo library:
+# https://github.com/HumanCompatibleAI/rl-baselines3-zoo/
+
 train_rl_ex = sacred.Experiment(
     "train_rl",
     ingredients=[
diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py
index 57503d6e0..9f5478a6e 100644
--- a/src/imitation/scripts/parallel.py
+++ b/src/imitation/scripts/parallel.py
@@ -24,7 +24,7 @@ def parallel(
     search_space: Mapping[str, Any],
     base_named_configs: Sequence[str],
     base_config_updates: Mapping[str, Any],
-    resources_per_trial: Dict[str, Any],
+    resources_per_trial: Mapping[str, Any],
     init_kwargs: Mapping[str, Any],
     repeat: int,
     experiment_checkpoint_path: str,
@@ -115,17 +115,15 @@ def parallel(
     ray.init(**init_kwargs)
     updated_tune_run_kwargs = copy.deepcopy(tune_run_kwargs)
     if repeat > 1:
-        if "search_alg" not in updated_tune_run_kwargs:
-            updated_tune_run_kwargs["search_alg"] = optuna.OptunaSearch()
         try:
-            algo = updated_tune_run_kwargs["search_alg"]
-            algo = search.Repeater(algo, repeat)
-            updated_tune_run_kwargs["search_alg"] = algo
-        except AttributeError:
+            # Use optuna as the default search algorithm for repeat runs.
+            algo = tune_run_kwargs.get("search_alg", optuna.OptunaSearch())
+            updated_tune_run_kwargs["search_alg"] = search.Repeater(algo, repeat)
+        except AttributeError as e:
             raise ValueError(
                 "repeat > 1 but search_alg is not an instance of "
                 "ray.tune.search.SearchAlgorithm",
-            )
+            ) from e
 
     if sacred_ex_name == "train_rl":
         return_key = "monitor_return_mean"
@@ -198,7 +196,7 @@ def _ray_tune_sacred_wrapper(
         `ex.run`) and `reporter`. The function returns the run result.
     """
 
-    def inner(config: Dict[str, Any], reporter) -> Mapping[str, Any]:
+    def inner(config: dict[str, Any], reporter) -> Mapping[str, Any]:
         """Trainable function with the correct signature for `ray.tune`.
 
         Args:
diff --git a/tests/scripts/test_scripts.py b/tests/scripts/test_scripts.py
index 7ff241323..b0271d83b 100644
--- a/tests/scripts/test_scripts.py
+++ b/tests/scripts/test_scripts.py
@@ -889,6 +889,9 @@ def test_parallel_train_adversarial_custom_env(tmpdir):
             logging=dict(log_root=tmpdir),
             demonstrations=dict(path=path),
         ),
+        # specifying repeat=2 uses the optuna search algorithm which
+        # requires the search space to be non-empty. So we provide
+        # the command name using tune.choice.
         search_space=dict(command_name=tune.choice(["gail"])),
     )
     config_updates.update(PARALLEL_CONFIG_LOW_RESOURCE)

From 8b551341a89a5008fd5c35e04110710ea746d52a Mon Sep 17 00:00:00 2001
From: taufeeque9 <9taufeeque9@gmail.com>
Date: Fri, 4 Aug 2023 23:11:15 +0530
Subject: [PATCH 35/47] Fix test

---
 .../algorithms/adversarial/common.py          | 37 +++++++++++++++----
 1 file changed, 29 insertions(+), 8 deletions(-)

diff --git a/src/imitation/algorithms/adversarial/common.py b/src/imitation/algorithms/adversarial/common.py
index 62b459a0d..545109b0d 100644
--- a/src/imitation/algorithms/adversarial/common.py
+++ b/src/imitation/algorithms/adversarial/common.py
@@ -2,13 +2,13 @@
 import abc
 import dataclasses
 import logging
-from typing import Callable, Iterable, Iterator, Mapping, Optional, Type, overload
+from typing import Callable, Iterable, Iterator, Mapping, Optional, Type, overload, List
 
 import numpy as np
 import torch as th
 import torch.utils.tensorboard as thboard
 import tqdm
-from stable_baselines3.common import base_class, on_policy_algorithm, policies, vec_env
+from stable_baselines3.common import base_class, on_policy_algorithm, policies, vec_env, callbacks
 from stable_baselines3.sac import policies as sac_policies
 from torch.nn import functional as F
 
@@ -86,6 +86,30 @@ def compute_train_stats(
     }
 
 
+class TrainDiscriminatorCallback(callbacks.BaseCallback):
+    """Callback for training discriminator after collecting rollouts."""
+
+    def __init__(self, adversarial_trainer, *args, **kwargs):
+        """Builds TrainDiscriminatorCallback.
+
+        Args:
+            *args: Passed through to `callbacks.BaseCallback`.
+            **kwargs: Passed through to `callbacks.BaseCallback`.
+        """
+        self.adversarial_trainer = adversarial_trainer
+        super().__init__(*args, **kwargs)
+
+    def _on_step(self) -> bool:
+        return True
+
+    def _on_rollout_end(self) -> None:
+        self.adversarial_trainer.model.train_disc()
+        for _ in range(self.adversarial_trainer.n_disc_updates_per_round):
+            with networks.training(self.adversarial_trainer.reward_train):
+                # switch to training mode (affects dropout, normalization)
+                self.adversarial_trainer.train_disc()
+
+
 class AdversarialTrainer(base.DemonstrationAlgorithm[types.Transitions]):
     """Base class for adversarial imitation learning algorithms like GAIL and AIRL."""
 
@@ -222,16 +246,17 @@ def __init__(
 
         self.venv_buffering = wrappers.BufferingWrapper(self.venv)
 
+        self.disc_trainer_callback = TrainDiscriminatorCallback(self)
         if debug_use_ground_truth:
             # Would use an identity reward fn here, but RewardFns can't see rewards.
             self.venv_wrapped = self.venv_buffering
-            self.gen_callback = None
+            self.gen_callback: List[callbacks.BaseCallback] = [self.disc_trainer_callback]
         else:
             self.venv_wrapped = reward_wrapper.RewardVecEnvWrapper(
                 self.venv_buffering,
                 reward_fn=self.reward_train.predict_processed,
             )
-            self.gen_callback = self.venv_wrapped.make_log_callback()
+            self.gen_callback = [self.venv_wrapped.make_log_callback(), self.disc_trainer_callback]
         self.venv_train = self.venv_wrapped
 
         self.gen_algo.set_env(self.venv_train)
@@ -446,10 +471,6 @@ def train(
         )
         for r in tqdm.tqdm(range(0, n_rounds), desc="round"):
             self.train_gen(self.gen_train_timesteps)
-            for _ in range(self.n_disc_updates_per_round):
-                with networks.training(self.reward_train):
-                    # switch to training mode (affects dropout, normalization)
-                    self.train_disc()
             if callback:
                 callback(r)
             self.logger.dump(self._global_step)

From f3ba2b5ec01331f03295856e4219c68212fc7aee Mon Sep 17 00:00:00 2001
From: taufeeque9 <9taufeeque9@gmail.com>
Date: Fri, 4 Aug 2023 23:13:59 +0530
Subject: [PATCH 36/47] Revert "Fix test"

This reverts commit 8b551341a89a5008fd5c35e04110710ea746d52a.
---
 .../algorithms/adversarial/common.py          | 37 ++++---------------
 1 file changed, 8 insertions(+), 29 deletions(-)

diff --git a/src/imitation/algorithms/adversarial/common.py b/src/imitation/algorithms/adversarial/common.py
index 545109b0d..62b459a0d 100644
--- a/src/imitation/algorithms/adversarial/common.py
+++ b/src/imitation/algorithms/adversarial/common.py
@@ -2,13 +2,13 @@
 import abc
 import dataclasses
 import logging
-from typing import Callable, Iterable, Iterator, Mapping, Optional, Type, overload, List
+from typing import Callable, Iterable, Iterator, Mapping, Optional, Type, overload
 
 import numpy as np
 import torch as th
 import torch.utils.tensorboard as thboard
 import tqdm
-from stable_baselines3.common import base_class, on_policy_algorithm, policies, vec_env, callbacks
+from stable_baselines3.common import base_class, on_policy_algorithm, policies, vec_env
 from stable_baselines3.sac import policies as sac_policies
 from torch.nn import functional as F
 
@@ -86,30 +86,6 @@ def compute_train_stats(
     }
 
 
-class TrainDiscriminatorCallback(callbacks.BaseCallback):
-    """Callback for training discriminator after collecting rollouts."""
-
-    def __init__(self, adversarial_trainer, *args, **kwargs):
-        """Builds TrainDiscriminatorCallback.
-
-        Args:
-            *args: Passed through to `callbacks.BaseCallback`.
-            **kwargs: Passed through to `callbacks.BaseCallback`.
-        """
-        self.adversarial_trainer = adversarial_trainer
-        super().__init__(*args, **kwargs)
-
-    def _on_step(self) -> bool:
-        return True
-
-    def _on_rollout_end(self) -> None:
-        self.adversarial_trainer.model.train_disc()
-        for _ in range(self.adversarial_trainer.n_disc_updates_per_round):
-            with networks.training(self.adversarial_trainer.reward_train):
-                # switch to training mode (affects dropout, normalization)
-                self.adversarial_trainer.train_disc()
-
-
 class AdversarialTrainer(base.DemonstrationAlgorithm[types.Transitions]):
     """Base class for adversarial imitation learning algorithms like GAIL and AIRL."""
 
@@ -246,17 +222,16 @@ def __init__(
 
         self.venv_buffering = wrappers.BufferingWrapper(self.venv)
 
-        self.disc_trainer_callback = TrainDiscriminatorCallback(self)
         if debug_use_ground_truth:
             # Would use an identity reward fn here, but RewardFns can't see rewards.
             self.venv_wrapped = self.venv_buffering
-            self.gen_callback: List[callbacks.BaseCallback] = [self.disc_trainer_callback]
+            self.gen_callback = None
         else:
             self.venv_wrapped = reward_wrapper.RewardVecEnvWrapper(
                 self.venv_buffering,
                 reward_fn=self.reward_train.predict_processed,
             )
-            self.gen_callback = [self.venv_wrapped.make_log_callback(), self.disc_trainer_callback]
+            self.gen_callback = self.venv_wrapped.make_log_callback()
         self.venv_train = self.venv_wrapped
 
         self.gen_algo.set_env(self.venv_train)
@@ -471,6 +446,10 @@ def train(
         )
         for r in tqdm.tqdm(range(0, n_rounds), desc="round"):
             self.train_gen(self.gen_train_timesteps)
+            for _ in range(self.n_disc_updates_per_round):
+                with networks.training(self.reward_train):
+                    # switch to training mode (affects dropout, normalization)
+                    self.train_disc()
             if callback:
                 callback(r)
             self.logger.dump(self._global_step)

From f8251c70e98f0ccf29e10f1b1ac35ce08e25a580 Mon Sep 17 00:00:00 2001
From: taufeeque9 <9taufeeque9@gmail.com>
Date: Fri, 4 Aug 2023 23:14:49 +0530
Subject: [PATCH 37/47] Fix test

---
 src/imitation/scripts/parallel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py
index 9f5478a6e..bb90f6174 100644
--- a/src/imitation/scripts/parallel.py
+++ b/src/imitation/scripts/parallel.py
@@ -196,7 +196,7 @@ def _ray_tune_sacred_wrapper(
         `ex.run`) and `reporter`. The function returns the run result.
     """
 
-    def inner(config: dict[str, Any], reporter) -> Mapping[str, Any]:
+    def inner(config: Dict[str, Any], reporter) -> Mapping[str, Any]:
         """Trainable function with the correct signature for `ray.tune`.
 
         Args:

From 664fc37c0dfd118768186e83006fc06def21a48b Mon Sep 17 00:00:00 2001
From: taufeeque9 <9taufeeque9@gmail.com>
Date: Mon, 7 Aug 2023 22:58:00 +0530
Subject: [PATCH 38/47] Convert Dict to Mapping in input argument

---
 src/imitation/scripts/parallel.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py
index bb90f6174..38881ee2b 100644
--- a/src/imitation/scripts/parallel.py
+++ b/src/imitation/scripts/parallel.py
@@ -196,7 +196,7 @@ def _ray_tune_sacred_wrapper(
         `ex.run`) and `reporter`. The function returns the run result.
     """
 
-    def inner(config: Dict[str, Any], reporter) -> Mapping[str, Any]:
+    def inner(config: Mapping[str, Any], reporter) -> Mapping[str, Any]:
         """Trainable function with the correct signature for `ray.tune`.
 
         Args:
@@ -212,7 +212,7 @@ def inner(config: Dict[str, Any], reporter) -> Mapping[str, Any]:
         # TODO(shwang): Stop modifying CAPTURE_MODE once the issue is fixed.
         sacred.SETTINGS.CAPTURE_MODE = "sys"
 
-        run_kwargs = config
+        run_kwargs = dict(**config)
         updated_run_kwargs: Dict[str, Any] = {}
         # Import inside function rather than in module because Sacred experiments
         # are not picklable, and Ray requires this function to be picklable.

From 8690e1dcb01fc96fcfa1813c038f2b1ac26f4a3c Mon Sep 17 00:00:00 2001
From: Maximilian Ernestus <maximilian@ernestus.de>
Date: Wed, 30 Aug 2023 10:47:28 +0200
Subject: [PATCH 39/47] Ignore coverage in script configurations.

---
 setup.cfg | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/setup.cfg b/setup.cfg
index f39db322f..85dedb3e3 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -42,6 +42,8 @@ source = imitation
 include=
     src/*
     tests/*
+omit =
+    src/imitation/scripts/config/*
 
 [coverage:report]
 exclude_lines =

From dd9eb6a5b7e62b5cf1faf84d9111bac9bef77e9d Mon Sep 17 00:00:00 2001
From: Maximilian Ernestus <maximilian@ernestus.de>
Date: Wed, 30 Aug 2023 11:12:10 +0200
Subject: [PATCH 40/47] Pin huggingface_sb3 version.

---
 setup.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 1781a4031..6d1f2489c 100644
--- a/setup.py
+++ b/setup.py
@@ -207,7 +207,9 @@ def get_local_version(version: "ScmVersion", time_format="%Y%m%d") -> str:
         STABLE_BASELINES3,
         "sacred>=0.8.4",
         "tensorboard>=1.14",
-        "huggingface_sb3>=2.2.1",
+        # TODO: remove once https://github.com/huggingface/huggingface_sb3/issues/37 is
+        #  fixed
+        "huggingface_sb3==2.2.5",
         "optuna>=3.0.1",
         "datasets>=2.8.0",
     ],

From 40d87ef2e99dcb8a34041d27dd62327ec8faf8b4 Mon Sep 17 00:00:00 2001
From: Maximilian Ernestus <maximilian@ernestus.de>
Date: Tue, 26 Sep 2023 16:46:04 +0200
Subject: [PATCH 41/47] Update to the newest seals environment versions.

---
 benchmarking/airl_seals_ant_best_hp_eval.json            | 2 +-
 benchmarking/airl_seals_half_cheetah_best_hp_eval.json   | 2 +-
 benchmarking/airl_seals_hopper_best_hp_eval.json         | 2 +-
 benchmarking/airl_seals_swimmer_best_hp_eval.json        | 4 ++--
 benchmarking/airl_seals_walker_best_hp_eval.json         | 4 ++--
 benchmarking/bc_seals_ant_best_hp_eval.json              | 2 +-
 benchmarking/bc_seals_half_cheetah_best_hp_eval.json     | 2 +-
 benchmarking/bc_seals_hopper_best_hp_eval.json           | 2 +-
 benchmarking/bc_seals_swimmer_best_hp_eval.json          | 2 +-
 benchmarking/bc_seals_walker_best_hp_eval.json           | 2 +-
 benchmarking/dagger_seals_ant_best_hp_eval.json          | 2 +-
 benchmarking/dagger_seals_half_cheetah_best_hp_eval.json | 2 +-
 benchmarking/dagger_seals_hopper_best_hp_eval.json       | 2 +-
 benchmarking/dagger_seals_swimmer_best_hp_eval.json      | 2 +-
 benchmarking/dagger_seals_walker_best_hp_eval.json       | 2 +-
 benchmarking/gail_seals_ant_best_hp_eval.json            | 2 +-
 benchmarking/gail_seals_half_cheetah_best_hp_eval.json   | 2 +-
 benchmarking/gail_seals_hopper_best_hp_eval.json         | 2 +-
 benchmarking/gail_seals_swimmer_best_hp_eval.json        | 4 ++--
 benchmarking/gail_seals_walker_best_hp_eval.json         | 4 ++--
 20 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/benchmarking/airl_seals_ant_best_hp_eval.json b/benchmarking/airl_seals_ant_best_hp_eval.json
index 17f969ff0..d4131433e 100644
--- a/benchmarking/airl_seals_ant_best_hp_eval.json
+++ b/benchmarking/airl_seals_ant_best_hp_eval.json
@@ -62,6 +62,6 @@
     "n_episodes_eval": 50
   },
   "environment": {
-    "gym_id": "seals/Ant-v0"
+    "gym_id": "seals/Ant-v1"
   }
 }
diff --git a/benchmarking/airl_seals_half_cheetah_best_hp_eval.json b/benchmarking/airl_seals_half_cheetah_best_hp_eval.json
index 754ba6736..f69ba5cb5 100644
--- a/benchmarking/airl_seals_half_cheetah_best_hp_eval.json
+++ b/benchmarking/airl_seals_half_cheetah_best_hp_eval.json
@@ -62,6 +62,6 @@
     "n_episodes_eval": 50
   },
   "environment": {
-    "gym_id": "seals/HalfCheetah-v0"
+    "gym_id": "seals/HalfCheetah-v1"
   }
 }
diff --git a/benchmarking/airl_seals_hopper_best_hp_eval.json b/benchmarking/airl_seals_hopper_best_hp_eval.json
index 91080d7ce..58c2475f5 100644
--- a/benchmarking/airl_seals_hopper_best_hp_eval.json
+++ b/benchmarking/airl_seals_hopper_best_hp_eval.json
@@ -75,6 +75,6 @@
     "n_episodes_eval": 50
   },
   "environment": {
-    "gym_id": "seals/Hopper-v0"
+    "gym_id": "seals/Hopper-v1"
   }
 }
diff --git a/benchmarking/airl_seals_swimmer_best_hp_eval.json b/benchmarking/airl_seals_swimmer_best_hp_eval.json
index fcca8e6b3..8529c58b5 100644
--- a/benchmarking/airl_seals_swimmer_best_hp_eval.json
+++ b/benchmarking/airl_seals_swimmer_best_hp_eval.json
@@ -12,7 +12,7 @@
   },
   "expert": {
     "loader_kwargs": {
-      "gym_id": "seals/Swimmer-v0",
+      "gym_id": "seals/Swimmer-v1",
       "organization": "HumanCompatibleAI"
     }
   },
@@ -81,6 +81,6 @@
     "n_episodes_eval": 50
   },
   "environment": {
-    "gym_id": "seals/Swimmer-v0"
+    "gym_id": "seals/Swimmer-v1"
   }
 }
diff --git a/benchmarking/airl_seals_walker_best_hp_eval.json b/benchmarking/airl_seals_walker_best_hp_eval.json
index c63070751..edd99806d 100644
--- a/benchmarking/airl_seals_walker_best_hp_eval.json
+++ b/benchmarking/airl_seals_walker_best_hp_eval.json
@@ -12,7 +12,7 @@
   },
   "expert": {
     "loader_kwargs": {
-      "gym_id": "seals/Walker2d-v0",
+      "gym_id": "seals/Walker2d-v1",
       "organization": "HumanCompatibleAI"
     }
   },
@@ -81,6 +81,6 @@
     "n_episodes_eval": 50
   },
   "environment": {
-    "gym_id": "seals/Walker2d-v0"
+    "gym_id": "seals/Walker2d-v1"
   }
 }
diff --git a/benchmarking/bc_seals_ant_best_hp_eval.json b/benchmarking/bc_seals_ant_best_hp_eval.json
index 108a93ce7..e9baa8fc1 100644
--- a/benchmarking/bc_seals_ant_best_hp_eval.json
+++ b/benchmarking/bc_seals_ant_best_hp_eval.json
@@ -43,6 +43,6 @@
     "n_episodes_eval": 50
   },
   "environment": {
-    "gym_id": "seals/Ant-v0"
+    "gym_id": "seals/Ant-v1"
   }
 }
diff --git a/benchmarking/bc_seals_half_cheetah_best_hp_eval.json b/benchmarking/bc_seals_half_cheetah_best_hp_eval.json
index ecaff2eb0..041f159b0 100644
--- a/benchmarking/bc_seals_half_cheetah_best_hp_eval.json
+++ b/benchmarking/bc_seals_half_cheetah_best_hp_eval.json
@@ -43,6 +43,6 @@
     "n_episodes_eval": 50
   },
   "environment": {
-    "gym_id": "seals/HalfCheetah-v0"
+    "gym_id": "seals/HalfCheetah-v1"
   }
 }
diff --git a/benchmarking/bc_seals_hopper_best_hp_eval.json b/benchmarking/bc_seals_hopper_best_hp_eval.json
index e8c821841..9a7872d37 100644
--- a/benchmarking/bc_seals_hopper_best_hp_eval.json
+++ b/benchmarking/bc_seals_hopper_best_hp_eval.json
@@ -43,6 +43,6 @@
     "n_episodes_eval": 50
   },
   "environment": {
-    "gym_id": "seals/Hopper-v0"
+    "gym_id": "seals/Hopper-v1"
   }
 }
diff --git a/benchmarking/bc_seals_swimmer_best_hp_eval.json b/benchmarking/bc_seals_swimmer_best_hp_eval.json
index 30884c9c4..8a8f2456a 100644
--- a/benchmarking/bc_seals_swimmer_best_hp_eval.json
+++ b/benchmarking/bc_seals_swimmer_best_hp_eval.json
@@ -43,6 +43,6 @@
     "n_episodes_eval": 50
   },
   "environment": {
-    "gym_id": "seals/Swimmer-v0"
+    "gym_id": "seals/Swimmer-v1"
   }
 }
diff --git a/benchmarking/bc_seals_walker_best_hp_eval.json b/benchmarking/bc_seals_walker_best_hp_eval.json
index 0ca30120e..f33e6c5a2 100644
--- a/benchmarking/bc_seals_walker_best_hp_eval.json
+++ b/benchmarking/bc_seals_walker_best_hp_eval.json
@@ -43,6 +43,6 @@
     "n_episodes_eval": 50
   },
   "environment": {
-    "gym_id": "seals/Walker2d-v0"
+    "gym_id": "seals/Walker2d-v1"
   }
 }
diff --git a/benchmarking/dagger_seals_ant_best_hp_eval.json b/benchmarking/dagger_seals_ant_best_hp_eval.json
index de75b80f1..e02828667 100644
--- a/benchmarking/dagger_seals_ant_best_hp_eval.json
+++ b/benchmarking/dagger_seals_ant_best_hp_eval.json
@@ -47,6 +47,6 @@
     "n_episodes_eval": 50
   },
   "environment": {
-    "gym_id": "seals/Ant-v0"
+    "gym_id": "seals/Ant-v1"
   }
 }
diff --git a/benchmarking/dagger_seals_half_cheetah_best_hp_eval.json b/benchmarking/dagger_seals_half_cheetah_best_hp_eval.json
index 7f42bfdf9..d1c9e5923 100644
--- a/benchmarking/dagger_seals_half_cheetah_best_hp_eval.json
+++ b/benchmarking/dagger_seals_half_cheetah_best_hp_eval.json
@@ -47,6 +47,6 @@
     "n_episodes_eval": 50
   },
   "environment": {
-    "gym_id": "seals/HalfCheetah-v0"
+    "gym_id": "seals/HalfCheetah-v1"
   }
 }
diff --git a/benchmarking/dagger_seals_hopper_best_hp_eval.json b/benchmarking/dagger_seals_hopper_best_hp_eval.json
index 1cf29a1a4..b91f66298 100644
--- a/benchmarking/dagger_seals_hopper_best_hp_eval.json
+++ b/benchmarking/dagger_seals_hopper_best_hp_eval.json
@@ -47,6 +47,6 @@
     "n_episodes_eval": 50
   },
   "environment": {
-    "gym_id": "seals/Hopper-v0"
+    "gym_id": "seals/Hopper-v1"
   }
 }
diff --git a/benchmarking/dagger_seals_swimmer_best_hp_eval.json b/benchmarking/dagger_seals_swimmer_best_hp_eval.json
index c112db680..545761cbc 100644
--- a/benchmarking/dagger_seals_swimmer_best_hp_eval.json
+++ b/benchmarking/dagger_seals_swimmer_best_hp_eval.json
@@ -47,6 +47,6 @@
     "n_episodes_eval": 50
   },
   "environment": {
-    "gym_id": "seals/Swimmer-v0"
+    "gym_id": "seals/Swimmer-v1"
   }
 }
diff --git a/benchmarking/dagger_seals_walker_best_hp_eval.json b/benchmarking/dagger_seals_walker_best_hp_eval.json
index e59bef464..7b694c8d2 100644
--- a/benchmarking/dagger_seals_walker_best_hp_eval.json
+++ b/benchmarking/dagger_seals_walker_best_hp_eval.json
@@ -47,6 +47,6 @@
     "n_episodes_eval": 50
   },
   "environment": {
-    "gym_id": "seals/Walker2d-v0"
+    "gym_id": "seals/Walker2d-v1"
   }
 }
diff --git a/benchmarking/gail_seals_ant_best_hp_eval.json b/benchmarking/gail_seals_ant_best_hp_eval.json
index 81399b00c..3d43b34ba 100644
--- a/benchmarking/gail_seals_ant_best_hp_eval.json
+++ b/benchmarking/gail_seals_ant_best_hp_eval.json
@@ -62,6 +62,6 @@
     "n_episodes_eval": 50
   },
   "environment": {
-    "gym_id": "seals/Ant-v0"
+    "gym_id": "seals/Ant-v1"
   }
 }
diff --git a/benchmarking/gail_seals_half_cheetah_best_hp_eval.json b/benchmarking/gail_seals_half_cheetah_best_hp_eval.json
index 1d2f26648..914f3712a 100644
--- a/benchmarking/gail_seals_half_cheetah_best_hp_eval.json
+++ b/benchmarking/gail_seals_half_cheetah_best_hp_eval.json
@@ -62,6 +62,6 @@
     "n_episodes_eval": 50
   },
   "environment": {
-    "gym_id": "seals/HalfCheetah-v0"
+    "gym_id": "seals/HalfCheetah-v1"
   }
 }
diff --git a/benchmarking/gail_seals_hopper_best_hp_eval.json b/benchmarking/gail_seals_hopper_best_hp_eval.json
index 70787ff7e..cebdae71c 100644
--- a/benchmarking/gail_seals_hopper_best_hp_eval.json
+++ b/benchmarking/gail_seals_hopper_best_hp_eval.json
@@ -75,6 +75,6 @@
     "n_episodes_eval": 50
   },
   "environment": {
-    "gym_id": "seals/Hopper-v0"
+    "gym_id": "seals/Hopper-v1"
   }
 }
diff --git a/benchmarking/gail_seals_swimmer_best_hp_eval.json b/benchmarking/gail_seals_swimmer_best_hp_eval.json
index 650c5f46a..b0bd0e645 100644
--- a/benchmarking/gail_seals_swimmer_best_hp_eval.json
+++ b/benchmarking/gail_seals_swimmer_best_hp_eval.json
@@ -12,7 +12,7 @@
   },
   "expert": {
     "loader_kwargs": {
-      "gym_id": "seals/Swimmer-v0",
+      "gym_id": "seals/Swimmer-v1",
       "organization": "HumanCompatibleAI"
     }
   },
@@ -81,6 +81,6 @@
     "n_episodes_eval": 50
   },
   "environment": {
-    "gym_id": "seals/Swimmer-v0"
+    "gym_id": "seals/Swimmer-v1"
   }
 }
diff --git a/benchmarking/gail_seals_walker_best_hp_eval.json b/benchmarking/gail_seals_walker_best_hp_eval.json
index d85eb46d5..2626b4c43 100644
--- a/benchmarking/gail_seals_walker_best_hp_eval.json
+++ b/benchmarking/gail_seals_walker_best_hp_eval.json
@@ -12,7 +12,7 @@
   },
   "expert": {
     "loader_kwargs": {
-      "gym_id": "seals/Walker2d-v0",
+      "gym_id": "seals/Walker2d-v1",
       "organization": "HumanCompatibleAI"
     }
   },
@@ -81,6 +81,6 @@
     "n_episodes_eval": 50
   },
   "environment": {
-    "gym_id": "seals/Walker2d-v0"
+    "gym_id": "seals/Walker2d-v1"
   }
 }

From 71f6c9283a387d35ed94f832ca660711942052e3 Mon Sep 17 00:00:00 2001
From: Maximilian Ernestus <maximilian@ernestus.de>
Date: Wed, 27 Sep 2023 09:49:28 +0200
Subject: [PATCH 42/47] Push gymnasium dependency to 0.29 to ensure mujoco envs
 work.

---
 setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 7bc4051a9..0384014ee 100644
--- a/setup.py
+++ b/setup.py
@@ -187,7 +187,7 @@ def get_local_version(version: "ScmVersion", time_format="%Y%m%d") -> str:
     #   encode only known incompatibilities here. This prevents nasty dependency issues
     #   for our users.
     install_requires=[
-        "gymnasium[classic-control]~=0.28.1",
+        "gymnasium[classic-control]~=0.29",
         "matplotlib",
         "numpy>=1.15",
         "torch>=1.4.0",
@@ -220,7 +220,7 @@ def get_local_version(version: "ScmVersion", time_format="%Y%m%d") -> str:
         "docs": DOCS_REQUIRE,
         "parallel": PARALLEL_REQUIRE,
         "mujoco": [
-            "gymnasium[classic-control,mujoco]~=0.28.1",
+            "gymnasium[classic-control,mujoco]~=0.29",
         ],
         "atari": ATARI_REQUIRE,
     },

From 53c121264d44fd3455888c86eb087a51b7919f9d Mon Sep 17 00:00:00 2001
From: taufeeque9 <9taufeeque9@gmail.com>
Date: Wed, 9 Aug 2023 03:31:10 +0530
Subject: [PATCH 43/47] Update adversarial algorithm

---
 .../algorithms/adversarial/common.py          | 88 +++++++++++++++++--
 .../policies/replay_buffer_wrapper.py         | 44 +++++++++-
 2 files changed, 124 insertions(+), 8 deletions(-)

diff --git a/src/imitation/algorithms/adversarial/common.py b/src/imitation/algorithms/adversarial/common.py
index ece30b011..e7369719c 100644
--- a/src/imitation/algorithms/adversarial/common.py
+++ b/src/imitation/algorithms/adversarial/common.py
@@ -2,7 +2,7 @@
 import abc
 import dataclasses
 import logging
-from typing import Callable, Iterable, Iterator, Mapping, Optional, Type, overload
+from typing import Callable, Iterable, Iterator, List, Mapping, Optional, Type, overload
 
 import numpy as np
 import torch as th
@@ -10,7 +10,9 @@
 import tqdm
 from stable_baselines3.common import (
     base_class,
+    callbacks,
     distributions,
+    off_policy_algorithm,
     on_policy_algorithm,
     policies,
     vec_env,
@@ -20,6 +22,7 @@
 
 from imitation.algorithms import base
 from imitation.data import buffer, rollout, types, wrappers
+from imitation.policies import replay_buffer_wrapper
 from imitation.rewards import reward_nets, reward_wrapper
 from imitation.util import logger, networks, util
 
@@ -92,6 +95,47 @@ def compute_train_stats(
     }
 
 
+class TrainDiscriminatorCallback(callbacks.BaseCallback):
+    """Callback for training discriminator after collecting rollouts."""
+
+    def __init__(self, adversarial_trainer, *args, **kwargs):
+        """Builds TrainDiscriminatorCallback.
+
+        Args:
+            *args: Passed through to `callbacks.BaseCallback`.
+            **kwargs: Passed through to `callbacks.BaseCallback`.
+        """
+        self.adversarial_trainer = adversarial_trainer
+        self.gen_ctx_manager = None
+        super().__init__(*args, **kwargs)
+
+    def _on_step(self) -> bool:
+        return True
+
+    def _on_rollout_end(self) -> None:
+        gen_trajs, ep_lens = self.adversarial_trainer.venv_buffering.pop_trajectories()
+        self.adversarial_trainer._check_fixed_horizon(ep_lens)
+        gen_samples = rollout.flatten_trajectories_with_rew(gen_trajs)
+        self.adversarial_trainer._gen_replay_buffer.store(gen_samples)
+
+        for _ in range(self.adversarial_trainer.n_disc_updates_per_round):
+            with networks.training(self.adversarial_trainer.reward_train):
+                # switch to training mode (affects dropout, normalization)
+                self.adversarial_trainer.train_disc()
+
+        # update the rollouts with the reward of the latest discriminator
+        self.adversarial_trainer.update_rewards_of_rollouts()
+
+        # HACK: enter logger.accumulate_means("gen") by hand so that generator
+        # stats are accumulated without nesting discriminator and generator loggers.
+        self.gen_ctx_manager = self.adversarial_trainer.logger.accumulate_means("gen")
+        self.gen_ctx_manager.__enter__()
+
+    def _on_training_end(self) -> None:
+        assert self.gen_ctx_manager is not None
+        self.gen_ctx_manager.__exit__(None, None, None)
+
+
 class AdversarialTrainer(base.DemonstrationAlgorithm[types.Transitions]):
     """Base class for adversarial imitation learning algorithms like GAIL and AIRL."""
 
@@ -228,16 +272,22 @@ def __init__(
 
         self.venv_buffering = wrappers.BufferingWrapper(self.venv)
 
+        self.disc_trainer_callback = TrainDiscriminatorCallback(self)
         if debug_use_ground_truth:
             # Would use an identity reward fn here, but RewardFns can't see rewards.
             self.venv_wrapped = self.venv_buffering
-            self.gen_callback = None
+            self.gen_callback: List[callbacks.BaseCallback] = [
+                self.disc_trainer_callback
+            ]
         else:
             self.venv_wrapped = reward_wrapper.RewardVecEnvWrapper(
                 self.venv_buffering,
                 reward_fn=self.reward_train.predict_processed,
             )
-            self.gen_callback = self.venv_wrapped.make_log_callback()
+            self.gen_callback = [
+                self.venv_wrapped.make_log_callback(),
+                self.disc_trainer_callback,
+            ]
         self.venv_train = self.venv_wrapped
 
         self.gen_algo.set_env(self.venv_train)
@@ -314,6 +364,34 @@ def _next_expert_batch(self) -> Mapping:
         assert self._endless_expert_iterator is not None
         return next(self._endless_expert_iterator)
 
+    def update_rewards_of_rollouts(self) -> None:
+        """Updates the rewards of the rollouts using the latest discriminator."""
+        if isinstance(self.gen_algo, on_policy_algorithm.OnPolicyAlgorithm):
+            buffer = self.gen_algo.rollout_buffer
+            assert buffer is not None
+            reward_fn_inputs = replay_buffer_wrapper._rollout_buffer_to_reward_fn_input(
+                self.gen_algo.rollout_buffer
+            )
+            rewards = self._reward_net.predict(**reward_fn_inputs)
+            rewards = rewards.reshape(buffer.rewards.shape)
+            last_values = buffer.advantages[-1] - buffer.rewards[-1] + buffer.values[-1]
+            last_values = last_values / buffer.gamma
+            # We assume a genuine last_values entry is never exactly 0.0, so a value
+            # of exactly 0.0 is taken to mean that the episode has terminated.
+            last_dones = last_values == 0.0
+            self.gen_algo.rollout_buffer.rewards[:] = rewards
+            self.gen_algo.rollout_buffer.compute_returns_and_advantage(
+                th.tensor(last_values), last_dones
+            )
+        elif isinstance(self.gen_algo, off_policy_algorithm.OffPolicyAlgorithm):
+            buffer = self.gen_algo.replay_buffer
+            assert buffer is not None
+            reward_fn_inputs = replay_buffer_wrapper._replay_buffer_to_reward_fn_input(
+                buffer
+            )
+            rewards = self._reward_net.predict(**reward_fn_inputs)
+            buffer.rewards[:] = rewards.reshape(buffer.rewards.shape)
+
     def train_disc(
         self,
         *,
@@ -452,10 +530,6 @@ def train(
         )
         for r in tqdm.tqdm(range(0, n_rounds), desc="round"):
             self.train_gen(self.gen_train_timesteps)
-            for _ in range(self.n_disc_updates_per_round):
-                with networks.training(self.reward_train):
-                    # switch to training mode (affects dropout, normalization)
-                    self.train_disc()
             if callback:
                 callback(r)
             self.logger.dump(self._global_step)
diff --git a/src/imitation/policies/replay_buffer_wrapper.py b/src/imitation/policies/replay_buffer_wrapper.py
index 7177e2dc1..a8649f78f 100644
--- a/src/imitation/policies/replay_buffer_wrapper.py
+++ b/src/imitation/policies/replay_buffer_wrapper.py
@@ -4,7 +4,7 @@
 
 import numpy as np
 from gymnasium import spaces
-from stable_baselines3.common.buffers import ReplayBuffer
+from stable_baselines3.common.buffers import ReplayBuffer, RolloutBuffer
 from stable_baselines3.common.type_aliases import ReplayBufferSamples
 
 from imitation.rewards.reward_function import RewardFn
@@ -23,6 +23,48 @@ def _samples_to_reward_fn_input(
     )
 
 
+def _rollout_buffer_to_reward_fn_input(
+    buffer: RolloutBuffer,
+) -> Mapping[str, np.ndarray]:
+    """Convert the contents of a rollout buffer to reward function inputs."""
+    assert buffer.observations is not None
+    assert buffer.actions is not None
+    obs = buffer.observations
+    next_obs = obs[1:]
+    next_obs = np.concatenate([next_obs, obs[-1:]], axis=0)  # last obs not available
+    actions = buffer.actions
+    dones = buffer.episode_starts
+    dones = np.roll(dones, -1, axis=0)
+    dones[-1] = np.ones_like(dones[-1])  # last dones not available
+
+    return dict(
+        state=obs.reshape(-1, *obs.shape[2:]),
+        action=actions.reshape(-1, *actions.shape[2:]),
+        next_state=next_obs.reshape(-1, *next_obs.shape[2:]),
+        done=dones.reshape(-1),
+    )
+
+
+def _replay_buffer_to_reward_fn_input(
+    buffer: ReplayBuffer,
+) -> Mapping[str, np.ndarray]:
+    """Convert the contents of a replay buffer to reward function inputs."""
+    assert buffer.observations is not None
+    assert buffer.next_observations is not None
+    assert buffer.actions is not None
+    obs = buffer.observations
+    next_obs = buffer.next_observations
+    actions = buffer.actions
+    dones = buffer.dones
+
+    return dict(
+        state=obs.reshape(-1, *obs.shape[2:]),
+        action=actions.reshape(-1, *actions.shape[2:]),
+        next_state=next_obs.reshape(-1, *next_obs.shape[2:]),
+        done=dones.reshape(-1),
+    )
+
+
 class ReplayBufferRewardWrapper(ReplayBuffer):
     """Relabel the rewards in transitions sampled from a ReplayBuffer."""
 

From 47b38741beceddc9c68de8b787baf40b0e3efe13 Mon Sep 17 00:00:00 2001
From: taufeeque9 <9taufeeque9@gmail.com>
Date: Wed, 9 Aug 2023 05:10:46 +0530
Subject: [PATCH 44/47] Fix test errors

---
 .../algorithms/adversarial/common.py          | 22 ++++++++++++-------
 tests/algorithms/test_adversarial.py          |  5 +++--
 2 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/src/imitation/algorithms/adversarial/common.py b/src/imitation/algorithms/adversarial/common.py
index e7369719c..a95e550e3 100644
--- a/src/imitation/algorithms/adversarial/common.py
+++ b/src/imitation/algorithms/adversarial/common.py
@@ -102,6 +102,8 @@ def __init__(self, adversarial_trainer, *args, **kwargs):
         """Builds TrainDiscriminatorCallback.
 
         Args:
+            adversarial_trainer: The AdversarialTrainer instance in which
+                this callback will be called.
             *args: Passed through to `callbacks.BaseCallback`.
             **kwargs: Passed through to `callbacks.BaseCallback`.
         """
@@ -277,7 +279,7 @@ def __init__(
             # Would use an identity reward fn here, but RewardFns can't see rewards.
             self.venv_wrapped = self.venv_buffering
             self.gen_callback: List[callbacks.BaseCallback] = [
-                self.disc_trainer_callback
+                self.disc_trainer_callback,
             ]
         else:
             self.venv_wrapped = reward_wrapper.RewardVecEnvWrapper(
@@ -370,7 +372,7 @@ def update_rewards_of_rollouts(self) -> None:
             buffer = self.gen_algo.rollout_buffer
             assert buffer is not None
             reward_fn_inputs = replay_buffer_wrapper._rollout_buffer_to_reward_fn_input(
-                self.gen_algo.rollout_buffer
+                self.gen_algo.rollout_buffer,
             )
             rewards = self._reward_net.predict(**reward_fn_inputs)
             rewards = rewards.reshape(buffer.rewards.shape)
@@ -381,13 +383,14 @@ def update_rewards_of_rollouts(self) -> None:
             last_dones = last_values == 0.0
             self.gen_algo.rollout_buffer.rewards[:] = rewards
             self.gen_algo.rollout_buffer.compute_returns_and_advantage(
-                th.tensor(last_values), last_dones
+                th.tensor(last_values),
+                last_dones,
             )
         elif isinstance(self.gen_algo, off_policy_algorithm.OffPolicyAlgorithm):
             buffer = self.gen_algo.replay_buffer
             assert buffer is not None
             reward_fn_inputs = replay_buffer_wrapper._replay_buffer_to_reward_fn_input(
-                buffer
+                buffer,
             )
             rewards = self._reward_net.predict(**reward_fn_inputs)
             buffer.rewards[:] = rewards.reshape(buffer.rewards.shape)
@@ -466,13 +469,15 @@ def train_disc(
 
         return train_stats
 
-    def train_gen(
+    def train_gen_with_disc(
         self,
         total_timesteps: Optional[int] = None,
         learn_kwargs: Optional[Mapping] = None,
     ) -> None:
         """Trains the generator to maximize the discriminator loss.
 
+        The discriminator is also trained after the rollouts are collected and before
+        the generator is trained.
         After the end of training populates the generator replay buffer (used in
         discriminator training) with `self.disc_batch_size` transitions.
 
@@ -509,7 +514,7 @@ def train(
     ) -> None:
         """Alternates between training the generator and discriminator.
 
-        Every "round" consists of a call to `train_gen(self.gen_train_timesteps)`,
+        Every "round" consists of a call to `train_gen_with_disc(self.gen_train_timesteps)`,
         a call to `train_disc`, and finally a call to `callback(round)`.
 
         Training ends once an additional "round" would cause the number of transitions
@@ -529,7 +534,7 @@ def train(
             f"total_timesteps={total_timesteps})!"
         )
         for r in tqdm.tqdm(range(0, n_rounds), desc="round"):
-            self.train_gen(self.gen_train_timesteps)
+            self.train_gen_with_disc(self.gen_train_timesteps)
             if callback:
                 callback(r)
             self.logger.dump(self._global_step)
@@ -621,7 +626,8 @@ def _make_disc_train_batches(
         if gen_samples is None:
             if self._gen_replay_buffer.size() == 0:
                 raise RuntimeError(
-                    "No generator samples for training. " "Call `train_gen()` first.",
+                    "No generator samples for training. "
+                    "Call `train_gen_with_disc()` first.",
                 )
             gen_samples_dataclass = self._gen_replay_buffer.sample(batch_size)
             gen_samples = types.dataclass_quick_asdict(gen_samples_dataclass)
diff --git a/tests/algorithms/test_adversarial.py b/tests/algorithms/test_adversarial.py
index d3609efaa..769b2d52f 100644
--- a/tests/algorithms/test_adversarial.py
+++ b/tests/algorithms/test_adversarial.py
@@ -231,8 +231,9 @@ def test_train_gen_train_disc_no_crash(
     trainer_parametrized: common.AdversarialTrainer,
     n_updates: int = 2,
 ) -> None:
-    trainer_parametrized.train_gen(n_updates * trainer_parametrized.gen_train_timesteps)
-    trainer_parametrized.train_disc()
+    trainer_parametrized.train_gen_with_disc(
+        n_updates * trainer_parametrized.gen_train_timesteps
+    )
 
 
 @pytest.fixture

From 9fa8969dcab32262934ae5f44d6ad44920354b2b Mon Sep 17 00:00:00 2001
From: taufeeque9 <9taufeeque9@gmail.com>
Date: Wed, 9 Aug 2023 05:20:59 +0530
Subject: [PATCH 45/47] Fix test errors

---
 src/imitation/algorithms/adversarial/common.py | 12 +++++++++---
 tests/algorithms/test_adversarial.py           |  2 +-
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/imitation/algorithms/adversarial/common.py b/src/imitation/algorithms/adversarial/common.py
index a95e550e3..c876c6f69 100644
--- a/src/imitation/algorithms/adversarial/common.py
+++ b/src/imitation/algorithms/adversarial/common.py
@@ -115,6 +115,8 @@ def _on_step(self) -> bool:
         return True
 
     def _on_rollout_end(self) -> None:
+        if self.gen_ctx_manager is not None:
+            self.exit_gen_ctx_manager()
         gen_trajs, ep_lens = self.adversarial_trainer.venv_buffering.pop_trajectories()
         self.adversarial_trainer._check_fixed_horizon(ep_lens)
         gen_samples = rollout.flatten_trajectories_with_rew(gen_trajs)
@@ -133,9 +135,13 @@ def _on_rollout_end(self) -> None:
         self.gen_ctx_manager = self.adversarial_trainer.logger.accumulate_means("gen")
         self.gen_ctx_manager.__enter__()
 
-    def _on_training_end(self) -> None:
+    def exit_gen_ctx_manager(self) -> None:
         assert self.gen_ctx_manager is not None
         self.gen_ctx_manager.__exit__(None, None, None)
+        self.gen_ctx_manager = None
+
+    def _on_training_end(self) -> None:
+        self.exit_gen_ctx_manager()
 
 
 class AdversarialTrainer(base.DemonstrationAlgorithm[types.Transitions]):
@@ -514,8 +520,8 @@ def train(
     ) -> None:
         """Alternates between training the generator and discriminator.
 
-        Every "round" consists of a call to `train_gen_with_disc(self.gen_train_timesteps)`,
-        a call to `train_disc`, and finally a call to `callback(round)`.
+        Every "round" consists of a call to
+        `train_gen_with_disc(self.gen_train_timesteps)` and a call to `callback(round)`.
 
         Training ends once an additional "round" would cause the number of transitions
         sampled from the environment to exceed `total_timesteps`.
diff --git a/tests/algorithms/test_adversarial.py b/tests/algorithms/test_adversarial.py
index 769b2d52f..3a53e35ca 100644
--- a/tests/algorithms/test_adversarial.py
+++ b/tests/algorithms/test_adversarial.py
@@ -232,7 +232,7 @@ def test_train_gen_train_disc_no_crash(
     n_updates: int = 2,
 ) -> None:
     trainer_parametrized.train_gen_with_disc(
-        n_updates * trainer_parametrized.gen_train_timesteps
+        n_updates * trainer_parametrized.gen_train_timesteps,
     )
 
 

From 3edf518608bbc39b2301a7ab3deca2c8fdbea81b Mon Sep 17 00:00:00 2001
From: Maximilian Ernestus <maximilian@ernestus.de>
Date: Tue, 26 Sep 2023 16:03:10 +0200
Subject: [PATCH 46/47] Don't enter the generator logging ctx twice.

---
 src/imitation/algorithms/adversarial/common.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/src/imitation/algorithms/adversarial/common.py b/src/imitation/algorithms/adversarial/common.py
index c876c6f69..87bd43d4a 100644
--- a/src/imitation/algorithms/adversarial/common.py
+++ b/src/imitation/algorithms/adversarial/common.py
@@ -499,14 +499,13 @@ def train_gen_with_disc(
         if learn_kwargs is None:
             learn_kwargs = {}
 
-        with self.logger.accumulate_means("gen"):
-            self.gen_algo.learn(
-                total_timesteps=total_timesteps,
-                reset_num_timesteps=False,
-                callback=self.gen_callback,
-                **learn_kwargs,
-            )
-            self._global_step += 1
+        self.gen_algo.learn(
+            total_timesteps=total_timesteps,
+            reset_num_timesteps=False,
+            callback=self.gen_callback,
+            **learn_kwargs,
+        )
+        self._global_step += 1
 
         gen_trajs, ep_lens = self.venv_buffering.pop_trajectories()
         self._check_fixed_horizon(ep_lens)

From ce8c87ddace0017801e6a5e8fcfcd2ca0dc24cf7 Mon Sep 17 00:00:00 2001
From: Mohammad Taufeeque <9taufeeque9@gmail.com>
Date: Wed, 27 Sep 2023 05:35:23 +0530
Subject: [PATCH 47/47] Update common.py to fix test errors

---
 src/imitation/algorithms/adversarial/common.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/imitation/algorithms/adversarial/common.py b/src/imitation/algorithms/adversarial/common.py
index 87bd43d4a..c9e880c07 100644
--- a/src/imitation/algorithms/adversarial/common.py
+++ b/src/imitation/algorithms/adversarial/common.py
@@ -507,11 +507,6 @@ def train_gen_with_disc(
         )
         self._global_step += 1
 
-        gen_trajs, ep_lens = self.venv_buffering.pop_trajectories()
-        self._check_fixed_horizon(ep_lens)
-        gen_samples = rollout.flatten_trajectories_with_rew(gen_trajs)
-        self._gen_replay_buffer.store(gen_samples)
-
     def train(
         self,
         total_timesteps: int,