From 24598170017a577c4e2794613bb6ce35adcb0299 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Mon, 4 Dec 2023 11:50:59 +0100 Subject: [PATCH 01/36] Fix and expand hyperparameter search space for PC. --- src/imitation/scripts/config/tuning.py | 46 ++++++++++++++------------ 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/src/imitation/scripts/config/tuning.py b/src/imitation/scripts/config/tuning.py index 73313770a..daed7c1a0 100644 --- a/src/imitation/scripts/config/tuning.py +++ b/src/imitation/scripts/config/tuning.py @@ -188,38 +188,42 @@ def pc(): parallel_run_config = dict( sacred_ex_name="train_preference_comparisons", run_name="pc_tuning", - base_named_configs=["logging.wandb_logging"], + base_named_configs=[], base_config_updates={ "environment": {"num_vec": 1}, - "demonstrations": {"source": "huggingface"}, "total_timesteps": 2e7, - "total_comparisons": 5000, - "query_schedule": "hyperbolic", - "gatherer_kwargs": {"sample": True}, + "total_comparisons": 1000, + "active_selection": True, }, search_space={ - "named_configs": [ - ["reward.normalize_output_disable"], - ], + "named_configs": ["reward.reward_ensemble"], "config_updates": { - "train": { - "policy_kwargs": { - "activation_fn": tune.choice( - [ - nn.ReLU, - ], - ), - }, + "active_selection_oversampling": tune.randint(1, 11), + "comparison_queue_size": tune.randint(1, 1001), # upper bound determined by total_comparisons=1000 + "exploration_frac": tune.uniform(0.0, 0.5), + "fragment_length": tune.randint(1, 1001), # trajectories are 1000 steps long + "gatherer_kwargs": { + "temperature": tune.uniform(0.0, 2.0), + "discount_factor": tune.uniform(0.95, 1.0), + "sample": tune.choice([True, False]), }, - "num_iterations": tune.choice([25, 50]), - "initial_comparison_frac": tune.choice([0.1, 0.25]), + "initial_comparison_frac": tune.uniform(0.01, 1.0), + "num_iterations": tune.randint(1, 51), + "preference_model_kwargs": { + "noise_prob": tune.uniform(0.0, 0.1), + "discount_factor": tune.uniform(0.95, 1.0), + }, + "query_schedule": tune.choice(["hyperbolic", "constant", "inverse_quadratic"]), + "trajectory_generator_kwargs": { + "switch_prob": tune.uniform(0.1, 1), + "random_prob": tune.uniform(0.1, 0.9), + }, + "transition_oversampling": tune.uniform(0.9, 2.0), "reward_trainer_kwargs": { - "epochs": tune.choice([1, 3, 6]), + "epochs": tune.randint(1, 11), }, "rl": { - "batch_size": tune.choice([512, 2048, 8192]), "rl_kwargs": { - "learning_rate": tune.loguniform(1e-5, 1e-2), "ent_coef": tune.loguniform(1e-7, 1e-3), }, }, From c9ccf5bc47523e3ccc24f499dc2339afb1dcaae5 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Sat, 6 Jan 2024 13:50:06 +0100 Subject: [PATCH 02/36] Upgrade environment versions in the train_preference_comparisons config. 
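The named configs below only bump the `gym_id` strings from the `-v0` to the `-v1` seals releases. A quick, illustrative way to check that one of the new IDs resolves (this sketch is not part of the patch; it assumes seals >= 0.2 is installed and that importing `seals` registers the `seals/*-v1` environments with gymnasium):

```python
import gymnasium as gym
import seals  # noqa: F401  # importing seals is assumed to register the seals/* envs

env = gym.make("seals/Ant-v1")
print(env.observation_space, env.action_space)
env.close()
```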
--- .../scripts/config/train_preference_comparisons.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/imitation/scripts/config/train_preference_comparisons.py b/src/imitation/scripts/config/train_preference_comparisons.py index 4d8531732..d5c6711d3 100644 --- a/src/imitation/scripts/config/train_preference_comparisons.py +++ b/src/imitation/scripts/config/train_preference_comparisons.py @@ -77,7 +77,7 @@ def cartpole(): @train_preference_comparisons_ex.named_config def seals_ant(): - environment = dict(gym_id="seals/Ant-v0") + environment = dict(gym_id="seals/Ant-v1") rl = dict( batch_size=2048, rl_kwargs=dict( @@ -104,7 +104,7 @@ def half_cheetah(): @train_preference_comparisons_ex.named_config def seals_half_cheetah(): - environment = dict(gym_id="seals/HalfCheetah-v0") + environment = dict(gym_id="seals/HalfCheetah-v1") rl = dict( batch_size=512, rl_kwargs=dict( @@ -125,7 +125,7 @@ def seals_half_cheetah(): @train_preference_comparisons_ex.named_config def seals_hopper(): - environment = dict(gym_id="seals/Hopper-v0") + environment = dict(gym_id="seals/Hopper-v1") policy = dict( policy_cls="MlpPolicy", policy_kwargs=dict( @@ -151,7 +151,7 @@ def seals_hopper(): @train_preference_comparisons_ex.named_config def seals_swimmer(): - environment = dict(gym_id="seals/Swimmer-v0") + environment = dict(gym_id="seals/Swimmer-v1") policy = dict( policy_cls="MlpPolicy", policy_kwargs=dict( @@ -178,7 +178,7 @@ def seals_swimmer(): @train_preference_comparisons_ex.named_config def seals_walker(): - environment = dict(gym_id="seals/Walker2d-v0") + environment = dict(gym_id="seals/Walker2d-v1") policy = dict( policy_cls="MlpPolicy", policy_kwargs=dict( @@ -206,7 +206,7 @@ def seals_walker(): @train_preference_comparisons_ex.named_config def seals_humanoid(): locals().update(**MUJOCO_SHARED_LOCALS) - environment = dict(gym_id="seals/Humanoid-v0") + environment = dict(gym_id="seals/Humanoid-v1") total_timesteps = int(4e6) From d7a7da8cafcd0e8fe6d27ec630386632f71faa6a Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Wed, 10 Jan 2024 16:04:27 +0100 Subject: [PATCH 03/36] Upgrade to ray 2.9.0. --- setup.py | 2 +- src/imitation/scripts/parallel.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 1c2c85af6..1a76e49fb 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ IS_NOT_WINDOWS = os.name != "nt" -PARALLEL_REQUIRE = ["ray[debug,tune]~=2.0.0"] +PARALLEL_REQUIRE = ["ray[debug,tune]~=2.9.0"] ATARI_REQUIRE = [ "seals[atari]~=0.2.1", ] diff --git a/src/imitation/scripts/parallel.py b/src/imitation/scripts/parallel.py index d5e5e2378..76a068224 100644 --- a/src/imitation/scripts/parallel.py +++ b/src/imitation/scripts/parallel.py @@ -188,13 +188,12 @@ def _ray_tune_sacred_wrapper( `ex.run`) and `reporter`. The function returns the run result. """ - def inner(config: Mapping[str, Any], reporter) -> Mapping[str, Any]: + def inner(config: Mapping[str, Any]) -> Mapping[str, Any]: """Trainable function with the correct signature for `ray.tune`. Args: config: Keyword arguments for `ex.run()`, where `ex` is the `sacred.Experiment` instance associated with `sacred_ex_name`. - reporter: Callback to report progress to Ray. Returns: Result from `ray.Run` object. From 55aa6eb74cd58c3c809b272f2950f0bc7ed91c2f Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Mon, 8 Jan 2024 16:32:34 +0100 Subject: [PATCH 04/36] Ensure that PC does at least one comparison per iteration. 
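With aggressive settings from the new search space (for example `initial_comparison_frac` close to 1 combined with many iterations), the rounded per-iteration shares can come out as zero, so an iteration would request no comparisons at all and the gatherer has nothing to work with. A standalone sketch of the failure mode and the guard, using plain rounding in place of `util.oric` (illustration only, not the library code):

```python
import numpy as np

total_comparisons, initial_comparison_frac, num_iterations = 1000, 0.98, 50
initial_comparisons = int(total_comparisons * initial_comparison_frac)
remaining = total_comparisons - initial_comparisons      # only 20 comparisons left
probs = np.full(num_iterations, 1.0 / num_iterations)    # constant query schedule
shares = np.round(probs * remaining).astype(int)         # every share rounds to 0
shares[shares <= 0] = 1                                  # the guard: at least one query per iteration
print(f"Query schedule: {[initial_comparisons] + shares.tolist()}")
```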
--- src/imitation/algorithms/preference_comparisons.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/imitation/algorithms/preference_comparisons.py b/src/imitation/algorithms/preference_comparisons.py index 14a8fad5b..1b0a2b01b 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -1678,6 +1678,8 @@ def train( unnormalized_probs = vec_schedule(np.linspace(0, 1, self.num_iterations)) probs = unnormalized_probs / np.sum(unnormalized_probs) shares = util.oric(probs * total_comparisons) + shares[shares <= 0] = 1 # ensure we at least request one comparison per iteration + schedule = [initial_comparisons] + shares.tolist() print(f"Query schedule: {schedule}") From 78553c9f7351de811cacc264b914c5ff82bf5b77 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Thu, 11 Jan 2024 11:10:46 +0100 Subject: [PATCH 05/36] Add initial epoch multiplier as a parameter to the PC script. --- src/imitation/scripts/config/train_preference_comparisons.py | 2 ++ src/imitation/scripts/train_preference_comparisons.py | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/src/imitation/scripts/config/train_preference_comparisons.py b/src/imitation/scripts/config/train_preference_comparisons.py index d5c6711d3..b053d3f38 100644 --- a/src/imitation/scripts/config/train_preference_comparisons.py +++ b/src/imitation/scripts/config/train_preference_comparisons.py @@ -42,6 +42,8 @@ def train_defaults(): transition_oversampling = 1 # fraction of total_comparisons that will be sampled right at the beginning initial_comparison_frac = 0.1 + # factor by which to oversample the number of epochs in the first iteration + initial_epoch_multiplier = 200.0 # fraction of sampled trajectories that will include some random actions exploration_frac = 0.0 preference_model_kwargs = {} diff --git a/src/imitation/scripts/train_preference_comparisons.py b/src/imitation/scripts/train_preference_comparisons.py index 71363daee..428c98381 100644 --- a/src/imitation/scripts/train_preference_comparisons.py +++ b/src/imitation/scripts/train_preference_comparisons.py @@ -68,6 +68,7 @@ def train_preference_comparisons( fragment_length: int, transition_oversampling: float, initial_comparison_frac: float, + initial_epoch_multiplier: float, exploration_frac: float, trajectory_path: Optional[str], trajectory_generator_kwargs: Mapping[str, Any], @@ -106,6 +107,9 @@ def train_preference_comparisons( sampled before the rest of training begins (using the randomly initialized agent). This can be used to pretrain the reward model before the agent is trained on the learned reward. + initial_epoch_multiplier: before agent training begins, train the reward + model for this many more epochs than usual (on fragments sampled from a + random agent). exploration_frac: fraction of trajectory samples that will be created using partially random actions, rather than the current policy. 
Might be helpful if the learned policy explores too little and gets stuck with a wrong @@ -258,6 +262,7 @@ def train_preference_comparisons( fragment_length=fragment_length, transition_oversampling=transition_oversampling, initial_comparison_frac=initial_comparison_frac, + initial_epoch_multiplier=initial_epoch_multiplier, custom_logger=custom_logger, allow_variable_horizon=allow_variable_horizon, query_schedule=query_schedule, From 1145c07c7a476bb9a3187140e97b5197e00da9a0 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Mon, 18 Dec 2023 17:57:43 +0100 Subject: [PATCH 06/36] Add tuning folder and move section about hp-tuning from the benchmarking folder to the tuning folder. --- benchmarking/README.md | 24 +----------------------- tuning/README.md | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+), 23 deletions(-) create mode 100644 tuning/README.md diff --git a/benchmarking/README.md b/benchmarking/README.md index 5566a684c..4eac8c904 100644 --- a/benchmarking/README.md +++ b/benchmarking/README.md @@ -185,26 +185,4 @@ where: - `algo` is the algorithm you want to compare against If `your_runs_dir` contains runs for more than one algorithm, you will have to -disambiguate using the `--algo` option. - -## Tuning Hyperparameters - -The hyperparameters of any algorithm in imitation can be tuned using `src/imitation/scripts/tuning.py`. -The benchmarking hyperparameter configs were generated by tuning the hyperparameters using -the search space defined in the `scripts/config/tuning.py`. - -The tuning script proceeds in two phases: -1. Tune the hyperparameters using the search space provided. -2. Re-evaluate the best hyperparameter config found in the first phase based on the maximum mean return on a separate set of seeds. Report the mean and standard deviation of these trials. - -To use it with the default search space: -```bash -python -m imitation.scripts.tuning with 'parallel_run_config.base_named_configs=[""]' -``` - -In this command: -- `` provides the default search space and settings for the specific algorithm, which is defined in the `scripts/config/tuning.py` -- `` sets the environment to tune the algorithm in. They are defined in the algo-specifc `scripts/config/train_[adversarial|imitation|preference_comparisons|rl].py` files. For the already tuned environments, use the `_` named configs here. - -See the documentation of `scripts/tuning.py` and `scripts/parallel.py` for many other arguments that can be -provided through the command line to change the tuning behavior. +disambiguate using the `--algo` option. \ No newline at end of file diff --git a/tuning/README.md b/tuning/README.md new file mode 100644 index 000000000..068efcc6f --- /dev/null +++ b/tuning/README.md @@ -0,0 +1,23 @@ +# Tuning Hyperparameters + +The hyperparameters of any algorithm in imitation can be tuned using `src/imitation/scripts/tuning.py`. +The benchmarking hyperparameter configs were generated by tuning the hyperparameters using +the search space defined in the `scripts/config/tuning.py`. + +The tuning script proceeds in two phases: +1. Tune the hyperparameters using the search space provided. +2. Re-evaluate the best hyperparameter config found in the first phase + based on the maximum mean return on a separate set of seeds. + Report the mean and standard deviation of these trials. 
+ +To use it with the default search space: +```bash +python -m imitation.scripts.tuning with 'parallel_run_config.base_named_configs=[""]' +``` + +In this command: +- `` provides the default search space and settings for the specific algorithm, which is defined in the `scripts/config/tuning.py` +- `` sets the environment to tune the algorithm in. They are defined in the algo-specifc `scripts/config/train_[adversarial|imitation|preference_comparisons|rl].py` files. For the already tuned environments, use the `_` named configs here. + +See the documentation of `scripts/tuning.py` and `scripts/parallel.py` for many other arguments that can be +provided through the command line to change the tuning behavior. From 6ecfc34621c431deab1eede418e34d89f3265dff Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Wed, 10 Jan 2024 17:53:42 +0100 Subject: [PATCH 07/36] Add pure optuna tuning script. --- tuning/tune.py | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 tuning/tune.py diff --git a/tuning/tune.py b/tuning/tune.py new file mode 100644 index 000000000..be528a17a --- /dev/null +++ b/tuning/tune.py @@ -0,0 +1,90 @@ +import argparse + +import optuna +import optuna.distributions as dist +import sacred + +import imitation.scripts.train_preference_comparisons + + +def suggest_pc_run_params(trial: optuna.Trial) -> dict: + return dict( + sacred_ex=imitation.scripts.train_preference_comparisons.train_preference_comparisons_ex, + named_configs=["reward.reward_ensemble"], + config_updates={ + "seed": trial.number, + "environment": {"num_vec": 1}, + "total_timesteps": 2e7, + "total_comparisons": 1000, + "active_selection": True, + "active_selection_oversampling": trial.suggest_int("active_selection_oversampling", 1, 11), + "comparison_queue_size": trial.suggest_int("comparison_queue_size", 1, 1001), # upper bound determined by total_comparisons=1000 + "exploration_frac": trial.suggest_float("exploration_frac", 0.0, 0.5), + "fragment_length": trial.suggest_int("fragment_length", 1, 1001), # trajectories are 1000 steps long + "gatherer_kwargs": { + "temperature": trial.suggest_float("gatherer_temperature", 0.0, 2.0), + "discount_factor": trial.suggest_float("gatherer_discount_factor", 0.95, 1.0), + "sample": trial.suggest_categorical("gatherer_sample", [True, False]), + }, + "initial_comparison_frac": trial.suggest_float("initial_comparison_frac", 0.01, 1.0), + "num_iterations": trial.suggest_int("num_iterations", 1, 51), + "preference_model_kwargs": { + "noise_prob": trial.suggest_float("preference_model_noise_prob", 0.0, 0.1), + "discount_factor": trial.suggest_float("preference_model_discount_factor", 0.95, 1.0), + }, + "query_schedule": trial.suggest_categorical("query_schedule", ["hyperbolic", "constant", "inverse_quadratic"]), + "trajectory_generator_kwargs": { + "switch_prob": trial.suggest_float("tr_gen_switch_prob", 0.1, 1), + "random_prob": trial.suggest_float("tr_gen_random_prob", 0.1, 0.9), + }, + "transition_oversampling": trial.suggest_float("transition_oversampling", 0.9, 2.0), + "reward_trainer_kwargs": { + "epochs": trial.suggest_int("reward_trainer_epochs", 1, 11), + }, + "rl": { + "rl_kwargs": { + "ent_coef": trial.suggest_float("rl_ent_coef", 1e-7, 1e-3, log=True), + }, + }, + }, + ) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--algo", type=str, default="pc") + + args = parser.parse_args() + + if args.algo != "pc": + raise NotImplementedError(f"Tuning algorithm '{args.algo}' not 
implemented.") + + study: optuna.Study = optuna.create_study( + study_name=f"tuning_{args.algo}" + ) + + def objective(trial: optuna.Trial) -> float: + run_params = suggest_pc_run_params(trial) + trial.set_user_attr("config_updates", run_params["config_updates"]) + trial.set_user_attr("named_configs", run_params["named_configs"]) + experiment: sacred.Experiment = run_params["sacred_ex"] + result = experiment.run( + config_updates=run_params["config_updates"], + named_configs=run_params["named_configs"], + options={"--name": study.study_name, "--file_storage": "sacred"}, + ) + if result.status != "COMPLETED": + raise RuntimeError( + f"Trial failed with {result.fail_trace()} and status {result.status}." + ) + return result.result['imit_stats']['monitor_return_mean'] + + study.optimize( + objective, + callbacks=[optuna.study.MaxTrialsCallback(100)] + ) + + +if __name__ == '__main__': + main() + From 8828a352c0b519bada7d7e9327c11b55611da6d8 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Fri, 12 Jan 2024 20:31:11 +0100 Subject: [PATCH 08/36] Use functor to execute sacred experiments as optuna trials. --- tuning/tune.py | 156 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 124 insertions(+), 32 deletions(-) diff --git a/tuning/tune.py b/tuning/tune.py index be528a17a..42c66ce59 100644 --- a/tuning/tune.py +++ b/tuning/tune.py @@ -1,17 +1,69 @@ import argparse +import dataclasses +from typing import List, Mapping, Any, Callable, Dict import optuna -import optuna.distributions as dist import sacred import imitation.scripts.train_preference_comparisons -def suggest_pc_run_params(trial: optuna.Trial) -> dict: - return dict( +@dataclasses.dataclass +class RunSacredAsTrial: + """Runs a sacred experiment as an optuna trial. + + Assumes that the sacred experiment returns a dict with a key 'imit_stats' that + contains a dict with a key 'monitor_return_mean'. + """ + + """The sacred experiment to run.""" + sacred_ex: sacred.Experiment + + """A function that returns a list of named configs to pass to sacred.run.""" + suggest_named_configs: Callable[[optuna.Trial], List[str]] + + """A function that returns a dict of config updates to pass to sacred.run.""" + suggest_config_updates: Callable[[optuna.Trial], Mapping[str, Any]] + + def __call__( + self, + trial: optuna.Trial, + run_options: Dict, + extra_named_configs: List[str] + ) -> float: + """Run the sacred experiment and return the performance. + + Args: + trial: The optuna trial to sample hyperparameters for. + run_options: Options to pass to sacred.run(options=). + extra_named_configs: Additional named configs to pass to sacred.run. + """ + + config_updates = self.suggest_config_updates(trial) + named_configs = self.suggest_named_configs(trial) + extra_named_configs + + trial.set_user_attr("config_updates", config_updates) + trial.set_user_attr("named_configs", named_configs) + + experiment: sacred.Experiment = self.sacred_ex + result = experiment.run( + config_updates=config_updates, + named_configs=named_configs, + options=run_options, + ) + if result.status != "COMPLETED": + raise RuntimeError( + f"Trial failed with {result.fail_trace()} and status {result.status}." 
+ ) + return result.result['imit_stats']['monitor_return_mean'] + + +"""A mapping from algorithm names to functions that run the algorithm as an optuna trial.""" +objectives_by_algo = dict( + pc=RunSacredAsTrial( sacred_ex=imitation.scripts.train_preference_comparisons.train_preference_comparisons_ex, - named_configs=["reward.reward_ensemble"], - config_updates={ + suggest_named_configs=lambda _: ["reward.reward_ensemble"], + suggest_config_updates=lambda trial: { "seed": trial.number, "environment": {"num_vec": 1}, "total_timesteps": 2e7, @@ -47,44 +99,84 @@ def suggest_pc_run_params(trial: optuna.Trial) -> dict: }, }, }, - ) + ), +) -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--algo", type=str, default="pc") +def make_parser() -> argparse.ArgumentParser: + example_usage = "python -m imitation.scripts.tune pc seals_swimmer" + possible_named_configs = "\n".join( + f" - {algo}: {', '.join(objective.sacred_ex.named_configs.keys())}" + for algo, objective in objectives_by_algo.items() + ) - args = parser.parse_args() + parser = argparse.ArgumentParser( + description="Tune hyperparameters for imitation learning algorithms.", + epilog=f"Example usage:\n{example_usage}\n\nPossible named configs:\n{possible_named_configs}", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "algo", + type=str, + default="pc", + choices=objectives_by_algo.keys(), + help="What algorithm to tune.", + ) + parser.add_argument( + "named_configs", + type=str, + nargs="+", + default=[], + help="Additional named configs to pass to the sacred experiment. " + "Use this to select the environment to tune on.", + ) + parser.add_argument( + "--num_trials", + type=int, + default=100, + help="Number of trials to run." + ) + parser.add_argument( + "-j", + "--journal-log", + type=str, + default=None, + help="A journal file to synchronize multiple instances of this script. " + "Works on NFS storage." + ) + return parser - if args.algo != "pc": - raise NotImplementedError(f"Tuning algorithm '{args.algo}' not implemented.") - study: optuna.Study = optuna.create_study( - study_name=f"tuning_{args.algo}" +def make_study(args: argparse.Namespace) -> optuna.Study: + if args.journal_log is not None: + storage = optuna.storages.JournalStorage( + optuna.storages.JournalFileStorage(args.journal_log) + ) + else: + storage = None + + return optuna.create_study( + study_name=f"tuning_{args.algo}_with_{'_'.join(args.named_configs)}", + storage=storage, + load_if_exists=True, + direction="maximize", ) - def objective(trial: optuna.Trial) -> float: - run_params = suggest_pc_run_params(trial) - trial.set_user_attr("config_updates", run_params["config_updates"]) - trial.set_user_attr("named_configs", run_params["named_configs"]) - experiment: sacred.Experiment = run_params["sacred_ex"] - result = experiment.run( - config_updates=run_params["config_updates"], - named_configs=run_params["named_configs"], - options={"--name": study.study_name, "--file_storage": "sacred"}, - ) - if result.status != "COMPLETED": - raise RuntimeError( - f"Trial failed with {result.fail_trace()} and status {result.status}." 
- ) - return result.result['imit_stats']['monitor_return_mean'] + +def main(): + parser = make_parser() + args = parser.parse_args() + study = make_study(args) study.optimize( - objective, - callbacks=[optuna.study.MaxTrialsCallback(100)] + lambda trial: objectives_by_algo[args.algo]( + trial, + run_options={"--name": study.study_name, "--file_storage": "sacred"}, + extra_named_configs=args.named_configs + ), + callbacks=[optuna.study.MaxTrialsCallback(args.num_trials)] ) if __name__ == '__main__': main() - From 8e67dd4bb9ab204d7722f0edddc98b3897f75541 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Fri, 12 Jan 2024 21:07:29 +0100 Subject: [PATCH 09/36] Fix usage string. --- tuning/tune.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tuning/tune.py b/tuning/tune.py index 42c66ce59..4619e9a6c 100644 --- a/tuning/tune.py +++ b/tuning/tune.py @@ -104,7 +104,7 @@ def __call__( def make_parser() -> argparse.ArgumentParser: - example_usage = "python -m imitation.scripts.tune pc seals_swimmer" + example_usage = "python tune.py pc seals_swimmer" possible_named_configs = "\n".join( f" - {algo}: {', '.join(objective.sacred_ex.named_configs.keys())}" for algo, objective in objectives_by_algo.items() From 865c1c5b36b2d63f551b290157ade498e8ef49a8 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Fri, 12 Jan 2024 21:08:56 +0100 Subject: [PATCH 10/36] First draft of tune_on_slurm.sh --- tuning/tune_on_slurm.sh | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 tuning/tune_on_slurm.sh diff --git a/tuning/tune_on_slurm.sh b/tuning/tune_on_slurm.sh new file mode 100644 index 000000000..a7750a4bb --- /dev/null +++ b/tuning/tune_on_slurm.sh @@ -0,0 +1,25 @@ +#!/bin/bash +#SBATCH --array=1-10 +# Avoid cluttering the root directory with log files: +#SBATCH --output=slurm/%A_%a.out +#SBATCH --cpus-per-task=8 +#SBATCH --gpus=0 +#SBATCH --mem=8gb +#SBATCH --time=70:00:00 +#SBATCH --qos=scavenger + +# This script assumes that you set up imitation in your NAS home directory and +# installed it in a venv located in the imitation directory. + +# Call this script with the parameters to be passed to the +# tune.py script. + +cd "/nas/ucb/$(whoami)/" || exit +source imitation/venv/bin/activate + +# Note: we run each worker in a separate working directory to avoid race +# conditions when writing sacred outputs to the same folder. +mkdir workdir_"$1"_"$2"_"$SLURM_ARRAY_TASK_ID" +cd workdir_"$1"_"$2"_"$SLURM_ARRAY_TASK_ID" || exit + +srun python ../imitation/tuning/tune.py -j ../"$1"_"$2".log "$1" "$2" From 8e31713848f25beb84a2d7175450bb6e9299dbe7 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Tue, 16 Jan 2024 17:08:37 +0100 Subject: [PATCH 11/36] Move hyperparameter search space definitoins to a separate file. --- tuning/hp_search_spaces.py | 114 +++++++++++++++++++++++++++++++++++++ tuning/tune.py | 100 +------------------------------- 2 files changed, 115 insertions(+), 99 deletions(-) create mode 100644 tuning/hp_search_spaces.py diff --git a/tuning/hp_search_spaces.py b/tuning/hp_search_spaces.py new file mode 100644 index 000000000..304f64eaa --- /dev/null +++ b/tuning/hp_search_spaces.py @@ -0,0 +1,114 @@ +"""Definitions for search spaces used when tuning hyperparameters. + +To add a new search space, add a new entry to the `objectives_by_algo` dict. +The key should be the name of the algorithm, and the value should be a RunSacredAsTrial +object that specifies what sacred experiment to run and how to sample hyperparameters. 
+ +Note: you could specify multiple search spaces for the same algorithm. Just make sure +to give them different names, and then specify the name when running the tuning script. +For example, to use different spaces for different classes of environments, you could +have a "pc-classic-control" and a "pc-mujoco" search space. +""" + +import dataclasses +from typing import Callable, List, Mapping, Any, Dict + +import optuna +import sacred + +import imitation.scripts.train_preference_comparisons + + +@dataclasses.dataclass +class RunSacredAsTrial: + """Runs a sacred experiment as an optuna trial. + + Assumes that the sacred experiment returns a dict with a key 'imit_stats' that + contains a dict with a key 'monitor_return_mean'. + """ + + """The sacred experiment to run.""" + sacred_ex: sacred.Experiment + + """A function that returns a list of named configs to pass to sacred.run.""" + suggest_named_configs: Callable[[optuna.Trial], List[str]] + + """A function that returns a dict of config updates to pass to sacred.run.""" + suggest_config_updates: Callable[[optuna.Trial], Mapping[str, Any]] + + def __call__( + self, + trial: optuna.Trial, + run_options: Dict, + extra_named_configs: List[str] + ) -> float: + """Run the sacred experiment and return the performance. + + Args: + trial: The optuna trial to sample hyperparameters for. + run_options: Options to pass to sacred.run(options=). + extra_named_configs: Additional named configs to pass to sacred.run. + """ + + config_updates = self.suggest_config_updates(trial) + named_configs = self.suggest_named_configs(trial) + extra_named_configs + + trial.set_user_attr("config_updates", config_updates) + trial.set_user_attr("named_configs", named_configs) + + experiment: sacred.Experiment = self.sacred_ex + result = experiment.run( + config_updates=config_updates, + named_configs=named_configs, + options=run_options, + ) + if result.status != "COMPLETED": + raise RuntimeError( + f"Trial failed with {result.fail_trace()} and status {result.status}." 
+ ) + return result.result['imit_stats']['monitor_return_mean'] + + +"""A mapping from algorithm names to functions that run the algorithm as an optuna trial.""" +objectives_by_algo = dict( + pc=RunSacredAsTrial( + sacred_ex=imitation.scripts.train_preference_comparisons.train_preference_comparisons_ex, + suggest_named_configs=lambda _: ["reward.reward_ensemble"], + suggest_config_updates=lambda trial: { + "seed": trial.number, + "environment": {"num_vec": 1}, + "total_timesteps": 2e7, + "total_comparisons": 1000, + "active_selection": True, + "active_selection_oversampling": trial.suggest_int("active_selection_oversampling", 1, 11), + "comparison_queue_size": trial.suggest_int("comparison_queue_size", 1, 1001), # upper bound determined by total_comparisons=1000 + "exploration_frac": trial.suggest_float("exploration_frac", 0.0, 0.5), + "fragment_length": trial.suggest_int("fragment_length", 1, 1001), # trajectories are 1000 steps long + "gatherer_kwargs": { + "temperature": trial.suggest_float("gatherer_temperature", 0.0, 2.0), + "discount_factor": trial.suggest_float("gatherer_discount_factor", 0.95, 1.0), + "sample": trial.suggest_categorical("gatherer_sample", [True, False]), + }, + "initial_comparison_frac": trial.suggest_float("initial_comparison_frac", 0.01, 1.0), + "num_iterations": trial.suggest_int("num_iterations", 1, 51), + "preference_model_kwargs": { + "noise_prob": trial.suggest_float("preference_model_noise_prob", 0.0, 0.1), + "discount_factor": trial.suggest_float("preference_model_discount_factor", 0.95, 1.0), + }, + "query_schedule": trial.suggest_categorical("query_schedule", ["hyperbolic", "constant", "inverse_quadratic"]), + "trajectory_generator_kwargs": { + "switch_prob": trial.suggest_float("tr_gen_switch_prob", 0.1, 1), + "random_prob": trial.suggest_float("tr_gen_random_prob", 0.1, 0.9), + }, + "transition_oversampling": trial.suggest_float("transition_oversampling", 0.9, 2.0), + "reward_trainer_kwargs": { + "epochs": trial.suggest_int("reward_trainer_epochs", 1, 11), + }, + "rl": { + "rl_kwargs": { + "ent_coef": trial.suggest_float("rl_ent_coef", 1e-7, 1e-3, log=True), + }, + }, + }, + ), +) \ No newline at end of file diff --git a/tuning/tune.py b/tuning/tune.py index 4619e9a6c..1a12763a0 100644 --- a/tuning/tune.py +++ b/tuning/tune.py @@ -1,106 +1,8 @@ import argparse -import dataclasses -from typing import List, Mapping, Any, Callable, Dict import optuna -import sacred -import imitation.scripts.train_preference_comparisons - - -@dataclasses.dataclass -class RunSacredAsTrial: - """Runs a sacred experiment as an optuna trial. - - Assumes that the sacred experiment returns a dict with a key 'imit_stats' that - contains a dict with a key 'monitor_return_mean'. - """ - - """The sacred experiment to run.""" - sacred_ex: sacred.Experiment - - """A function that returns a list of named configs to pass to sacred.run.""" - suggest_named_configs: Callable[[optuna.Trial], List[str]] - - """A function that returns a dict of config updates to pass to sacred.run.""" - suggest_config_updates: Callable[[optuna.Trial], Mapping[str, Any]] - - def __call__( - self, - trial: optuna.Trial, - run_options: Dict, - extra_named_configs: List[str] - ) -> float: - """Run the sacred experiment and return the performance. - - Args: - trial: The optuna trial to sample hyperparameters for. - run_options: Options to pass to sacred.run(options=). - extra_named_configs: Additional named configs to pass to sacred.run. 
- """ - - config_updates = self.suggest_config_updates(trial) - named_configs = self.suggest_named_configs(trial) + extra_named_configs - - trial.set_user_attr("config_updates", config_updates) - trial.set_user_attr("named_configs", named_configs) - - experiment: sacred.Experiment = self.sacred_ex - result = experiment.run( - config_updates=config_updates, - named_configs=named_configs, - options=run_options, - ) - if result.status != "COMPLETED": - raise RuntimeError( - f"Trial failed with {result.fail_trace()} and status {result.status}." - ) - return result.result['imit_stats']['monitor_return_mean'] - - -"""A mapping from algorithm names to functions that run the algorithm as an optuna trial.""" -objectives_by_algo = dict( - pc=RunSacredAsTrial( - sacred_ex=imitation.scripts.train_preference_comparisons.train_preference_comparisons_ex, - suggest_named_configs=lambda _: ["reward.reward_ensemble"], - suggest_config_updates=lambda trial: { - "seed": trial.number, - "environment": {"num_vec": 1}, - "total_timesteps": 2e7, - "total_comparisons": 1000, - "active_selection": True, - "active_selection_oversampling": trial.suggest_int("active_selection_oversampling", 1, 11), - "comparison_queue_size": trial.suggest_int("comparison_queue_size", 1, 1001), # upper bound determined by total_comparisons=1000 - "exploration_frac": trial.suggest_float("exploration_frac", 0.0, 0.5), - "fragment_length": trial.suggest_int("fragment_length", 1, 1001), # trajectories are 1000 steps long - "gatherer_kwargs": { - "temperature": trial.suggest_float("gatherer_temperature", 0.0, 2.0), - "discount_factor": trial.suggest_float("gatherer_discount_factor", 0.95, 1.0), - "sample": trial.suggest_categorical("gatherer_sample", [True, False]), - }, - "initial_comparison_frac": trial.suggest_float("initial_comparison_frac", 0.01, 1.0), - "num_iterations": trial.suggest_int("num_iterations", 1, 51), - "preference_model_kwargs": { - "noise_prob": trial.suggest_float("preference_model_noise_prob", 0.0, 0.1), - "discount_factor": trial.suggest_float("preference_model_discount_factor", 0.95, 1.0), - }, - "query_schedule": trial.suggest_categorical("query_schedule", ["hyperbolic", "constant", "inverse_quadratic"]), - "trajectory_generator_kwargs": { - "switch_prob": trial.suggest_float("tr_gen_switch_prob", 0.1, 1), - "random_prob": trial.suggest_float("tr_gen_random_prob", 0.1, 0.9), - }, - "transition_oversampling": trial.suggest_float("transition_oversampling", 0.9, 2.0), - "reward_trainer_kwargs": { - "epochs": trial.suggest_int("reward_trainer_epochs", 1, 11), - }, - "rl": { - "rl_kwargs": { - "ent_coef": trial.suggest_float("rl_ent_coef", 1e-7, 1e-3, log=True), - }, - }, - }, - ), -) +from hp_search_spaces import objectives_by_algo def make_parser() -> argparse.ArgumentParser: From 2f8cb4dedfc77827dc855160e9dd999e971cf22c Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Thu, 18 Jan 2024 14:15:12 +0100 Subject: [PATCH 12/36] Add initial_epoch_multiplier to hyperparameter search space of PC --- tuning/hp_search_spaces.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tuning/hp_search_spaces.py b/tuning/hp_search_spaces.py index 304f64eaa..d5913272a 100644 --- a/tuning/hp_search_spaces.py +++ b/tuning/hp_search_spaces.py @@ -89,6 +89,7 @@ def __call__( "discount_factor": trial.suggest_float("gatherer_discount_factor", 0.95, 1.0), "sample": trial.suggest_categorical("gatherer_sample", [True, False]), }, + "initial_epoch_multiplier": trial.suggest_float("initial_epoch_multiplier", 1, 200.0), 
"initial_comparison_frac": trial.suggest_float("initial_comparison_frac", 0.01, 1.0), "num_iterations": trial.suggest_int("num_iterations", 1, 51), "preference_model_kwargs": { From d9c4b57c22fa01914c904d6fd0a7bb457c4b8e44 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Thu, 18 Jan 2024 14:41:39 +0100 Subject: [PATCH 13/36] Speed up trials by vectorizing environments --- tuning/hp_search_spaces.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tuning/hp_search_spaces.py b/tuning/hp_search_spaces.py index d5913272a..60f0cd0ab 100644 --- a/tuning/hp_search_spaces.py +++ b/tuning/hp_search_spaces.py @@ -76,7 +76,7 @@ def __call__( suggest_named_configs=lambda _: ["reward.reward_ensemble"], suggest_config_updates=lambda trial: { "seed": trial.number, - "environment": {"num_vec": 1}, + "environment": {"num_vec": 8}, "total_timesteps": 2e7, "total_comparisons": 1000, "active_selection": True, From be085d30eee747ceb6c202ab631aeb37aa7a34f8 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Thu, 18 Jan 2024 14:57:56 +0100 Subject: [PATCH 14/36] Improve tune_on_slum.sh --- tuning/tune_on_slurm.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tuning/tune_on_slurm.sh b/tuning/tune_on_slurm.sh index a7750a4bb..b7f34f920 100644 --- a/tuning/tune_on_slurm.sh +++ b/tuning/tune_on_slurm.sh @@ -1,12 +1,13 @@ #!/bin/bash -#SBATCH --array=1-10 +#SBATCH --array=1-100 # Avoid cluttering the root directory with log files: -#SBATCH --output=slurm/%A_%a.out +#SBATCH --output=%A/%a/cout.txt #SBATCH --cpus-per-task=8 #SBATCH --gpus=0 #SBATCH --mem=8gb #SBATCH --time=70:00:00 #SBATCH --qos=scavenger +#SBATCH --export=ALL # This script assumes that you set up imitation in your NAS home directory and # installed it in a venv located in the imitation directory. @@ -14,12 +15,11 @@ # Call this script with the parameters to be passed to the # tune.py script. -cd "/nas/ucb/$(whoami)/" || exit -source imitation/venv/bin/activate +source "/nas/ucb/$(whoami)/imitation/venv/bin/activate" # Note: we run each worker in a separate working directory to avoid race # conditions when writing sacred outputs to the same folder. -mkdir workdir_"$1"_"$2"_"$SLURM_ARRAY_TASK_ID" -cd workdir_"$1"_"$2"_"$SLURM_ARRAY_TASK_ID" || exit +mkdir -p "$SLURM_ARRAY_JOB_ID"/"$SLURM_ARRAY_TASK_ID" +cd "$SLURM_ARRAY_JOB_ID"/"$SLURM_ARRAY_TASK_ID" || exit -srun python ../imitation/tuning/tune.py -j ../"$1"_"$2".log "$1" "$2" +srun python ../../tune.py --num_trials 400 -j ../"$1"_"$2".log "$1" "$2" \ No newline at end of file From efc2dbfcb35c3ecbef29b920c65a961ea14f2249 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Mon, 22 Jan 2024 16:46:15 +0100 Subject: [PATCH 15/36] Add script to tune PC on all environments. 
--- tuning/tune_all_on_slurm.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 tuning/tune_all_on_slurm.sh diff --git a/tuning/tune_all_on_slurm.sh b/tuning/tune_all_on_slurm.sh new file mode 100644 index 000000000..7787cc138 --- /dev/null +++ b/tuning/tune_all_on_slurm.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +sbatch --job-name=tuning_pc_on_cartpole tune_on_slurm.sh pc cartpole +sbatch --job-name=tuning_pc_on_seals_ant tune_on_slurm.sh pc seals_ant +sbatch --job-name=tuning_pc_on_seals_half_cheetah tune_on_slurm.sh pc seals_half_cheetah +sbatch --job-name=tuning_pc_on_seals_hopper tune_on_slurm.sh pc seals_hopper +sbatch --job-name=tuning_pc_on_seals_swimmer tune_on_slurm.sh pc seals_swimmer +sbatch --job-name=tuning_pc_on_seals_walker tune_on_slurm.sh pc seals_walker +sbatch --job-name=tuning_pc_on_seals_humanoid tune_on_slurm.sh pc seals_humanoid +sbatch --job-name=tuning_pc_on_seals_cartpole tune_on_slurm.sh pc seals_cartpole +sbatch --job-name=tuning_pc_on_pendulum tune_on_slurm.sh pc pendulum +sbatch --job-name=tuning_pc_on_seals_mountain_car tune_on_slurm.sh pc seals_mountain_car \ No newline at end of file From 2520508b4db4805eb857ea066aa097d8e2e2c15a Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Mon, 22 Jan 2024 17:41:46 +0100 Subject: [PATCH 16/36] Improve documentation for hyperparameter tuning. --- tuning/README.md | 15 +++++++++++++++ tuning/hp_search_spaces.py | 7 +++++-- tuning/tune_on_slurm.sh | 31 +++++++++++++++++++++++++++++-- 3 files changed, 49 insertions(+), 4 deletions(-) diff --git a/tuning/README.md b/tuning/README.md index 068efcc6f..f49e54935 100644 --- a/tuning/README.md +++ b/tuning/README.md @@ -1,4 +1,19 @@ # Tuning Hyperparameters +This directory contains scripts for tuning hyperparameters for imitation learning algorithms. +Additional helper scripts allow for running multiple tuning jobs in parallel on a SLURM cluster. + +Use `tune.py` to tune hyperparameters for a single algorithm and environment using Optuna. +If you want to specify a custom algorithm and search space, add it to the dict in `hp_search_spaces.py`. + +You can tune using multiple workers in parallel by running multiple instances of `tune.py` that all point to the same journal log file (see `tune.py --help` for details). +To easily launch multiple workers on a SLURM cluster and ensure they don't conflict with each other, +use the `tune_on_slurm.py` script. +This script will launch a SLURM job array with the specified number of workers. +If you want to tune all algorithms on all environments on SLURM, use `tune_all_on_slurm.sh`. + +# Legacy Tuning Scripts + +Note: There are some legacy tuning scripts that can be used like this: The hyperparameters of any algorithm in imitation can be tuned using `src/imitation/scripts/tuning.py`. The benchmarking hyperparameter configs were generated by tuning the hyperparameters using diff --git a/tuning/hp_search_spaces.py b/tuning/hp_search_spaces.py index 60f0cd0ab..aabd8e056 100644 --- a/tuning/hp_search_spaces.py +++ b/tuning/hp_search_spaces.py @@ -4,10 +4,13 @@ The key should be the name of the algorithm, and the value should be a RunSacredAsTrial object that specifies what sacred experiment to run and how to sample hyperparameters. -Note: you could specify multiple search spaces for the same algorithm. Just make sure -to give them different names, and then specify the name when running the tuning script. +Note: you could specify multiple search spaces for the same algorithm. 
Make sure to give +them different names, and then specify the name when running the tuning script. For example, to use different spaces for different classes of environments, you could have a "pc-classic-control" and a "pc-mujoco" search space. +Note: avoid using underscores in the search space names, as they are used to separate +the algorithm name from the search space name when inferring the algorithm name from +the study name. """ import dataclasses diff --git a/tuning/tune_on_slurm.sh b/tuning/tune_on_slurm.sh index b7f34f920..451a94d10 100644 --- a/tuning/tune_on_slurm.sh +++ b/tuning/tune_on_slurm.sh @@ -9,11 +9,38 @@ #SBATCH --qos=scavenger #SBATCH --export=ALL +# DESCRIPTION: +# This script is used to tune the hyperparameters of an algorithm on a given +# environment in parallel on a SLURM cluster with 400 trials and 100 workers. + +# PREREQUISITES: # This script assumes that you set up imitation in your NAS home directory and # installed it in a venv located in the imitation directory. +# /nas/ucb/(your username)/imitation/venv/ +# Do this by running the following commands: +# cd /nas/ucb/(your username)/ +# git clone https://github.com/HumanCompatibleAI/imitation.git +# srun python3 -m venv venv +# source venv/bin/activate +# srun pip install -e . +# It is important to set up the venv using srun to ensure that the venv is working +# properly on the compute nodes. + +# USAGE: +# Run this script with sbatch and pass it the algorithm and the environment +# named-config. For example, to tune PC on CartPole, run: +# sbatch --job-name=tuning_pc_on_cartpole tune_on_slurm.sh pc cartpole +# To change the number of workers, change the --array parameter above. +# To change the number of trials, change the --num_trials parameter below. +# Supported are all algorithms and environments that are supported by the tune.py +# Run tune.py --help for more information. -# Call this script with the parameters to be passed to the -# tune.py script. +# OUTPUT: +# This script creates a folder with the name of the SLURM job ID and a numbered +# subfolder for each worker: / +# The main folder contains the optuna journal for synchronizing the workers. +# Each worker is executed within it's own subfolder to ensure that their outputs +# do not conflict with each other. The output of each worker is written to a cout.txt. source "/nas/ucb/$(whoami)/imitation/venv/bin/activate" From 147c996a89e379c74013c7a2df05b065f33a1178 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Mon, 22 Jan 2024 17:52:57 +0100 Subject: [PATCH 17/36] Add script to re-run the best trials from a hyperparameter tuning run. --- tuning/tune.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tuning/tune.py b/tuning/tune.py index 1a12763a0..cfd3f2bf6 100644 --- a/tuning/tune.py +++ b/tuning/tune.py @@ -1,3 +1,4 @@ +"""Script to tune hyperparameters for imitation learning algorithms using optuna.""" import argparse import optuna From b2a98d75fe05d622e3673318eafde34798783688 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Mon, 5 Feb 2024 15:18:59 +0100 Subject: [PATCH 18/36] Add gc_after_trial flag to avoid memory issues. 
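Each trial builds a full preference-comparison run (SB3 learner, reward networks, vectorized envs), and in a long-lived worker process garbage from finished trials can accumulate. Optuna's `gc_after_trial=True` runs `gc.collect()` after every trial. A minimal, self-contained sketch of the flag; the objective below is a stand-in, not the real sacred objective used in `tune.py`:

```python
import optuna


def objective(trial: optuna.Trial) -> float:
    # Stand-in objective; a real trial would allocate large models and environments.
    x = trial.suggest_float("x", -10.0, 10.0)
    return -(x ** 2)


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=5, gc_after_trial=True)  # collect garbage after each trial
```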
--- tuning/tune.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tuning/tune.py b/tuning/tune.py index cfd3f2bf6..76b9076c8 100644 --- a/tuning/tune.py +++ b/tuning/tune.py @@ -77,7 +77,8 @@ def main(): run_options={"--name": study.study_name, "--file_storage": "sacred"}, extra_named_configs=args.named_configs ), - callbacks=[optuna.study.MaxTrialsCallback(args.num_trials)] + callbacks=[optuna.study.MaxTrialsCallback(args.num_trials)], + gc_after_trial=True, ) From 6f2e30cee07fe58bdd5c6c1af2042b3b386e4c58 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Mon, 5 Feb 2024 15:53:11 +0100 Subject: [PATCH 19/36] Add feature to re-run a tuning run. --- tuning/tune_on_slurm.sh | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/tuning/tune_on_slurm.sh b/tuning/tune_on_slurm.sh index 451a94d10..cab9dec96 100644 --- a/tuning/tune_on_slurm.sh +++ b/tuning/tune_on_slurm.sh @@ -1,7 +1,7 @@ #!/bin/bash #SBATCH --array=1-100 # Avoid cluttering the root directory with log files: -#SBATCH --output=%A/%a/cout.txt +#SBATCH --output=%x/%a/cout.txt #SBATCH --cpus-per-task=8 #SBATCH --gpus=0 #SBATCH --mem=8gb @@ -36,17 +36,35 @@ # Run tune.py --help for more information. # OUTPUT: -# This script creates a folder with the name of the SLURM job ID and a numbered -# subfolder for each worker: / -# The main folder contains the optuna journal for synchronizing the workers. +# This script creates a folder with the name of the SLURM job a numbered subfolder for +# each worker: / +# The main folder contains the optuna journal .log for synchronizing the workers. # Each worker is executed within it's own subfolder to ensure that their outputs # do not conflict with each other. The output of each worker is written to a cout.txt. +# CONTINUING A TUNING RUN: +# Often it is desirable to continue an existing job or add more workers to it while it +# is running. Just run run this batch job again but change the --array parameter to +# ensure that the new workers do not conflict with the old ones. E.g. if you first ran +# the batch script with --array=1-100 (the default), a subsequent run should be launched +# with the --array=101-150 (for another 50 workers). For this you do not need to modify +# this file. You can pass it to sbatch to override. + source "/nas/ucb/$(whoami)/imitation/venv/bin/activate" -# Note: we run each worker in a separate working directory to avoid race -# conditions when writing sacred outputs to the same folder. -mkdir -p "$SLURM_ARRAY_JOB_ID"/"$SLURM_ARRAY_TASK_ID" -cd "$SLURM_ARRAY_JOB_ID"/"$SLURM_ARRAY_TASK_ID" || exit +mkdir -p "$SLURM_JOB_NAME" + +if [ -d "$SLURM_JOB_NAME"/"$SLURM_ARRAY_TASK_ID" ]; then + echo "The study folder for $SLURM_JOB_NAME already contains a folder for job $SLURM_ARRAY_TASK_ID!" + echo "Are you trying to continue on an existing study? Then adapt the sbatch array range!" + echo "E.g. if the highest folder number in $SLURM_JOB_NAME/ is 100 and you want to continue the study with another 50 runners, start this script using `sbatch --job-name=$SLURM_JOB_NAME --array=101-50 tune_on_slurm.sh $1 $2`" + exit 1 +else + # Note: we run each worker in a separate working directory to avoid race + # conditions when writing sacred outputs to the same folder. 
+ mkdir "$SLURM_JOB_NAME"/"$SLURM_ARRAY_TASK_ID" +fi + +cd "$SLURM_JOB_NAME"/"$SLURM_ARRAY_TASK_ID" || exit -srun python ../../tune.py --num_trials 400 -j ../"$1"_"$2".log "$1" "$2" \ No newline at end of file +srun python ../../tune.py --num_trials 400 -j ../optuna_study.log "$1" "$2" \ No newline at end of file From 454c393d51d4d4c680403a7880466ae8c82bbf82 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Mon, 5 Feb 2024 17:21:32 +0100 Subject: [PATCH 20/36] Write output of sbatch script and output of training script to separate files. --- tuning/tune_on_slurm.sh | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tuning/tune_on_slurm.sh b/tuning/tune_on_slurm.sh index cab9dec96..6a6cc84df 100644 --- a/tuning/tune_on_slurm.sh +++ b/tuning/tune_on_slurm.sh @@ -1,7 +1,7 @@ #!/bin/bash #SBATCH --array=1-100 # Avoid cluttering the root directory with log files: -#SBATCH --output=%x/%a/cout.txt +#SBATCH --output=%x/%a/sbatch_cout.txt #SBATCH --cpus-per-task=8 #SBATCH --gpus=0 #SBATCH --mem=8gb @@ -52,9 +52,10 @@ source "/nas/ucb/$(whoami)/imitation/venv/bin/activate" -mkdir -p "$SLURM_JOB_NAME" - -if [ -d "$SLURM_JOB_NAME"/"$SLURM_ARRAY_TASK_ID" ]; then +if [ -f "$SLURM_JOB_NAME/$SLURM_ARRAY_TASK_ID/cout.txt" ]; then + # Note: this will just be written to sbatch_cout.txt and not to cout.txt to avoid + # overriding existing cout.txt files. Unfortunately sbatch won't print this for us + # so it is not very useful information. echo "The study folder for $SLURM_JOB_NAME already contains a folder for job $SLURM_ARRAY_TASK_ID!" echo "Are you trying to continue on an existing study? Then adapt the sbatch array range!" echo "E.g. if the highest folder number in $SLURM_JOB_NAME/ is 100 and you want to continue the study with another 50 runners, start this script using `sbatch --job-name=$SLURM_JOB_NAME --array=101-50 tune_on_slurm.sh $1 $2`" @@ -62,9 +63,9 @@ if [ -d "$SLURM_JOB_NAME"/"$SLURM_ARRAY_TASK_ID" ]; then else # Note: we run each worker in a separate working directory to avoid race # conditions when writing sacred outputs to the same folder. - mkdir "$SLURM_JOB_NAME"/"$SLURM_ARRAY_TASK_ID" + mkdir -p "$SLURM_JOB_NAME/$SLURM_ARRAY_TASK_ID" fi -cd "$SLURM_JOB_NAME"/"$SLURM_ARRAY_TASK_ID" || exit +cd "$SLURM_JOB_NAME/$SLURM_ARRAY_TASK_ID" || exit -srun python ../../tune.py --num_trials 400 -j ../optuna_study.log "$1" "$2" \ No newline at end of file +srun --output=%x/%a/cout.txt python ../../tune.py --num_trials 400 -j ../optuna_study.log "$1" "$2" \ No newline at end of file From 5012b157873d006affb7ff560472e3e79ec364f0 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Mon, 5 Feb 2024 17:53:31 +0100 Subject: [PATCH 21/36] Add scripts to rerun the top trials of a hyperparameter sweep. 
--- tuning/rerun_best_trial.py | 108 +++++++++++++++++++++++++++++++++++++ tuning/rerun_on_slurm.sh | 50 +++++++++++++++++ 2 files changed, 158 insertions(+) create mode 100644 tuning/rerun_best_trial.py create mode 100644 tuning/rerun_on_slurm.sh diff --git a/tuning/rerun_best_trial.py b/tuning/rerun_best_trial.py new file mode 100644 index 000000000..7467729a4 --- /dev/null +++ b/tuning/rerun_best_trial.py @@ -0,0 +1,108 @@ +"""Script to re-run the best trials from a previous hyperparameter tuning run.""" +import argparse +import random +from typing import List, Tuple + +import optuna +import sacred + +import hp_search_spaces + + +def make_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description= + "Re-run the best trials from a previous tuning run.", + epilog=f"Example usage:\n" + f"python rerun_best_trials.py tuning_run.json\n", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "--algo", + type=str, + default=None, + choices=hp_search_spaces.objectives_by_algo.keys(), + help="The algorithm that has been tuned. " + "Can usually be deduced from the study name.", + ) + parser.add_argument( + "--top-k", + type=int, + default=1, + help="Chooses the kth best trial to re-run." + ) + parser.add_argument( + "journal_log", + type=str, + help="The optuna journal file of the previous tuning run." + ) + parser.add_argument( + "--seed", + type=int, + default=random.randint(0, 2**32 - 1), + help="The seed to use for the re-run. A random seed is used by default." + ) + return parser + + +def infer_algo_name(study: optuna.Study) -> Tuple[str, List[str]]: + """Infer the algo name from the study name. + + Assumes that the study name is of the form "tuning_{algo}_with_{named_configs}". + """ + assert study.study_name.startswith("tuning_") + assert "_with_" in study.study_name + return study.study_name[len("tuning_"):].split("_with_")[0] + + +def get_top_k_trial(study: optuna.Study, k: int) -> optuna.trial.Trial: + if k <= 0: + raise ValueError(f"--top-k must be positive, but is {k}.") + finished_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE] + if len(finished_trials) == 0: + raise ValueError("No trials have completed.") + if len(finished_trials) < k: + raise ValueError( + f"Only {len(finished_trials)} trials have completed, but --top-k is {k}." + ) + + return sorted( + finished_trials, + key=lambda t: t.value, reverse=True, + )[k-1] + + +def main(): + parser = make_parser() + args = parser.parse_args() + study: optuna.Study = optuna.load_study( + storage=optuna.storages.JournalStorage( + optuna.storages.JournalFileStorage(args.journal_log) + ), + # in our case, we have one journal file per study so the study name can be + # inferred + study_name=None, + ) + trial = get_top_k_trial(study, args.top_k) + + print(trial.value, trial.params) + + algo_name = args.algo or infer_algo_name(study) + sacred_experiment: sacred.Experiment = hp_search_spaces.objectives_by_algo[algo_name].sacred_ex + + config_updates = trial.user_attrs["config_updates"].copy() + config_updates["seed"] = args.seed + result = sacred_experiment.run( + config_updates=config_updates, + named_configs=trial.user_attrs["named_configs"], + options={"--name": study.study_name, "--file_storage": "sacred"}, + + ) + if result.status != "COMPLETED": + raise RuntimeError( + f"Trial failed with {result.fail_trace()} and status {result.status}." 
+ ) + + +if __name__ == '__main__': + main() diff --git a/tuning/rerun_on_slurm.sh b/tuning/rerun_on_slurm.sh new file mode 100644 index 000000000..1f684b79b --- /dev/null +++ b/tuning/rerun_on_slurm.sh @@ -0,0 +1,50 @@ +#!/bin/bash +#SBATCH --array=1-5 +# Avoid cluttering the root directory with log files: +#SBATCH --output=%x/%a/sbatch_cout.txt +#SBATCH --cpus-per-task=8 +#SBATCH --gpus=0 +#SBATCH --mem=8gb +#SBATCH --time=70:00:00 +#SBATCH --qos=scavenger +#SBATCH --export=ALL + +# DESCRIPTION: +# Reruns the top trials from a previous hyperparameter sweep. + +# PREREQUISITES: +# A folder with a hyperparameter sweep as started by tune_on_slurm.sh. + +# USAGE: +# sbatch rerun_on_slurm +# +# Picks the top-k trial from the optuna study in and reruns them with +# the same hyperparameters but different seeds. + +# OUTPUT: +# Creates a subfolder in the given tune_folder for each worker: +# /reruns/top_/ +# The output of each worker is written to a cout.txt. + + +source "/nas/ucb/$(whoami)/imitation/venv/bin/activate" + +if [ -z $2 ]; then + top_k=1 +else + top_k=$2 +fi + +worker_dir="$1/reruns/top_$top_k/$SLURM_ARRAY_TASK_ID/" + +if [ -f "$worker_dir/cout.txt" ]; then + exit 1 +else + # Note: we run each worker in a separate working directory to avoid race + # conditions when writing sacred outputs to the same folder. + mkdir -p "$worker_dir" +fi + +cd "$worker_dir" || exit + +srun --output="$worker_dir/cout.txt" python ../../rerun_on_slurm.py "$1/optuna_study.log" --top_k "$top_k" --seed "$SLURM_ARRAY_TASK_ID" From 2c978af1734f06a32eaf90b4f50b83f1370ce440 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Tue, 13 Feb 2024 09:34:23 +0100 Subject: [PATCH 22/36] Fix documentation and cout.txt placement in tune_on_slurm.sh --- tuning/tune_on_slurm.sh | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tuning/tune_on_slurm.sh b/tuning/tune_on_slurm.sh index 6a6cc84df..914f450af 100644 --- a/tuning/tune_on_slurm.sh +++ b/tuning/tune_on_slurm.sh @@ -30,17 +30,21 @@ # Run this script with sbatch and pass it the algorithm and the environment # named-config. For example, to tune PC on CartPole, run: # sbatch --job-name=tuning_pc_on_cartpole tune_on_slurm.sh pc cartpole -# To change the number of workers, change the --array parameter above. +# To change the number of workers, change the --array parameter above +# or pass the --array argument to sbatch. # To change the number of trials, change the --num_trials parameter below. # Supported are all algorithms and environments that are supported by the tune.py # Run tune.py --help for more information. # OUTPUT: -# This script creates a folder with the name of the SLURM job a numbered subfolder for +# This script creates a folder with the name of the SLURM job a numbered sub-folder for # each worker: / # The main folder contains the optuna journal .log for synchronizing the workers. -# Each worker is executed within it's own subfolder to ensure that their outputs -# do not conflict with each other. The output of each worker is written to a cout.txt. +# It is suitable to place this log on a nfs drive shared among all workers. +# Each worker is executed within it's own sub-folder to ensure that their outputs +# do not conflict with each other. +# The output of each worker is written to a cout.txt. +# The output of the sbatch command is written to sbatch_cout.txt. 
# CONTINUING A TUNING RUN: # Often it is desirable to continue an existing job or add more workers to it while it @@ -68,4 +72,4 @@ fi cd "$SLURM_JOB_NAME/$SLURM_ARRAY_TASK_ID" || exit -srun --output=%x/%a/cout.txt python ../../tune.py --num_trials 400 -j ../optuna_study.log "$1" "$2" \ No newline at end of file +srun --output=cout.txt python ../../tune.py --num_trials 400 -j ../optuna_study.log "$1" "$2" \ No newline at end of file From 7178fee3bc37b52089bdfdd5e57bbb9a60f6595e Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Mon, 19 Feb 2024 10:22:27 +0100 Subject: [PATCH 23/36] Increase default memory for cheetah and humanoid. --- tuning/tune_all_on_slurm.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tuning/tune_all_on_slurm.sh b/tuning/tune_all_on_slurm.sh index 7787cc138..19324b3b4 100644 --- a/tuning/tune_all_on_slurm.sh +++ b/tuning/tune_all_on_slurm.sh @@ -2,11 +2,11 @@ sbatch --job-name=tuning_pc_on_cartpole tune_on_slurm.sh pc cartpole sbatch --job-name=tuning_pc_on_seals_ant tune_on_slurm.sh pc seals_ant -sbatch --job-name=tuning_pc_on_seals_half_cheetah tune_on_slurm.sh pc seals_half_cheetah +sbatch --job-name=tuning_pc_on_seals_half_cheetah --mem=16gb tune_on_slurm.sh pc seals_half_cheetah sbatch --job-name=tuning_pc_on_seals_hopper tune_on_slurm.sh pc seals_hopper sbatch --job-name=tuning_pc_on_seals_swimmer tune_on_slurm.sh pc seals_swimmer sbatch --job-name=tuning_pc_on_seals_walker tune_on_slurm.sh pc seals_walker -sbatch --job-name=tuning_pc_on_seals_humanoid tune_on_slurm.sh pc seals_humanoid +sbatch --job-name=tuning_pc_on_seals_humanoid --mem=32gb tune_on_slurm.sh pc seals_humanoid sbatch --job-name=tuning_pc_on_seals_cartpole tune_on_slurm.sh pc seals_cartpole sbatch --job-name=tuning_pc_on_pendulum tune_on_slurm.sh pc pendulum sbatch --job-name=tuning_pc_on_seals_mountain_car tune_on_slurm.sh pc seals_mountain_car \ No newline at end of file From 263db3e50c881083d227bd7d0342d05c647b7c88 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Mon, 19 Feb 2024 10:30:13 +0100 Subject: [PATCH 24/36] Remove top-k feature when re-running trials. --- tuning/rerun_best_trial.py | 31 +++---------------------------- tuning/rerun_on_slurm.sh | 24 +++++++++++------------- 2 files changed, 14 insertions(+), 41 deletions(-) diff --git a/tuning/rerun_best_trial.py b/tuning/rerun_best_trial.py index 7467729a4..ed269c30f 100644 --- a/tuning/rerun_best_trial.py +++ b/tuning/rerun_best_trial.py @@ -12,7 +12,7 @@ def make_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser( description= - "Re-run the best trials from a previous tuning run.", + "Re-run the best trial from a previous tuning run.", epilog=f"Example usage:\n" f"python rerun_best_trials.py tuning_run.json\n", formatter_class=argparse.RawDescriptionHelpFormatter, @@ -25,12 +25,6 @@ def make_parser() -> argparse.ArgumentParser: help="The algorithm that has been tuned. " "Can usually be deduced from the study name.", ) - parser.add_argument( - "--top-k", - type=int, - default=1, - help="Chooses the kth best trial to re-run." - ) parser.add_argument( "journal_log", type=str, @@ -45,7 +39,7 @@ def make_parser() -> argparse.ArgumentParser: return parser -def infer_algo_name(study: optuna.Study) -> Tuple[str, List[str]]: +def infer_algo_name(study: optuna.Study) -> str: """Infer the algo name from the study name. Assumes that the study name is of the form "tuning_{algo}_with_{named_configs}". 
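To make the naming convention that infer_algo_name relies on concrete, a made-up study name decomposes as sketched below; the concrete names are whatever the tuning run registered and are assumptions here.

    study_name = "tuning_pc_with_seals_half_cheetah"        # "tuning_<algo>_with_<named_configs>"
    algo = study_name[len("tuning_"):].split("_with_")[0]   # -> "pc"
    named_configs_part = study_name.split("_with_", 1)[1]   # -> "seals_half_cheetah"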
@@ -55,23 +49,6 @@ def infer_algo_name(study: optuna.Study) -> Tuple[str, List[str]]: return study.study_name[len("tuning_"):].split("_with_")[0] -def get_top_k_trial(study: optuna.Study, k: int) -> optuna.trial.Trial: - if k <= 0: - raise ValueError(f"--top-k must be positive, but is {k}.") - finished_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE] - if len(finished_trials) == 0: - raise ValueError("No trials have completed.") - if len(finished_trials) < k: - raise ValueError( - f"Only {len(finished_trials)} trials have completed, but --top-k is {k}." - ) - - return sorted( - finished_trials, - key=lambda t: t.value, reverse=True, - )[k-1] - - def main(): parser = make_parser() args = parser.parse_args() @@ -83,9 +60,7 @@ def main(): # inferred study_name=None, ) - trial = get_top_k_trial(study, args.top_k) - - print(trial.value, trial.params) + trial = study.best_trial algo_name = args.algo or infer_algo_name(study) sacred_experiment: sacred.Experiment = hp_search_spaces.objectives_by_algo[algo_name].sacred_ex diff --git a/tuning/rerun_on_slurm.sh b/tuning/rerun_on_slurm.sh index 1f684b79b..d1cb9d301 100644 --- a/tuning/rerun_on_slurm.sh +++ b/tuning/rerun_on_slurm.sh @@ -1,7 +1,7 @@ #!/bin/bash #SBATCH --array=1-5 # Avoid cluttering the root directory with log files: -#SBATCH --output=%x/%a/sbatch_cout.txt +#SBATCH --output=%x/reruns/%a/sbatch_cout.txt #SBATCH --cpus-per-task=8 #SBATCH --gpus=0 #SBATCH --mem=8gb @@ -16,28 +16,26 @@ # A folder with a hyperparameter sweep as started by tune_on_slurm.sh. # USAGE: -# sbatch rerun_on_slurm +# sbatch --job-name= rerun_on_slurm.sh # -# Picks the top-k trial from the optuna study in and reruns them with +# Picks the best trial from the optuna study in and reruns them with # the same hyperparameters but different seeds. # OUTPUT: -# Creates a subfolder in the given tune_folder for each worker: -# /reruns/top_/ +# Creates a sub-folder in the given tune_folder for each worker: +# /reruns/ # The output of each worker is written to a cout.txt. source "/nas/ucb/$(whoami)/imitation/venv/bin/activate" -if [ -z $2 ]; then - top_k=1 -else - top_k=$2 -fi - -worker_dir="$1/reruns/top_$top_k/$SLURM_ARRAY_TASK_ID/" +worker_dir="$SLURM_JOB_NAME/reruns/$SLURM_ARRAY_TASK_ID/" if [ -f "$worker_dir/cout.txt" ]; then + # This indicates that there is already a worker running in that directory. + # So we better abort! + echo "There is already a worker running in this directory. \ + Try different seeds by picking a different array range!" exit 1 else # Note: we run each worker in a separate working directory to avoid race @@ -47,4 +45,4 @@ fi cd "$worker_dir" || exit -srun --output="$worker_dir/cout.txt" python ../../rerun_on_slurm.py "$1/optuna_study.log" --top_k "$top_k" --seed "$SLURM_ARRAY_TASK_ID" +srun --output="$worker_dir/cout.txt" python ../../../rerun_best_trial.py "$SLURM_JOB_NAME/optuna_study.log" --seed "$SLURM_ARRAY_TASK_ID" From 7311d1c62aebab4f3b2a07a8f4fbd71bc4cecec4 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Mon, 19 Feb 2024 11:01:05 +0100 Subject: [PATCH 25/36] Fix paths in rerun batch script. 
--- tuning/rerun_on_slurm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tuning/rerun_on_slurm.sh b/tuning/rerun_on_slurm.sh index d1cb9d301..d1e59df69 100644 --- a/tuning/rerun_on_slurm.sh +++ b/tuning/rerun_on_slurm.sh @@ -45,4 +45,4 @@ fi cd "$worker_dir" || exit -srun --output="$worker_dir/cout.txt" python ../../../rerun_best_trial.py "$SLURM_JOB_NAME/optuna_study.log" --seed "$SLURM_ARRAY_TASK_ID" +srun --output="cout.txt" python ../../../rerun_best_trial.py "../../optuna_study.log" --seed "$SLURM_ARRAY_TASK_ID" From 5769fc6ac2c1f70484a7a0e183755d2bab9bd433 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Mon, 26 Feb 2024 17:22:19 +0100 Subject: [PATCH 26/36] Add hyper parameters for SQIL. --- tuning/hp_search_spaces.py | 44 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/tuning/hp_search_spaces.py b/tuning/hp_search_spaces.py index aabd8e056..4f5a7ee9d 100644 --- a/tuning/hp_search_spaces.py +++ b/tuning/hp_search_spaces.py @@ -18,7 +18,9 @@ import optuna import sacred +import stable_baselines3 as sb3 +import imitation.scripts.train_imitation import imitation.scripts.train_preference_comparisons @@ -33,12 +35,16 @@ class RunSacredAsTrial: """The sacred experiment to run.""" sacred_ex: sacred.Experiment + """A function that returns a list of named configs to pass to sacred.run.""" suggest_named_configs: Callable[[optuna.Trial], List[str]] """A function that returns a dict of config updates to pass to sacred.run.""" suggest_config_updates: Callable[[optuna.Trial], Mapping[str, Any]] + """Command name to pass to sacred.run.""" + command_name: str = None + def __call__( self, trial: optuna.Trial, @@ -61,6 +67,7 @@ def __call__( experiment: sacred.Experiment = self.sacred_ex result = experiment.run( + command_name=self.command_name, config_updates=config_updates, named_configs=named_configs, options=run_options, @@ -115,4 +122,41 @@ def __call__( }, }, ), + sqil=RunSacredAsTrial( + sacred_ex=imitation.scripts.train_imitation.train_imitation_ex, + command_name="sqil", + suggest_named_configs=lambda _: [], + suggest_config_updates=lambda trial: { + "seed": trial.number, + "demonstrations": { + "n_expert_demos": 100, + "source": "generated", + }, + "sqil": { + "total_timesteps": 1e6, + "train_kwargs": { + + } + }, + "rl": { + "rl_cls": sb3.DQN, + "rl_kwargs": { + "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-2, log=True), + "buffer_size": trial.suggest_int("buffer_size", 1000, 100000), + "learning_starts": trial.suggest_int("learning_starts", 1000, 10000), + "batch_size": trial.suggest_int("batch_size", 32, 128), + "tau": trial.suggest_float("tau", 0., 1.), + "gamma": trial.suggest_float("gamma", 0.9, 0.999), + "train_freq": trial.suggest_int("train_freq", 1, 40), + "gradient_steps": trial.suggest_int("gradient_steps", 1, 10), + "target_update_interval": trial.suggest_int("target_update_interval", 1, 10000), + "exploration_fraction": trial.suggest_float("exploration_fraction", 0.01, 0.5), + "exploration_final_eps": trial.suggest_float("exploration_final_eps", 0.01, 1.0), + "exploration_initial_eps": trial.suggest_float("exploration_initial_eps", 0.01, 0.5), + "max_grad_norm": trial.suggest_float("max_grad_norm", 0.1, 10.0), + + }, + }, + }, + ), ) \ No newline at end of file From d3860a340a0a884e54ea2259b2e6194dbdab190d Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Mon, 26 Feb 2024 17:36:31 +0100 Subject: [PATCH 27/36] Turn DQN into a named config. 
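The rl_kwargs sampled for SQIL above are passed through to the RL learner's constructor, and this patch selects that learner via an rl.dqn named config instead of a hard-coded rl_cls. For orientation only, one sampled configuration roughly corresponds to the stable-baselines3 call below; the environment id and all values are invented, not taken from any trial.

    import stable_baselines3 as sb3

    model = sb3.DQN(
        "MlpPolicy",
        "CartPole-v1",               # example environment, not part of the sweep
        learning_rate=1e-4,
        buffer_size=50_000,
        learning_starts=5_000,
        batch_size=64,
        tau=1.0,
        gamma=0.99,
        train_freq=4,
        gradient_steps=1,
        target_update_interval=1_000,
        exploration_fraction=0.1,
        exploration_initial_eps=1.0,
        exploration_final_eps=0.05,
        max_grad_norm=10.0,
    )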
--- src/imitation/scripts/ingredients/rl.py | 6 ++++++ tuning/hp_search_spaces.py | 6 +----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/imitation/scripts/ingredients/rl.py b/src/imitation/scripts/ingredients/rl.py index d5373c773..9a829aae4 100644 --- a/src/imitation/scripts/ingredients/rl.py +++ b/src/imitation/scripts/ingredients/rl.py @@ -98,6 +98,12 @@ def sac(): locals() # quieten flake8 +@rl_ingredient.named_config +def dqn(): + rl_cls = sb3.DQN + + + def _maybe_add_relabel_buffer( rl_kwargs: Dict[str, Any], relabel_reward_fn: Optional[RewardFn] = None, diff --git a/tuning/hp_search_spaces.py b/tuning/hp_search_spaces.py index 4f5a7ee9d..0dbf18f62 100644 --- a/tuning/hp_search_spaces.py +++ b/tuning/hp_search_spaces.py @@ -125,7 +125,7 @@ def __call__( sqil=RunSacredAsTrial( sacred_ex=imitation.scripts.train_imitation.train_imitation_ex, command_name="sqil", - suggest_named_configs=lambda _: [], + suggest_named_configs=lambda _: ["rl.dqn"], suggest_config_updates=lambda trial: { "seed": trial.number, "demonstrations": { @@ -134,12 +134,8 @@ def __call__( }, "sqil": { "total_timesteps": 1e6, - "train_kwargs": { - - } }, "rl": { - "rl_cls": sb3.DQN, "rl_kwargs": { "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-2, log=True), "buffer_size": trial.suggest_int("buffer_size", 1000, 100000), From 80edc626f0a6a817714e5afb1a4a15a6f10e80fe Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Tue, 27 Feb 2024 17:52:45 +0100 Subject: [PATCH 28/36] Add namedconfigs for all seals envs and set the number of timesteps in namedconfig for SQIL. --- .../scripts/config/train_imitation.py | 30 ++++++++++++++++++- tuning/hp_search_spaces.py | 3 -- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/src/imitation/scripts/config/train_imitation.py b/src/imitation/scripts/config/train_imitation.py index f151e768e..b189895b3 100644 --- a/src/imitation/scripts/config/train_imitation.py +++ b/src/imitation/scripts/config/train_imitation.py @@ -45,6 +45,13 @@ def seals_mountain_car(): environment = dict(gym_id="seals/MountainCar-v0") bc = dict(l2_weight=0.0) dagger = dict(total_timesteps=20000) + sqil = dict(total_timesteps=1e5) + + +@train_imitation_ex.named_config +def seals_ant(): + environment = dict(gym_id="seals/Ant-v1") + sqil = dict(total_timesteps=2e6) @train_imitation_ex.named_config @@ -57,11 +64,13 @@ def cartpole(): def seals_cartpole(): environment = dict(gym_id="seals/CartPole-v0") dagger = dict(total_timesteps=20000) + sqil = dict(total_timesteps=1e5) @train_imitation_ex.named_config def pendulum(): environment = dict(gym_id="Pendulum-v1") + sqil = dict(total_timesteps=1e5) @train_imitation_ex.named_config @@ -76,6 +85,24 @@ def half_cheetah(): dagger = dict(total_timesteps=60000) +@train_imitation_ex.named_config +def seals_half_cheetah(): + environment = dict(gym_id="seals/HalfCheetah-v1") + sqil = dict(total_timesteps=2e6) + + +@train_imitation_ex.named_config +def seals_hopper(): + environment = dict(gym_id="seals/Hopper-v1") + sqil = dict(total_timesteps=2e6) + + +@train_imitation_ex.named_config +def seals_walker(): + environment = dict(gym_id="seals/Walker2d-v1") + sqil = dict(total_timesteps=2e6) + + @train_imitation_ex.named_config def humanoid(): environment = dict(gym_id="Humanoid-v2") @@ -83,7 +110,8 @@ def humanoid(): @train_imitation_ex.named_config def seals_humanoid(): - environment = dict(gym_id="seals/Humanoid-v0") + environment = dict(gym_id="seals/Humanoid-v1") + sqil = dict(total_timesteps=2e6) 
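As a usage sketch, the per-environment configs above compose with the new rl.dqn named config when the sqil command is run programmatically, mirroring what RunSacredAsTrial does during tuning. The particular combination and the demonstration settings below are assumptions for illustration, not a command recorded in this series.

    from imitation.scripts.train_imitation import train_imitation_ex

    run = train_imitation_ex.run(
        command_name="sqil",
        named_configs=["seals_cartpole", "rl.dqn"],
        config_updates={
            "demonstrations": {"n_expert_demos": 100, "source": "generated"},
        },
    )
    print(run.status, run.result)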
@train_imitation_ex.named_config diff --git a/tuning/hp_search_spaces.py b/tuning/hp_search_spaces.py index 0dbf18f62..196397cfa 100644 --- a/tuning/hp_search_spaces.py +++ b/tuning/hp_search_spaces.py @@ -132,9 +132,6 @@ def __call__( "n_expert_demos": 100, "source": "generated", }, - "sqil": { - "total_timesteps": 1e6, - }, "rl": { "rl_kwargs": { "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-2, log=True), From 9bb89c95c23f2e3f60338f6c75ebd7539c87225b Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Tue, 27 Feb 2024 18:02:09 +0100 Subject: [PATCH 29/36] Store command name as user attr in trial. --- tuning/hp_search_spaces.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tuning/hp_search_spaces.py b/tuning/hp_search_spaces.py index 196397cfa..3beed298c 100644 --- a/tuning/hp_search_spaces.py +++ b/tuning/hp_search_spaces.py @@ -64,6 +64,7 @@ def __call__( trial.set_user_attr("config_updates", config_updates) trial.set_user_attr("named_configs", named_configs) + trial.set_user_attr("command_name", self.command_name) experiment: sacred.Experiment = self.sacred_ex result = experiment.run( From 57018d500e6268c90185d894962eb9879ce7af2a Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Tue, 27 Feb 2024 18:14:06 +0100 Subject: [PATCH 30/36] Add hp search space for PC/classic control. --- tuning/hp_search_spaces.py | 41 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tuning/hp_search_spaces.py b/tuning/hp_search_spaces.py index 3beed298c..24b1752f4 100644 --- a/tuning/hp_search_spaces.py +++ b/tuning/hp_search_spaces.py @@ -123,6 +123,47 @@ def __call__( }, }, ), + pc_classic_control=RunSacredAsTrial( + sacred_ex=imitation.scripts.train_preference_comparisons.train_preference_comparisons_ex, + suggest_named_configs=lambda _: ["reward.reward_ensemble"], + suggest_config_updates=lambda trial: { + "seed": trial.number, + "environment": {"num_vec": 8}, + "total_timesteps": 1e6, + "total_comparisons": 1000, + "active_selection": True, + "active_selection_oversampling": trial.suggest_int("active_selection_oversampling", 1, 11), + "comparison_queue_size": trial.suggest_int("comparison_queue_size", 1, 1001), # upper bound determined by total_comparisons=1000 + "exploration_frac": trial.suggest_float("exploration_frac", 0.0, 0.5), + "fragment_length": trial.suggest_int("fragment_length", 1, 1001), # trajectories are 1000 steps long + "gatherer_kwargs": { + "temperature": trial.suggest_float("gatherer_temperature", 0.0, 2.0), + "discount_factor": trial.suggest_float("gatherer_discount_factor", 0.95, 1.0), + "sample": trial.suggest_categorical("gatherer_sample", [True, False]), + }, + "initial_epoch_multiplier": trial.suggest_float("initial_epoch_multiplier", 1, 200.0), + "initial_comparison_frac": trial.suggest_float("initial_comparison_frac", 0.01, 1.0), + "num_iterations": trial.suggest_int("num_iterations", 1, 51), + "preference_model_kwargs": { + "noise_prob": trial.suggest_float("preference_model_noise_prob", 0.0, 0.1), + "discount_factor": trial.suggest_float("preference_model_discount_factor", 0.95, 1.0), + }, + "query_schedule": trial.suggest_categorical("query_schedule", ["hyperbolic", "constant", "inverse_quadratic"]), + "trajectory_generator_kwargs": { + "switch_prob": trial.suggest_float("tr_gen_switch_prob", 0.1, 1), + "random_prob": trial.suggest_float("tr_gen_random_prob", 0.1, 0.9), + }, + "transition_oversampling": trial.suggest_float("transition_oversampling", 0.9, 2.0), + "reward_trainer_kwargs": { + "epochs": 
trial.suggest_int("reward_trainer_epochs", 1, 11), + }, + "rl": { + "rl_kwargs": { + "ent_coef": trial.suggest_float("rl_ent_coef", 1e-7, 1e-3, log=True), + }, + }, + }, + ), sqil=RunSacredAsTrial( sacred_ex=imitation.scripts.train_imitation.train_imitation_ex, command_name="sqil", From b7ca23020cd768addbb3138fabfc62924c578e6c Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Wed, 28 Feb 2024 13:19:32 +0100 Subject: [PATCH 31/36] Fix fragment length for classic control environments. --- tuning/hp_search_spaces.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tuning/hp_search_spaces.py b/tuning/hp_search_spaces.py index 24b1752f4..7af141958 100644 --- a/tuning/hp_search_spaces.py +++ b/tuning/hp_search_spaces.py @@ -135,7 +135,7 @@ def __call__( "active_selection_oversampling": trial.suggest_int("active_selection_oversampling", 1, 11), "comparison_queue_size": trial.suggest_int("comparison_queue_size", 1, 1001), # upper bound determined by total_comparisons=1000 "exploration_frac": trial.suggest_float("exploration_frac", 0.0, 0.5), - "fragment_length": trial.suggest_int("fragment_length", 1, 1001), # trajectories are 1000 steps long + "fragment_length": trial.suggest_int("fragment_length", 1, 201), # trajectories are 1000 steps long "gatherer_kwargs": { "temperature": trial.suggest_float("gatherer_temperature", 0.0, 2.0), "discount_factor": trial.suggest_float("gatherer_discount_factor", 0.95, 1.0), From 288d38d897debd219d6340bff448aa897ea2364b Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Wed, 28 Feb 2024 13:36:27 +0100 Subject: [PATCH 32/36] Add sbatch jobs for SQIL to tune_all_on_slurm. --- tuning/tune_all_on_slurm.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tuning/tune_all_on_slurm.sh b/tuning/tune_all_on_slurm.sh index 19324b3b4..8a8d7a4bd 100644 --- a/tuning/tune_all_on_slurm.sh +++ b/tuning/tune_all_on_slurm.sh @@ -9,4 +9,7 @@ sbatch --job-name=tuning_pc_on_seals_walker tune_on_slurm.sh pc seals_walker sbatch --job-name=tuning_pc_on_seals_humanoid --mem=32gb tune_on_slurm.sh pc seals_humanoid sbatch --job-name=tuning_pc_on_seals_cartpole tune_on_slurm.sh pc seals_cartpole sbatch --job-name=tuning_pc_on_pendulum tune_on_slurm.sh pc pendulum -sbatch --job-name=tuning_pc_on_seals_mountain_car tune_on_slurm.sh pc seals_mountain_car \ No newline at end of file +sbatch --job-name=tuning_pc_on_seals_mountain_car tune_on_slurm.sh pc seals_mountain_car + +sbatch --job-name=tuning_sqil_on_seals_mountain_car tune_on_slurm.sh sqil seals_mountain_car +sbatch --job-name=tuning_sqil_on_seals_cartpole tune_on_slurm.sh sqil seals_cartpole \ No newline at end of file From 2e356b7e88b3601364245d603ccad091349ecf90 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Thu, 29 Feb 2024 19:07:40 +0100 Subject: [PATCH 33/36] Add benchmark analysis notebook. --- tuning/benchmark_analysis.ipynb | 213 ++++++++++++++++++++++++++++++++ 1 file changed, 213 insertions(+) create mode 100644 tuning/benchmark_analysis.ipynb diff --git a/tuning/benchmark_analysis.ipynb b/tuning/benchmark_analysis.ipynb new file mode 100644 index 000000000..016a66f8f --- /dev/null +++ b/tuning/benchmark_analysis.ipynb @@ -0,0 +1,213 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "5c06148d9ff6b57", + "metadata": { + "collapsed": false + }, + "source": [ + "This notebook loads all the optuna studies in the \"tuning\" folder and arranges them in a dataframe. 
It also loads the performance of the best model from the paper and the rerun results.\n", + "\n", + "It can serve as a starting point for further analysis." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31e6f532-15c3-494a-8a3a-de25ecc1ee90", + "metadata": {}, + "outputs": [], + "source": [ + "# Load all the studies into a dataframe\n", + "\n", + "import optuna\n", + "from collections import Counter\n", + "from optuna.trial import TrialState\n", + "import pandas as pd\n", + "import numpy as np\n", + "import datetime\n", + "from pathlib import Path\n", + "\n", + "import imitation.util.sacred_file_parsing as sfp\n", + "\n", + "\n", + "experiment_log_files = list(Path().glob(\"*/*.log\"))\n", + "\n", + "experiment_log_files\n", + "\n", + "raw_study_data = []\n", + "\n", + "for log_file in experiment_log_files:\n", + " d = dict()\n", + " \n", + " d['logfile'] = log_file\n", + " \n", + " study = optuna.load_study(storage=optuna.storages.JournalStorage(\n", + " optuna.storages.JournalFileStorage(str(log_file))\n", + " ),\n", + " # in our case, we have one journal file per study so the study name can be\n", + " # inferred\n", + " study_name=None,\n", + " )\n", + " d['study'] = study\n", + " d['study_name'] = study.study_name\n", + " \n", + " trial_state_counter = Counter(t.state for t in study.trials)\n", + " n_completed_trials = trial_state_counter[TrialState.COMPLETE]\n", + " d['trials'] = n_completed_trials\n", + " d['trials_running'] = Counter(t.state for t in study.trials)[TrialState.RUNNING]\n", + " d['trials_failed'] = Counter(t.state for t in study.trials)[TrialState.FAIL]\n", + " d['all_trials'] = len(study.trials)\n", + " \n", + " if n_completed_trials > 0:\n", + " d['best_value'] = round(study.best_trial.value, 2)\n", + " \n", + " assert \"_\" in study.study_name\n", + " study_segments = study.study_name.split(\"_\") \n", + " assert len(study_segments) > 3\n", + " tuning, algo, with_ = study_segments[:3]\n", + " assert (tuning, with_) == (\"tuning\", \"with\")\n", + " \n", + " d['algo'] = algo\n", + " d['env'] = \"_\".join(study_segments[3:])\n", + " d['best_trial_duration'] = study.best_trial.duration\n", + " d['mean_duration'] = sum([t.duration for t in study.trials if t.state == TrialState.COMPLETE], datetime.timedelta())/n_completed_trials\n", + " \n", + " reruns_folder = log_file.parent / \"reruns\"\n", + " rerun_results = [round(run['result']['imit_stats']['monitor_return_mean'], 2)\n", + " for conf, run in sfp.find_sacred_runs(reruns_folder, only_completed_runs=True)]\n", + " d['rerun_values'] = rerun_results\n", + " \n", + " raw_study_data.append(d)\n", + " \n", + "study_data = pd.DataFrame(raw_study_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b604bc7e-2e61-4f7f-acfe-87b57e8a2f5a", + "metadata": {}, + "outputs": [], + "source": [ + "# Add performance of the best model from the paper\n", + "import pandas as pd\n", + "\n", + "environments = [\n", + " \"seals_ant\",\n", + " \"seals_half_cheetah\",\n", + " \"seals_hopper\",\n", + " \"seals_swimmer\",\n", + " \"seals_walker\",\n", + " \"seals_humanoid\",\n", + " \"seals_cartpole\",\n", + " \"pendulum\",\n", + " \"seals_mountain_car\"\n", + "]\n", + "\n", + "pc_paper_700 = dict(\n", + " seals_ant=200,\n", + " seals_half_cheetah=4700,\n", + " seals_hopper=4500,\n", + " seals_swimmer=170,\n", + " seals_walker=4900,\n", + " seals_humanoid=\"-\",\n", + " seals_cartpole=\"-\",\n", + " pendulum=1300,\n", + " seals_mountain_car=\"-\",\n", + ")\n", + "\n", + "pc_paper_1400 = 
dict(\n", + " seals_ant=100,\n", + " seals_half_cheetah=5600,\n", + " seals_hopper=4500,\n", + " seals_swimmer=175,\n", + " seals_walker=5900,\n", + " seals_humanoid=\"-\",\n", + " seals_cartpole=\"-\",\n", + " pendulum=750,\n", + " seals_mountain_car=\"-\",\n", + ")\n", + "\n", + "rl_paper = dict(\n", + " seals_ant=16,\n", + " seals_half_cheetah=420,\n", + " seals_hopper=4210,\n", + " seals_swimmer=175,\n", + " seals_walker=5370,\n", + " seals_humanoid=\"-\",\n", + " seals_cartpole=\"-\",\n", + " pendulum=1300,\n", + " seals_mountain_car=\"-\",\n", + ")\n", + "\n", + "rl_ours = dict(\n", + " seals_ant=3034,\n", + " seals_half_cheetah=1675.76,\n", + " seals_hopper=203.45,\n", + " seals_swimmer=292.84,\n", + " seals_walker=2465.56,\n", + " seals_humanoid=3224.12,\n", + " seals_cartpole=500.00,\n", + " pendulum=-189.25,\n", + " seals_mountain_car=-97.00,\n", + ")\n", + "\n", + "for algo, values_by_env in dict(\n", + " pc_paper_700=pc_paper_700,\n", + " pc_paper_1400=pc_paper_1400,\n", + " rl_paper=rl_paper,\n", + " rl_ours=rl_ours,\n", + ").items():\n", + " for env, value in values_by_env.items():\n", + " if value == \"-\":\n", + " continue\n", + " raw_study_data.append(dict(\n", + " algo=algo,\n", + " env=env,\n", + " best_value=value,\n", + " ))\n", + " \n", + "study_data = pd.DataFrame(raw_study_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e9ae5ca-5002-411b-beaf-cb98eb12f54c", + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import display\n", + "\n", + "print(\"Benchmark Data\")\n", + "display(study_data[[\"algo\", \"env\", \"best_value\"]])\n", + "\n", + "print(\"Rerun Data\")\n", + "display(study_data[[\"algo\", \"env\", \"best_value\", \"rerun_values\"]][study_data[\"rerun_values\"].map(np.std) > 0])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From f4bcbdce9f606f2bd7148b1d749ea785fe4ee88d Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Thu, 29 Feb 2024 19:17:58 +0100 Subject: [PATCH 34/36] Some formatting fixes. --- benchmarking/README.md | 2 +- .../algorithms/preference_comparisons.py | 4 +- src/imitation/scripts/config/tuning.py | 12 +- src/imitation/scripts/ingredients/rl.py | 1 - tuning/README.md | 8 +- tuning/benchmark_analysis.ipynb | 83 +++++++----- tuning/hp_search_spaces.py | 127 ++++++++++++------ tuning/rerun_best_trial.py | 24 ++-- tuning/tune.py | 14 +- tuning/tune_all_on_slurm.sh | 2 +- tuning/tune_on_slurm.sh | 2 +- 11 files changed, 173 insertions(+), 106 deletions(-) diff --git a/benchmarking/README.md b/benchmarking/README.md index 4eac8c904..2539597f7 100644 --- a/benchmarking/README.md +++ b/benchmarking/README.md @@ -185,4 +185,4 @@ where: - `algo` is the algorithm you want to compare against If `your_runs_dir` contains runs for more than one algorithm, you will have to -disambiguate using the `--algo` option. \ No newline at end of file +disambiguate using the `--algo` option. 
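Once the benchmark_analysis notebook above has been executed, its study_data frame lends itself to quick side-by-side views. The snippet below is only a sketch that reuses the column names constructed in the notebook; it assumes study_data is in scope and that each (env, algo) pair occurs once.

    import numpy as np

    # Best value per environment and algorithm (tuned studies plus paper baselines).
    best_by_env = study_data.drop_duplicates(["env", "algo"]).pivot(
        index="env", columns="algo", values="best_value"
    )
    print(best_by_env)

    # Mean return of the re-runs, for rows where rerun results exist.
    with_reruns = study_data.dropna(subset=["rerun_values"]).copy()
    with_reruns["rerun_mean"] = with_reruns["rerun_values"].map(np.mean)
    print(with_reruns[["algo", "env", "best_value", "rerun_mean"]])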
diff --git a/src/imitation/algorithms/preference_comparisons.py b/src/imitation/algorithms/preference_comparisons.py index 1b0a2b01b..ba9a49b40 100644 --- a/src/imitation/algorithms/preference_comparisons.py +++ b/src/imitation/algorithms/preference_comparisons.py @@ -1678,7 +1678,9 @@ def train( unnormalized_probs = vec_schedule(np.linspace(0, 1, self.num_iterations)) probs = unnormalized_probs / np.sum(unnormalized_probs) shares = util.oric(probs * total_comparisons) - shares[shares <= 0] = 1 # ensure we at least request one comparison per iteration + shares[ + shares <= 0 + ] = 1 # ensure we at least request one comparison per iteration schedule = [initial_comparisons] + shares.tolist() print(f"Query schedule: {schedule}") diff --git a/src/imitation/scripts/config/tuning.py b/src/imitation/scripts/config/tuning.py index daed7c1a0..faf0517cb 100644 --- a/src/imitation/scripts/config/tuning.py +++ b/src/imitation/scripts/config/tuning.py @@ -199,9 +199,13 @@ def pc(): "named_configs": ["reward.reward_ensemble"], "config_updates": { "active_selection_oversampling": tune.randint(1, 11), - "comparison_queue_size": tune.randint(1, 1001), # upper bound determined by total_comparisons=1000 + "comparison_queue_size": tune.randint( + 1, 1001 + ), # upper bound determined by total_comparisons=1000 "exploration_frac": tune.uniform(0.0, 0.5), - "fragment_length": tune.randint(1, 1001), # trajectories are 1000 steps long + "fragment_length": tune.randint( + 1, 1001 + ), # trajectories are 1000 steps long "gatherer_kwargs": { "temperature": tune.uniform(0.0, 2.0), "discount_factor": tune.uniform(0.95, 1.0), @@ -213,7 +217,9 @@ def pc(): "noise_prob": tune.uniform(0.0, 0.1), "discount_factor": tune.uniform(0.95, 1.0), }, - "query_schedule": tune.choice(["hyperbolic", "constant", "inverse_quadratic"]), + "query_schedule": tune.choice( + ["hyperbolic", "constant", "inverse_quadratic"] + ), "trajectory_generator_kwargs": { "switch_prob": tune.uniform(0.1, 1), "random_prob": tune.uniform(0.1, 0.9), diff --git a/src/imitation/scripts/ingredients/rl.py b/src/imitation/scripts/ingredients/rl.py index 9a829aae4..bf43a129f 100644 --- a/src/imitation/scripts/ingredients/rl.py +++ b/src/imitation/scripts/ingredients/rl.py @@ -103,7 +103,6 @@ def dqn(): rl_cls = sb3.DQN - def _maybe_add_relabel_buffer( rl_kwargs: Dict[str, Any], relabel_reward_fn: Optional[RewardFn] = None, diff --git a/tuning/README.md b/tuning/README.md index f49e54935..dee25da45 100644 --- a/tuning/README.md +++ b/tuning/README.md @@ -7,13 +7,13 @@ If you want to specify a custom algorithm and search space, add it to the dict i You can tune using multiple workers in parallel by running multiple instances of `tune.py` that all point to the same journal log file (see `tune.py --help` for details). To easily launch multiple workers on a SLURM cluster and ensure they don't conflict with each other, -use the `tune_on_slurm.py` script. +use the `tune_on_slurm.py` script. This script will launch a SLURM job array with the specified number of workers. If you want to tune all algorithms on all environments on SLURM, use `tune_all_on_slurm.sh`. # Legacy Tuning Scripts -Note: There are some legacy tuning scripts that can be used like this: +Note: There are some legacy tuning scripts that can be used like this: The hyperparameters of any algorithm in imitation can be tuned using `src/imitation/scripts/tuning.py`. 
The benchmarking hyperparameter configs were generated by tuning the hyperparameters using @@ -21,8 +21,8 @@ the search space defined in the `scripts/config/tuning.py`. The tuning script proceeds in two phases: 1. Tune the hyperparameters using the search space provided. -2. Re-evaluate the best hyperparameter config found in the first phase - based on the maximum mean return on a separate set of seeds. +2. Re-evaluate the best hyperparameter config found in the first phase + based on the maximum mean return on a separate set of seeds. Report the mean and standard deviation of these trials. To use it with the default search space: diff --git a/tuning/benchmark_analysis.ipynb b/tuning/benchmark_analysis.ipynb index 016a66f8f..a76c39e0d 100644 --- a/tuning/benchmark_analysis.ipynb +++ b/tuning/benchmark_analysis.ipynb @@ -40,47 +40,56 @@ "\n", "for log_file in experiment_log_files:\n", " d = dict()\n", - " \n", - " d['logfile'] = log_file\n", - " \n", - " study = optuna.load_study(storage=optuna.storages.JournalStorage(\n", + "\n", + " d[\"logfile\"] = log_file\n", + "\n", + " study = optuna.load_study(\n", + " storage=optuna.storages.JournalStorage(\n", " optuna.storages.JournalFileStorage(str(log_file))\n", " ),\n", " # in our case, we have one journal file per study so the study name can be\n", " # inferred\n", " study_name=None,\n", " )\n", - " d['study'] = study\n", - " d['study_name'] = study.study_name\n", - " \n", + " d[\"study\"] = study\n", + " d[\"study_name\"] = study.study_name\n", + "\n", " trial_state_counter = Counter(t.state for t in study.trials)\n", " n_completed_trials = trial_state_counter[TrialState.COMPLETE]\n", - " d['trials'] = n_completed_trials\n", - " d['trials_running'] = Counter(t.state for t in study.trials)[TrialState.RUNNING]\n", - " d['trials_failed'] = Counter(t.state for t in study.trials)[TrialState.FAIL]\n", - " d['all_trials'] = len(study.trials)\n", - " \n", + " d[\"trials\"] = n_completed_trials\n", + " d[\"trials_running\"] = Counter(t.state for t in study.trials)[TrialState.RUNNING]\n", + " d[\"trials_failed\"] = Counter(t.state for t in study.trials)[TrialState.FAIL]\n", + " d[\"all_trials\"] = len(study.trials)\n", + "\n", " if n_completed_trials > 0:\n", - " d['best_value'] = round(study.best_trial.value, 2)\n", - " \n", + " d[\"best_value\"] = round(study.best_trial.value, 2)\n", + "\n", " assert \"_\" in study.study_name\n", - " study_segments = study.study_name.split(\"_\") \n", + " study_segments = study.study_name.split(\"_\")\n", " assert len(study_segments) > 3\n", " tuning, algo, with_ = study_segments[:3]\n", " assert (tuning, with_) == (\"tuning\", \"with\")\n", - " \n", - " d['algo'] = algo\n", - " d['env'] = \"_\".join(study_segments[3:])\n", - " d['best_trial_duration'] = study.best_trial.duration\n", - " d['mean_duration'] = sum([t.duration for t in study.trials if t.state == TrialState.COMPLETE], datetime.timedelta())/n_completed_trials\n", - " \n", + "\n", + " d[\"algo\"] = algo\n", + " d[\"env\"] = \"_\".join(study_segments[3:])\n", + " d[\"best_trial_duration\"] = study.best_trial.duration\n", + " d[\"mean_duration\"] = (\n", + " sum(\n", + " [t.duration for t in study.trials if t.state == TrialState.COMPLETE],\n", + " datetime.timedelta(),\n", + " )\n", + " / n_completed_trials\n", + " )\n", + "\n", " reruns_folder = log_file.parent / \"reruns\"\n", - " rerun_results = [round(run['result']['imit_stats']['monitor_return_mean'], 2)\n", - " for conf, run in sfp.find_sacred_runs(reruns_folder, only_completed_runs=True)]\n", - " 
d['rerun_values'] = rerun_results\n", - " \n", + " rerun_results = [\n", + " round(run[\"result\"][\"imit_stats\"][\"monitor_return_mean\"], 2)\n", + " for conf, run in sfp.find_sacred_runs(reruns_folder, only_completed_runs=True)\n", + " ]\n", + " d[\"rerun_values\"] = rerun_results\n", + "\n", " raw_study_data.append(d)\n", - " \n", + "\n", "study_data = pd.DataFrame(raw_study_data)" ] }, @@ -103,7 +112,7 @@ " \"seals_humanoid\",\n", " \"seals_cartpole\",\n", " \"pendulum\",\n", - " \"seals_mountain_car\"\n", + " \"seals_mountain_car\",\n", "]\n", "\n", "pc_paper_700 = dict(\n", @@ -163,12 +172,14 @@ " for env, value in values_by_env.items():\n", " if value == \"-\":\n", " continue\n", - " raw_study_data.append(dict(\n", - " algo=algo,\n", - " env=env,\n", - " best_value=value,\n", - " ))\n", - " \n", + " raw_study_data.append(\n", + " dict(\n", + " algo=algo,\n", + " env=env,\n", + " best_value=value,\n", + " )\n", + " )\n", + "\n", "study_data = pd.DataFrame(raw_study_data)" ] }, @@ -185,7 +196,11 @@ "display(study_data[[\"algo\", \"env\", \"best_value\"]])\n", "\n", "print(\"Rerun Data\")\n", - "display(study_data[[\"algo\", \"env\", \"best_value\", \"rerun_values\"]][study_data[\"rerun_values\"].map(np.std) > 0])" + "display(\n", + " study_data[[\"algo\", \"env\", \"best_value\", \"rerun_values\"]][\n", + " study_data[\"rerun_values\"].map(np.std) > 0\n", + " ]\n", + ")" ] } ], diff --git a/tuning/hp_search_spaces.py b/tuning/hp_search_spaces.py index 7af141958..85da0e3f7 100644 --- a/tuning/hp_search_spaces.py +++ b/tuning/hp_search_spaces.py @@ -14,7 +14,7 @@ """ import dataclasses -from typing import Callable, List, Mapping, Any, Dict +from typing import Any, Callable, Dict, List, Mapping import optuna import sacred @@ -35,7 +35,6 @@ class RunSacredAsTrial: """The sacred experiment to run.""" sacred_ex: sacred.Experiment - """A function that returns a list of named configs to pass to sacred.run.""" suggest_named_configs: Callable[[optuna.Trial], List[str]] @@ -46,10 +45,7 @@ class RunSacredAsTrial: command_name: str = None def __call__( - self, - trial: optuna.Trial, - run_options: Dict, - extra_named_configs: List[str] + self, trial: optuna.Trial, run_options: Dict, extra_named_configs: List[str] ) -> float: """Run the sacred experiment and return the performance. @@ -77,7 +73,7 @@ def __call__( raise RuntimeError( f"Trial failed with {result.fail_trace()} and status {result.status}." 
) - return result.result['imit_stats']['monitor_return_mean'] + return result.result["imit_stats"]["monitor_return_mean"] """A mapping from algorithm names to functions that run the algorithm as an optuna trial.""" @@ -91,34 +87,56 @@ def __call__( "total_timesteps": 2e7, "total_comparisons": 1000, "active_selection": True, - "active_selection_oversampling": trial.suggest_int("active_selection_oversampling", 1, 11), - "comparison_queue_size": trial.suggest_int("comparison_queue_size", 1, 1001), # upper bound determined by total_comparisons=1000 + "active_selection_oversampling": trial.suggest_int( + "active_selection_oversampling", 1, 11 + ), + "comparison_queue_size": trial.suggest_int( + "comparison_queue_size", 1, 1001 + ), # upper bound determined by total_comparisons=1000 "exploration_frac": trial.suggest_float("exploration_frac", 0.0, 0.5), - "fragment_length": trial.suggest_int("fragment_length", 1, 1001), # trajectories are 1000 steps long + "fragment_length": trial.suggest_int( + "fragment_length", 1, 1001 + ), # trajectories are 1000 steps long "gatherer_kwargs": { "temperature": trial.suggest_float("gatherer_temperature", 0.0, 2.0), - "discount_factor": trial.suggest_float("gatherer_discount_factor", 0.95, 1.0), + "discount_factor": trial.suggest_float( + "gatherer_discount_factor", 0.95, 1.0 + ), "sample": trial.suggest_categorical("gatherer_sample", [True, False]), }, - "initial_epoch_multiplier": trial.suggest_float("initial_epoch_multiplier", 1, 200.0), - "initial_comparison_frac": trial.suggest_float("initial_comparison_frac", 0.01, 1.0), + "initial_epoch_multiplier": trial.suggest_float( + "initial_epoch_multiplier", 1, 200.0 + ), + "initial_comparison_frac": trial.suggest_float( + "initial_comparison_frac", 0.01, 1.0 + ), "num_iterations": trial.suggest_int("num_iterations", 1, 51), "preference_model_kwargs": { - "noise_prob": trial.suggest_float("preference_model_noise_prob", 0.0, 0.1), - "discount_factor": trial.suggest_float("preference_model_discount_factor", 0.95, 1.0), + "noise_prob": trial.suggest_float( + "preference_model_noise_prob", 0.0, 0.1 + ), + "discount_factor": trial.suggest_float( + "preference_model_discount_factor", 0.95, 1.0 + ), }, - "query_schedule": trial.suggest_categorical("query_schedule", ["hyperbolic", "constant", "inverse_quadratic"]), + "query_schedule": trial.suggest_categorical( + "query_schedule", ["hyperbolic", "constant", "inverse_quadratic"] + ), "trajectory_generator_kwargs": { "switch_prob": trial.suggest_float("tr_gen_switch_prob", 0.1, 1), "random_prob": trial.suggest_float("tr_gen_random_prob", 0.1, 0.9), }, - "transition_oversampling": trial.suggest_float("transition_oversampling", 0.9, 2.0), + "transition_oversampling": trial.suggest_float( + "transition_oversampling", 0.9, 2.0 + ), "reward_trainer_kwargs": { "epochs": trial.suggest_int("reward_trainer_epochs", 1, 11), }, "rl": { "rl_kwargs": { - "ent_coef": trial.suggest_float("rl_ent_coef", 1e-7, 1e-3, log=True), + "ent_coef": trial.suggest_float( + "rl_ent_coef", 1e-7, 1e-3, log=True + ), }, }, }, @@ -132,34 +150,56 @@ def __call__( "total_timesteps": 1e6, "total_comparisons": 1000, "active_selection": True, - "active_selection_oversampling": trial.suggest_int("active_selection_oversampling", 1, 11), - "comparison_queue_size": trial.suggest_int("comparison_queue_size", 1, 1001), # upper bound determined by total_comparisons=1000 + "active_selection_oversampling": trial.suggest_int( + "active_selection_oversampling", 1, 11 + ), + "comparison_queue_size": trial.suggest_int( + 
"comparison_queue_size", 1, 1001 + ), # upper bound determined by total_comparisons=1000 "exploration_frac": trial.suggest_float("exploration_frac", 0.0, 0.5), - "fragment_length": trial.suggest_int("fragment_length", 1, 201), # trajectories are 1000 steps long + "fragment_length": trial.suggest_int( + "fragment_length", 1, 201 + ), # trajectories are 1000 steps long "gatherer_kwargs": { "temperature": trial.suggest_float("gatherer_temperature", 0.0, 2.0), - "discount_factor": trial.suggest_float("gatherer_discount_factor", 0.95, 1.0), + "discount_factor": trial.suggest_float( + "gatherer_discount_factor", 0.95, 1.0 + ), "sample": trial.suggest_categorical("gatherer_sample", [True, False]), }, - "initial_epoch_multiplier": trial.suggest_float("initial_epoch_multiplier", 1, 200.0), - "initial_comparison_frac": trial.suggest_float("initial_comparison_frac", 0.01, 1.0), + "initial_epoch_multiplier": trial.suggest_float( + "initial_epoch_multiplier", 1, 200.0 + ), + "initial_comparison_frac": trial.suggest_float( + "initial_comparison_frac", 0.01, 1.0 + ), "num_iterations": trial.suggest_int("num_iterations", 1, 51), "preference_model_kwargs": { - "noise_prob": trial.suggest_float("preference_model_noise_prob", 0.0, 0.1), - "discount_factor": trial.suggest_float("preference_model_discount_factor", 0.95, 1.0), + "noise_prob": trial.suggest_float( + "preference_model_noise_prob", 0.0, 0.1 + ), + "discount_factor": trial.suggest_float( + "preference_model_discount_factor", 0.95, 1.0 + ), }, - "query_schedule": trial.suggest_categorical("query_schedule", ["hyperbolic", "constant", "inverse_quadratic"]), + "query_schedule": trial.suggest_categorical( + "query_schedule", ["hyperbolic", "constant", "inverse_quadratic"] + ), "trajectory_generator_kwargs": { "switch_prob": trial.suggest_float("tr_gen_switch_prob", 0.1, 1), "random_prob": trial.suggest_float("tr_gen_random_prob", 0.1, 0.9), }, - "transition_oversampling": trial.suggest_float("transition_oversampling", 0.9, 2.0), + "transition_oversampling": trial.suggest_float( + "transition_oversampling", 0.9, 2.0 + ), "reward_trainer_kwargs": { "epochs": trial.suggest_int("reward_trainer_epochs", 1, 11), }, "rl": { "rl_kwargs": { - "ent_coef": trial.suggest_float("rl_ent_coef", 1e-7, 1e-3, log=True), + "ent_coef": trial.suggest_float( + "rl_ent_coef", 1e-7, 1e-3, log=True + ), }, }, }, @@ -176,22 +216,33 @@ def __call__( }, "rl": { "rl_kwargs": { - "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-2, log=True), + "learning_rate": trial.suggest_float( + "learning_rate", 1e-6, 1e-2, log=True + ), "buffer_size": trial.suggest_int("buffer_size", 1000, 100000), - "learning_starts": trial.suggest_int("learning_starts", 1000, 10000), + "learning_starts": trial.suggest_int( + "learning_starts", 1000, 10000 + ), "batch_size": trial.suggest_int("batch_size", 32, 128), - "tau": trial.suggest_float("tau", 0., 1.), + "tau": trial.suggest_float("tau", 0.0, 1.0), "gamma": trial.suggest_float("gamma", 0.9, 0.999), "train_freq": trial.suggest_int("train_freq", 1, 40), "gradient_steps": trial.suggest_int("gradient_steps", 1, 10), - "target_update_interval": trial.suggest_int("target_update_interval", 1, 10000), - "exploration_fraction": trial.suggest_float("exploration_fraction", 0.01, 0.5), - "exploration_final_eps": trial.suggest_float("exploration_final_eps", 0.01, 1.0), - "exploration_initial_eps": trial.suggest_float("exploration_initial_eps", 0.01, 0.5), + "target_update_interval": trial.suggest_int( + "target_update_interval", 1, 10000 + ), + 
"exploration_fraction": trial.suggest_float( + "exploration_fraction", 0.01, 0.5 + ), + "exploration_final_eps": trial.suggest_float( + "exploration_final_eps", 0.01, 1.0 + ), + "exploration_initial_eps": trial.suggest_float( + "exploration_initial_eps", 0.01, 0.5 + ), "max_grad_norm": trial.suggest_float("max_grad_norm", 0.1, 10.0), - }, }, }, ), -) \ No newline at end of file +) diff --git a/tuning/rerun_best_trial.py b/tuning/rerun_best_trial.py index ed269c30f..d18ffb8ad 100644 --- a/tuning/rerun_best_trial.py +++ b/tuning/rerun_best_trial.py @@ -3,18 +3,15 @@ import random from typing import List, Tuple +import hp_search_spaces import optuna import sacred -import hp_search_spaces - def make_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser( - description= - "Re-run the best trial from a previous tuning run.", - epilog=f"Example usage:\n" - f"python rerun_best_trials.py tuning_run.json\n", + description="Re-run the best trial from a previous tuning run.", + epilog=f"Example usage:\n" f"python rerun_best_trials.py tuning_run.json\n", formatter_class=argparse.RawDescriptionHelpFormatter, ) parser.add_argument( @@ -23,18 +20,18 @@ def make_parser() -> argparse.ArgumentParser: default=None, choices=hp_search_spaces.objectives_by_algo.keys(), help="The algorithm that has been tuned. " - "Can usually be deduced from the study name.", + "Can usually be deduced from the study name.", ) parser.add_argument( "journal_log", type=str, - help="The optuna journal file of the previous tuning run." + help="The optuna journal file of the previous tuning run.", ) parser.add_argument( "--seed", type=int, default=random.randint(0, 2**32 - 1), - help="The seed to use for the re-run. A random seed is used by default." + help="The seed to use for the re-run. A random seed is used by default.", ) return parser @@ -46,7 +43,7 @@ def infer_algo_name(study: optuna.Study) -> str: """ assert study.study_name.startswith("tuning_") assert "_with_" in study.study_name - return study.study_name[len("tuning_"):].split("_with_")[0] + return study.study_name[len("tuning_") :].split("_with_")[0] def main(): @@ -63,7 +60,9 @@ def main(): trial = study.best_trial algo_name = args.algo or infer_algo_name(study) - sacred_experiment: sacred.Experiment = hp_search_spaces.objectives_by_algo[algo_name].sacred_ex + sacred_experiment: sacred.Experiment = hp_search_spaces.objectives_by_algo[ + algo_name + ].sacred_ex config_updates = trial.user_attrs["config_updates"].copy() config_updates["seed"] = args.seed @@ -71,7 +70,6 @@ def main(): config_updates=config_updates, named_configs=trial.user_attrs["named_configs"], options={"--name": study.study_name, "--file_storage": "sacred"}, - ) if result.status != "COMPLETED": raise RuntimeError( @@ -79,5 +77,5 @@ def main(): ) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/tuning/tune.py b/tuning/tune.py index 76b9076c8..7a77d400d 100644 --- a/tuning/tune.py +++ b/tuning/tune.py @@ -2,7 +2,6 @@ import argparse import optuna - from hp_search_spaces import objectives_by_algo @@ -31,13 +30,10 @@ def make_parser() -> argparse.ArgumentParser: nargs="+", default=[], help="Additional named configs to pass to the sacred experiment. " - "Use this to select the environment to tune on.", + "Use this to select the environment to tune on.", ) parser.add_argument( - "--num_trials", - type=int, - default=100, - help="Number of trials to run." + "--num_trials", type=int, default=100, help="Number of trials to run." 
) parser.add_argument( "-j", @@ -45,7 +41,7 @@ def make_parser() -> argparse.ArgumentParser: type=str, default=None, help="A journal file to synchronize multiple instances of this script. " - "Works on NFS storage." + "Works on NFS storage.", ) return parser @@ -75,12 +71,12 @@ def main(): lambda trial: objectives_by_algo[args.algo]( trial, run_options={"--name": study.study_name, "--file_storage": "sacred"}, - extra_named_configs=args.named_configs + extra_named_configs=args.named_configs, ), callbacks=[optuna.study.MaxTrialsCallback(args.num_trials)], gc_after_trial=True, ) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/tuning/tune_all_on_slurm.sh b/tuning/tune_all_on_slurm.sh index 8a8d7a4bd..ce25231b1 100644 --- a/tuning/tune_all_on_slurm.sh +++ b/tuning/tune_all_on_slurm.sh @@ -12,4 +12,4 @@ sbatch --job-name=tuning_pc_on_pendulum tune_on_slurm.sh pc pendulum sbatch --job-name=tuning_pc_on_seals_mountain_car tune_on_slurm.sh pc seals_mountain_car sbatch --job-name=tuning_sqil_on_seals_mountain_car tune_on_slurm.sh sqil seals_mountain_car -sbatch --job-name=tuning_sqil_on_seals_cartpole tune_on_slurm.sh sqil seals_cartpole \ No newline at end of file +sbatch --job-name=tuning_sqil_on_seals_cartpole tune_on_slurm.sh sqil seals_cartpole diff --git a/tuning/tune_on_slurm.sh b/tuning/tune_on_slurm.sh index 914f450af..72472b081 100644 --- a/tuning/tune_on_slurm.sh +++ b/tuning/tune_on_slurm.sh @@ -72,4 +72,4 @@ fi cd "$SLURM_JOB_NAME/$SLURM_ARRAY_TASK_ID" || exit -srun --output=cout.txt python ../../tune.py --num_trials 400 -j ../optuna_study.log "$1" "$2" \ No newline at end of file +srun --output=cout.txt python ../../tune.py --num_trials 400 -j ../optuna_study.log "$1" "$2" From 60fc75a96ea41e8fef521dd42e005dc97aba3178 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Thu, 29 Feb 2024 19:33:20 +0100 Subject: [PATCH 35/36] Shellcheck fixes. --- tuning/rerun_on_slurm.sh | 2 +- tuning/tune_on_slurm.sh | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tuning/rerun_on_slurm.sh b/tuning/rerun_on_slurm.sh index d1e59df69..c8c32a8e4 100644 --- a/tuning/rerun_on_slurm.sh +++ b/tuning/rerun_on_slurm.sh @@ -26,7 +26,7 @@ # /reruns/ # The output of each worker is written to a cout.txt. - +# shellcheck disable=SC1090 source "/nas/ucb/$(whoami)/imitation/venv/bin/activate" worker_dir="$SLURM_JOB_NAME/reruns/$SLURM_ARRAY_TASK_ID/" diff --git a/tuning/tune_on_slurm.sh b/tuning/tune_on_slurm.sh index 72472b081..004228c59 100644 --- a/tuning/tune_on_slurm.sh +++ b/tuning/tune_on_slurm.sh @@ -54,6 +54,7 @@ # with the --array=101-150 (for another 50 workers). For this you do not need to modify # this file. You can pass it to sbatch to override. +# shellcheck disable=SC1090 source "/nas/ucb/$(whoami)/imitation/venv/bin/activate" if [ -f "$SLURM_JOB_NAME/$SLURM_ARRAY_TASK_ID/cout.txt" ]; then @@ -62,7 +63,7 @@ if [ -f "$SLURM_JOB_NAME/$SLURM_ARRAY_TASK_ID/cout.txt" ]; then # so it is not very useful information. echo "The study folder for $SLURM_JOB_NAME already contains a folder for job $SLURM_ARRAY_TASK_ID!" echo "Are you trying to continue on an existing study? Then adapt the sbatch array range!" - echo "E.g. if the highest folder number in $SLURM_JOB_NAME/ is 100 and you want to continue the study with another 50 runners, start this script using `sbatch --job-name=$SLURM_JOB_NAME --array=101-50 tune_on_slurm.sh $1 $2`" + echo "E.g. 
if the highest folder number in \"$SLURM_JOB_NAME/\" is 100 and you want to continue the study with another 50 runners, start this script using \"sbatch --job-name=$SLURM_JOB_NAME --array=101-150 tune_on_slurm.sh $1 $2\"" exit 1 else # Note: we run each worker in a separate working directory to avoid race From e69dbd64c3c1595ccb9f2935f0557520bf418369 Mon Sep 17 00:00:00 2001 From: Maximilian Ernestus Date: Thu, 29 Feb 2024 20:01:39 +0100 Subject: [PATCH 36/36] More formatting fixes. --- src/imitation/scripts/config/tuning.py | 7 +- tuning/hp_search_spaces.py | 149 ++++++++++++++++++------- tuning/rerun_best_trial.py | 13 ++- tuning/tune.py | 10 +- 4 files changed, 130 insertions(+), 49 deletions(-) diff --git a/src/imitation/scripts/config/tuning.py b/src/imitation/scripts/config/tuning.py index faf0517cb..22d1d82fb 100644 --- a/src/imitation/scripts/config/tuning.py +++ b/src/imitation/scripts/config/tuning.py @@ -2,7 +2,6 @@ import ray.tune as tune import sacred -from torch import nn from imitation.algorithms import dagger as dagger_alg from imitation.scripts.parallel import parallel_ex @@ -200,11 +199,11 @@ def pc(): "config_updates": { "active_selection_oversampling": tune.randint(1, 11), "comparison_queue_size": tune.randint( - 1, 1001 + 1, 1001, ), # upper bound determined by total_comparisons=1000 "exploration_frac": tune.uniform(0.0, 0.5), "fragment_length": tune.randint( - 1, 1001 + 1, 1001, ), # trajectories are 1000 steps long "gatherer_kwargs": { "temperature": tune.uniform(0.0, 2.0), @@ -218,7 +217,7 @@ def pc(): "discount_factor": tune.uniform(0.95, 1.0), }, "query_schedule": tune.choice( - ["hyperbolic", "constant", "inverse_quadratic"] + ["hyperbolic", "constant", "inverse_quadratic",] ), "trajectory_generator_kwargs": { "switch_prob": tune.uniform(0.1, 1), diff --git a/tuning/hp_search_spaces.py b/tuning/hp_search_spaces.py index 85da0e3f7..5a4e7db1d 100644 --- a/tuning/hp_search_spaces.py +++ b/tuning/hp_search_spaces.py @@ -14,14 +14,13 @@ """ import dataclasses -from typing import Any, Callable, Dict, List, Mapping +from typing import Any, Callable, Dict, List, Mapping, Optional import optuna import sacred -import stable_baselines3 as sb3 import imitation.scripts.train_imitation -import imitation.scripts.train_preference_comparisons +import imitation.scripts.train_preference_comparisons as train_pc_script @dataclasses.dataclass @@ -42,10 +41,13 @@ class RunSacredAsTrial: suggest_config_updates: Callable[[optuna.Trial], Mapping[str, Any]] """Command name to pass to sacred.run.""" - command_name: str = None + command_name: Optional[str] = None def __call__( - self, trial: optuna.Trial, run_options: Dict, extra_named_configs: List[str] + self, + trial: optuna.Trial, + run_options: Dict, + extra_named_configs: List[str], ) -> float: """Run the sacred experiment and return the performance. @@ -53,8 +55,13 @@ def __call__( trial: The optuna trial to sample hyperparameters for. run_options: Options to pass to sacred.run(options=). extra_named_configs: Additional named configs to pass to sacred.run. - """ + Returns: + The performance of the trial. + + Raises: + RuntimeError: If the trial fails. + """ config_updates = self.suggest_config_updates(trial) named_configs = self.suggest_named_configs(trial) + extra_named_configs @@ -71,15 +78,16 @@ def __call__( ) if result.status != "COMPLETED": raise RuntimeError( - f"Trial failed with {result.fail_trace()} and status {result.status}." 
diff --git a/tuning/hp_search_spaces.py b/tuning/hp_search_spaces.py
index 85da0e3f7..5a4e7db1d 100644
--- a/tuning/hp_search_spaces.py
+++ b/tuning/hp_search_spaces.py
@@ -14,14 +14,13 @@
 """
 import dataclasses
-from typing import Any, Callable, Dict, List, Mapping
+from typing import Any, Callable, Dict, List, Mapping, Optional
 
 import optuna
 import sacred
-import stable_baselines3 as sb3
 
 import imitation.scripts.train_imitation
-import imitation.scripts.train_preference_comparisons
+import imitation.scripts.train_preference_comparisons as train_pc_script
 
 
 @dataclasses.dataclass
@@ -42,10 +41,13 @@ class RunSacredAsTrial:
     suggest_config_updates: Callable[[optuna.Trial], Mapping[str, Any]]
 
     """Command name to pass to sacred.run."""
-    command_name: str = None
+    command_name: Optional[str] = None
 
     def __call__(
-        self, trial: optuna.Trial, run_options: Dict, extra_named_configs: List[str]
+        self,
+        trial: optuna.Trial,
+        run_options: Dict,
+        extra_named_configs: List[str],
     ) -> float:
         """Run the sacred experiment and return the performance.
 
         Args:
             trial: The optuna trial to sample hyperparameters for.
             run_options: Options to pass to sacred.run(options=).
             extra_named_configs: Additional named configs to pass to sacred.run.
-        """
+
+        Returns:
+            The performance of the trial.
+
+        Raises:
+            RuntimeError: If the trial fails.
+        """
         config_updates = self.suggest_config_updates(trial)
         named_configs = self.suggest_named_configs(trial) + extra_named_configs
@@ -71,15 +78,16 @@ def __call__(
         )
         if result.status != "COMPLETED":
             raise RuntimeError(
-                f"Trial failed with {result.fail_trace()} and status {result.status}."
+                f"Trial failed with {result.fail_trace()} and status {result.status}.",
             )
 
         return result.result["imit_stats"]["monitor_return_mean"]
 
-"""A mapping from algorithm names to functions that run the algorithm as an optuna trial."""
+"""A mapping from algorithm names to functions that run the algorithm as an optuna
+trial."""
 objectives_by_algo = dict(
     pc=RunSacredAsTrial(
-        sacred_ex=imitation.scripts.train_preference_comparisons.train_preference_comparisons_ex,
+        sacred_ex=train_pc_script.train_preference_comparisons_ex,
         suggest_named_configs=lambda _: ["reward.reward_ensemble"],
         suggest_config_updates=lambda trial: {
             "seed": trial.number,
@@ -88,46 +96,69 @@ def __call__(
             "total_comparisons": 1000,
             "active_selection": True,
             "active_selection_oversampling": trial.suggest_int(
-                "active_selection_oversampling", 1, 11
+                "active_selection_oversampling",
+                1,
+                11,
             ),
             "comparison_queue_size": trial.suggest_int(
-                "comparison_queue_size", 1, 1001
+                "comparison_queue_size",
+                1,
+                1001,
             ),  # upper bound determined by total_comparisons=1000
             "exploration_frac": trial.suggest_float("exploration_frac", 0.0, 0.5),
             "fragment_length": trial.suggest_int(
-                "fragment_length", 1, 1001
+                "fragment_length",
+                1,
+                1001,
             ),  # trajectories are 1000 steps long
             "gatherer_kwargs": {
                 "temperature": trial.suggest_float("gatherer_temperature", 0.0, 2.0),
                 "discount_factor": trial.suggest_float(
-                    "gatherer_discount_factor", 0.95, 1.0
+                    "gatherer_discount_factor",
+                    0.95,
+                    1.0,
                 ),
                 "sample": trial.suggest_categorical("gatherer_sample", [True, False]),
             },
             "initial_epoch_multiplier": trial.suggest_float(
-                "initial_epoch_multiplier", 1, 200.0
+                "initial_epoch_multiplier",
+                1,
+                200.0,
             ),
             "initial_comparison_frac": trial.suggest_float(
-                "initial_comparison_frac", 0.01, 1.0
+                "initial_comparison_frac",
+                0.01,
+                1.0,
            ),
             "num_iterations": trial.suggest_int("num_iterations", 1, 51),
             "preference_model_kwargs": {
                 "noise_prob": trial.suggest_float(
-                    "preference_model_noise_prob", 0.0, 0.1
+                    "preference_model_noise_prob",
+                    0.0,
+                    0.1,
                 ),
                 "discount_factor": trial.suggest_float(
-                    "preference_model_discount_factor", 0.95, 1.0
+                    "preference_model_discount_factor",
+                    0.95,
+                    1.0,
                 ),
             },
             "query_schedule": trial.suggest_categorical(
-                "query_schedule", ["hyperbolic", "constant", "inverse_quadratic"]
+                "query_schedule",
+                [
+                    "hyperbolic",
+                    "constant",
+                    "inverse_quadratic",
+                ],
             ),
             "trajectory_generator_kwargs": {
                 "switch_prob": trial.suggest_float("tr_gen_switch_prob", 0.1, 1),
                 "random_prob": trial.suggest_float("tr_gen_random_prob", 0.1, 0.9),
             },
             "transition_oversampling": trial.suggest_float(
-                "transition_oversampling", 0.9, 2.0
+                "transition_oversampling",
+                0.9,
+                2.0,
             ),
             "reward_trainer_kwargs": {
                 "epochs": trial.suggest_int("reward_trainer_epochs", 1, 11),
@@ -135,14 +166,17 @@ def __call__(
             "rl": {
                 "rl_kwargs": {
                     "ent_coef": trial.suggest_float(
-                        "rl_ent_coef", 1e-7, 1e-3, log=True
+                        "rl_ent_coef",
+                        1e-7,
+                        1e-3,
+                        log=True,
                     ),
                 },
             },
         },
    ),
     pc_classic_control=RunSacredAsTrial(
-        sacred_ex=imitation.scripts.train_preference_comparisons.train_preference_comparisons_ex,
+        sacred_ex=train_pc_script.train_preference_comparisons_ex,
         suggest_named_configs=lambda _: ["reward.reward_ensemble"],
         suggest_config_updates=lambda trial: {
             "seed": trial.number,
@@ -151,46 +185,69 @@ def __call__(
             "total_comparisons": 1000,
             "active_selection": True,
             "active_selection_oversampling": trial.suggest_int(
-                "active_selection_oversampling", 1, 11
+                "active_selection_oversampling",
+                1,
+                11,
             ),
             "comparison_queue_size": trial.suggest_int(
-                "comparison_queue_size", 1, 1001
+                "comparison_queue_size",
+                1,
+                1001,
             ),  # upper bound determined by total_comparisons=1000
             "exploration_frac": trial.suggest_float("exploration_frac", 0.0, 0.5),
             "fragment_length": trial.suggest_int(
-                "fragment_length", 1, 201
+                "fragment_length",
+                1,
+                201,
             ),  # trajectories are 1000 steps long
             "gatherer_kwargs": {
                 "temperature": trial.suggest_float("gatherer_temperature", 0.0, 2.0),
                 "discount_factor": trial.suggest_float(
-                    "gatherer_discount_factor", 0.95, 1.0
+                    "gatherer_discount_factor",
+                    0.95,
+                    1.0,
                 ),
                 "sample": trial.suggest_categorical("gatherer_sample", [True, False]),
             },
             "initial_epoch_multiplier": trial.suggest_float(
-                "initial_epoch_multiplier", 1, 200.0
+                "initial_epoch_multiplier",
+                1,
+                200.0,
             ),
             "initial_comparison_frac": trial.suggest_float(
-                "initial_comparison_frac", 0.01, 1.0
+                "initial_comparison_frac",
+                0.01,
+                1.0,
             ),
             "num_iterations": trial.suggest_int("num_iterations", 1, 51),
             "preference_model_kwargs": {
                 "noise_prob": trial.suggest_float(
-                    "preference_model_noise_prob", 0.0, 0.1
+                    "preference_model_noise_prob",
+                    0.0,
+                    0.1,
                 ),
                 "discount_factor": trial.suggest_float(
-                    "preference_model_discount_factor", 0.95, 1.0
+                    "preference_model_discount_factor",
+                    0.95,
+                    1.0,
                 ),
             },
             "query_schedule": trial.suggest_categorical(
-                "query_schedule", ["hyperbolic", "constant", "inverse_quadratic"]
+                "query_schedule",
+                [
+                    "hyperbolic",
+                    "constant",
+                    "inverse_quadratic",
+                ],
             ),
             "trajectory_generator_kwargs": {
                 "switch_prob": trial.suggest_float("tr_gen_switch_prob", 0.1, 1),
                 "random_prob": trial.suggest_float("tr_gen_random_prob", 0.1, 0.9),
             },
             "transition_oversampling": trial.suggest_float(
-                "transition_oversampling", 0.9, 2.0
+                "transition_oversampling",
+                0.9,
+                2.0,
             ),
             "reward_trainer_kwargs": {
                 "epochs": trial.suggest_int("reward_trainer_epochs", 1, 11),
@@ -198,7 +255,10 @@ def __call__(
             "rl": {
                 "rl_kwargs": {
                     "ent_coef": trial.suggest_float(
-                        "rl_ent_coef", 1e-7, 1e-3, log=True
+                        "rl_ent_coef",
+                        1e-7,
+                        1e-3,
+                        log=True,
                     ),
                 },
             },
@@ -217,11 +277,16 @@ def __call__(
             "rl": {
                 "rl_kwargs": {
                     "learning_rate": trial.suggest_float(
-                        "learning_rate", 1e-6, 1e-2, log=True
+                        "learning_rate",
+                        1e-6,
+                        1e-2,
+                        log=True,
                     ),
                     "buffer_size": trial.suggest_int("buffer_size", 1000, 100000),
                     "learning_starts": trial.suggest_int(
-                        "learning_starts", 1000, 10000
+                        "learning_starts",
+                        1000,
+                        10000,
                     ),
                     "batch_size": trial.suggest_int("batch_size", 32, 128),
                     "tau": trial.suggest_float("tau", 0.0, 1.0),
                     "train_freq": trial.suggest_int("train_freq", 1, 40),
                     "gradient_steps": trial.suggest_int("gradient_steps", 1, 10),
                     "target_update_interval": trial.suggest_int(
-                        "target_update_interval", 1, 10000
+                        "target_update_interval",
+                        1,
+                        10000,
                     ),
                     "exploration_fraction": trial.suggest_float(
-                        "exploration_fraction", 0.01, 0.5
+                        "exploration_fraction",
+                        0.01,
+                        0.5,
                     ),
                     "exploration_final_eps": trial.suggest_float(
-                        "exploration_final_eps", 0.01, 1.0
+                        "exploration_final_eps",
+                        0.01,
+                        1.0,
                     ),
                     "exploration_initial_eps": trial.suggest_float(
-                        "exploration_initial_eps", 0.01, 0.5
+                        "exploration_initial_eps",
+                        0.01,
+                        0.5,
                     ),
                     "max_grad_norm": trial.suggest_float("max_grad_norm", 0.1, 10.0),
                 },
diff --git a/tuning/rerun_best_trial.py b/tuning/rerun_best_trial.py
index d18ffb8ad..7b878a02e 100644
--- a/tuning/rerun_best_trial.py
+++ b/tuning/rerun_best_trial.py
@@ -1,7 +1,6 @@
 """Script to re-run the best trials from a previous hyperparameter tuning run."""
 import argparse
 import random
-from typing import List, Tuple
 
 import hp_search_spaces
 import optuna
@@ -11,7 +10,7 @@ def make_parser() -> argparse.ArgumentParser:
     parser = argparse.ArgumentParser(
         description="Re-run the best trial from a previous tuning run.",
-        epilog=f"Example usage:\n" f"python rerun_best_trials.py tuning_run.json\n",
+        epilog="Example usage:\npython rerun_best_trials.py tuning_run.json\n",
         formatter_class=argparse.RawDescriptionHelpFormatter,
     )
     parser.add_argument(
@@ -40,6 +39,12 @@ def infer_algo_name(study: optuna.Study) -> str:
     """Infer the algo name from the study name.
 
     Assumes that the study name is of the form "tuning_{algo}_with_{named_configs}".
+
+    Args:
+        study: The optuna study.
+
+    Returns:
+        algo name
     """
     assert study.study_name.startswith("tuning_")
     assert "_with_" in study.study_name
@@ -51,7 +56,7 @@ def main():
     args = parser.parse_args()
     study: optuna.Study = optuna.load_study(
         storage=optuna.storages.JournalStorage(
-            optuna.storages.JournalFileStorage(args.journal_log)
+            optuna.storages.JournalFileStorage(args.journal_log),
         ),
         # in our case, we have one journal file per study so the study name can be
        # inferred
@@ -73,7 +78,7 @@ def main():
         )
         if result.status != "COMPLETED":
             raise RuntimeError(
-                f"Trial failed with {result.fail_trace()} and status {result.status}."
+                f"Trial failed with {result.fail_trace()} and status {result.status}.",
             )
diff --git a/tuning/tune.py b/tuning/tune.py
index 7a77d400d..4a2e710c5 100644
--- a/tuning/tune.py
+++ b/tuning/tune.py
@@ -14,7 +14,8 @@ def make_parser() -> argparse.ArgumentParser:
     parser = argparse.ArgumentParser(
         description="Tune hyperparameters for imitation learning algorithms.",
-        epilog=f"Example usage:\n{example_usage}\n\nPossible named configs:\n{possible_named_configs}",
+        epilog=f"Example usage:\n{example_usage}\n\n"
+        f"Possible named configs:\n{possible_named_configs}",
         formatter_class=argparse.RawDescriptionHelpFormatter,
     )
     parser.add_argument(
@@ -33,7 +34,10 @@ def make_parser() -> argparse.ArgumentParser:
         "Use this to select the environment to tune on.",
     )
     parser.add_argument(
-        "--num_trials", type=int, default=100, help="Number of trials to run."
+        "--num_trials",
+        type=int,
+        default=100,
+        help="Number of trials to run.",
     )
     parser.add_argument(
         "-j",
@@ -49,7 +53,7 @@ def make_study(args: argparse.Namespace) -> optuna.Study:
     if args.journal_log is not None:
         storage = optuna.storages.JournalStorage(
-            optuna.storages.JournalFileStorage(args.journal_log)
+            optuna.storages.JournalFileStorage(args.journal_log),
         )
     else:
         storage = None
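Taken together, tune.py builds an Optuna study on top of the journal file and optimizes one of the objectives_by_algo entries from hp_search_spaces.py, while rerun_best_trial.py later reloads the same journal to replay the best trial. A rough sketch of that flow follows; the journal path, study name, and run_options values below are illustrative assumptions, not taken verbatim from the scripts.

# Sketch of the tuning flow, not a copy of tune.py; paths, the study name, and
# the run_options values are hypothetical placeholders.
import optuna

import hp_search_spaces

storage = optuna.storages.JournalStorage(
    optuna.storages.JournalFileStorage("tuning_run.json"),  # hypothetical path
)
study = optuna.create_study(
    # Matches the "tuning_{algo}_with_{named_configs}" pattern that
    # infer_algo_name() in rerun_best_trial.py expects.
    study_name="tuning_pc_with_seals_half_cheetah",
    storage=storage,
    direction="maximize",
    load_if_exists=True,
)
study.optimize(
    lambda trial: hp_search_spaces.objectives_by_algo["pc"](
        trial,
        # Forwarded to sacred.run(options=...); shown values are illustrative.
        run_options={"--name": study.study_name, "--file_storage": "sacred"},
        extra_named_configs=["seals_half_cheetah"],
    ),
    n_trials=100,
)
print(study.best_trial.params)

Because RunSacredAsTrial raises RuntimeError for any Sacred run that does not finish with status COMPLETED, a failed configuration surfaces as a failed Optuna trial instead of silently skewing the study.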