PC Benchmarks #832

Draft · wants to merge 5 commits into master
setup.py (2 changes: 1 addition & 1 deletion)
@@ -13,7 +13,7 @@

IS_NOT_WINDOWS = os.name != "nt"

-PARALLEL_REQUIRE = ["ray[debug,tune]~=2.0.0"]
+PARALLEL_REQUIRE = ["ray[debug,tune]~=2.9.0"]
ATARI_REQUIRE = [
"seals[atari]~=0.2.1",
]
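Note on the pin: `~=2.9.0` is a compatible-release specifier, accepting any ray 2.9.x but not 2.10. A quick way to check such constraints (illustrative only, not part of the diff; requires the `packaging` package):

    from packaging.specifiers import SpecifierSet

    spec = SpecifierSet("~=2.9.0")
    print("2.9.3" in spec)   # True
    print("2.10.0" in spec)  # False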
src/imitation/algorithms/preference_comparisons.py (2 changes: 2 additions & 0 deletions)
@@ -1678,6 +1678,8 @@ def train(
unnormalized_probs = vec_schedule(np.linspace(0, 1, self.num_iterations))
probs = unnormalized_probs / np.sum(unnormalized_probs)
shares = util.oric(probs * total_comparisons)
+shares[shares <= 0] = 1  # ensure we at least request one comparison per iteration

schedule = [initial_comparisons] + shares.tolist()
print(f"Query schedule: {schedule}")

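Why the clamp matters: with the widened tuning ranges below (e.g. `initial_comparison_frac` drawn from up to 1.0), the per-iteration shares can round down to zero, so an iteration would request no comparisons at all. A minimal sketch of the failure mode, using plain rounding as a stand-in for `util.oric` and an assumed 1/(1+t) form for the hyperbolic schedule:

    import numpy as np

    total_comparisons, initial_frac, num_iterations = 1000, 0.99, 25
    initial = int(total_comparisons * initial_frac)
    remaining = total_comparisons - initial                  # only 10 left to schedule
    probs = 1.0 / (1.0 + np.linspace(0, 1, num_iterations))  # assumed hyperbolic form
    probs = probs / probs.sum()
    shares = np.round(probs * remaining).astype(int)         # stand-in for util.oric
    print(shares)            # mostly zeros: those iterations would request nothing
    shares[shares <= 0] = 1  # the clamp added above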
src/imitation/scripts/config/train_preference_comparisons.py (14 changes: 8 additions & 6 deletions)
@@ -42,6 +42,8 @@ def train_defaults():
transition_oversampling = 1
# fraction of total_comparisons that will be sampled right at the beginning
initial_comparison_frac = 0.1
+# factor by which to oversample the number of epochs in the first iteration
+initial_epoch_multiplier = 200.0
# fraction of sampled trajectories that will include some random actions
exploration_frac = 0.0
preference_model_kwargs = {}
@@ -77,7 +79,7 @@ def cartpole():

@train_preference_comparisons_ex.named_config
def seals_ant():
environment = dict(gym_id="seals/Ant-v0")
environment = dict(gym_id="seals/Ant-v1")
rl = dict(
batch_size=2048,
rl_kwargs=dict(
@@ -104,7 +106,7 @@ def half_cheetah():

@train_preference_comparisons_ex.named_config
def seals_half_cheetah():
environment = dict(gym_id="seals/HalfCheetah-v0")
environment = dict(gym_id="seals/HalfCheetah-v1")
rl = dict(
batch_size=512,
rl_kwargs=dict(
@@ -125,7 +127,7 @@ def seals_half_cheetah():

@train_preference_comparisons_ex.named_config
def seals_hopper():
environment = dict(gym_id="seals/Hopper-v0")
environment = dict(gym_id="seals/Hopper-v1")
policy = dict(
policy_cls="MlpPolicy",
policy_kwargs=dict(
@@ -151,7 +153,7 @@ def seals_hopper():

@train_preference_comparisons_ex.named_config
def seals_swimmer():
environment = dict(gym_id="seals/Swimmer-v0")
environment = dict(gym_id="seals/Swimmer-v1")
policy = dict(
policy_cls="MlpPolicy",
policy_kwargs=dict(
@@ -178,7 +180,7 @@ def seals_swimmer():

@train_preference_comparisons_ex.named_config
def seals_walker():
environment = dict(gym_id="seals/Walker2d-v0")
environment = dict(gym_id="seals/Walker2d-v1")
policy = dict(
policy_cls="MlpPolicy",
policy_kwargs=dict(
@@ -206,7 +208,7 @@ def seals_walker():
@train_preference_comparisons_ex.named_config
def seals_humanoid():
locals().update(**MUJOCO_SHARED_LOCALS)
environment = dict(gym_id="seals/Humanoid-v0")
environment = dict(gym_id="seals/Humanoid-v1")
total_timesteps = int(4e6)


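For reference, these named configs are selected through sacred's CLI; a usage sketch (assuming imitation's usual entry points, with the config override shown purely as an example):

    python -m imitation.scripts.train_preference_comparisons with seals_ant initial_epoch_multiplier=100.0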
src/imitation/scripts/config/tuning.py (46 changes: 25 additions & 21 deletions)
@@ -188,38 +188,42 @@ def pc():
parallel_run_config = dict(
sacred_ex_name="train_preference_comparisons",
run_name="pc_tuning",
base_named_configs=["logging.wandb_logging"],
base_named_configs=[],
base_config_updates={
"environment": {"num_vec": 1},
"demonstrations": {"source": "huggingface"},
"total_timesteps": 2e7,
"total_comparisons": 5000,
"query_schedule": "hyperbolic",
"gatherer_kwargs": {"sample": True},
"total_comparisons": 1000,
"active_selection": True,
},
search_space={
"named_configs": [
["reward.normalize_output_disable"],
],
"named_configs": ["reward.reward_ensemble"],
"config_updates": {
"train": {
"policy_kwargs": {
"activation_fn": tune.choice(
[
nn.ReLU,
],
),
},
"active_selection_oversampling": tune.randint(1, 11),
"comparison_queue_size": tune.randint(1, 1001), # upper bound determined by total_comparisons=1000
"exploration_frac": tune.uniform(0.0, 0.5),
"fragment_length": tune.randint(1, 1001), # trajectories are 1000 steps long
"gatherer_kwargs": {
"temperature": tune.uniform(0.0, 2.0),
"discount_factor": tune.uniform(0.95, 1.0),
"sample": tune.choice([True, False]),
},
"num_iterations": tune.choice([25, 50]),
"initial_comparison_frac": tune.choice([0.1, 0.25]),
"initial_comparison_frac": tune.uniform(0.01, 1.0),
"num_iterations": tune.randint(1, 51),
"preference_model_kwargs": {
"noise_prob": tune.uniform(0.0, 0.1),
"discount_factor": tune.uniform(0.95, 1.0),
},
"query_schedule": tune.choice(["hyperbolic", "constant", "inverse_quadratic"]),
"trajectory_generator_kwargs": {
"switch_prob": tune.uniform(0.1, 1),
"random_prob": tune.uniform(0.1, 0.9),
},
"transition_oversampling": tune.uniform(0.9, 2.0),
"reward_trainer_kwargs": {
"epochs": tune.choice([1, 3, 6]),
"epochs": tune.randint(1, 11),
},
"rl": {
"batch_size": tune.choice([512, 2048, 8192]),
"rl_kwargs": {
"learning_rate": tune.loguniform(1e-5, 1e-2),
"ent_coef": tune.loguniform(1e-7, 1e-3),
},
},
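The new search space leans on Ray Tune's sampling primitives: `tune.randint` upper bounds are exclusive (so `randint(1, 51)` covers 1..50) and `tune.loguniform` samples on a log scale. A standalone sketch of how values are drawn, using `Domain.sample()` purely for illustration (assuming ray~=2.9):

    from ray import tune

    space = {
        "num_iterations": tune.randint(1, 51),
        "initial_comparison_frac": tune.uniform(0.01, 1.0),
        "learning_rate": tune.loguniform(1e-5, 1e-2),
        "query_schedule": tune.choice(["hyperbolic", "constant", "inverse_quadratic"]),
    }
    for _ in range(3):
        print({name: domain.sample() for name, domain in space.items()})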
src/imitation/scripts/parallel.py (3 changes: 1 addition & 2 deletions)
@@ -188,13 +188,12 @@ def _ray_tune_sacred_wrapper(
`ex.run`) and `reporter`. The function returns the run result.
"""

-def inner(config: Mapping[str, Any], reporter) -> Mapping[str, Any]:
+def inner(config: Mapping[str, Any]) -> Mapping[str, Any]:
"""Trainable function with the correct signature for `ray.tune`.

Args:
config: Keyword arguments for `ex.run()`, where `ex` is the
`sacred.Experiment` instance associated with `sacred_ex_name`.
-reporter: Callback to report progress to Ray.

Returns:
Result from `ray.Run` object.
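Context for dropping `reporter` (an assumption, consistent with the ray~=2.9 bump in setup.py): Ray's function trainables now take only `config`, and progress is reported through the train API rather than a reporter callback. A minimal sketch of the new-style shape, with the metric name chosen arbitrarily:

    from ray import train, tune

    def inner(config):
        # ... run the sacred experiment with `config` ...
        train.report({"mean_return": 0.0})  # replaces the old reporter(...) callback

    tune.Tuner(inner, param_space={}).fit()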
src/imitation/scripts/train_preference_comparisons.py (5 changes: 5 additions & 0 deletions)
@@ -68,6 +68,7 @@ def train_preference_comparisons(
fragment_length: int,
transition_oversampling: float,
initial_comparison_frac: float,
+initial_epoch_multiplier: float,
exploration_frac: float,
trajectory_path: Optional[str],
trajectory_generator_kwargs: Mapping[str, Any],
@@ -106,6 +107,9 @@ def train_preference_comparisons(
sampled before the rest of training begins (using the randomly initialized
agent). This can be used to pretrain the reward model before the agent
is trained on the learned reward.
+initial_epoch_multiplier: before agent training begins, train the reward
+    model for this many more epochs than usual (on fragments sampled from a
+    random agent).
exploration_frac: fraction of trajectory samples that will be created using
partially random actions, rather than the current policy. Might be helpful
if the learned policy explores too little and gets stuck with a wrong
@@ -258,6 +262,7 @@ def train_preference_comparisons(
fragment_length=fragment_length,
transition_oversampling=transition_oversampling,
initial_comparison_frac=initial_comparison_frac,
+initial_epoch_multiplier=initial_epoch_multiplier,
custom_logger=custom_logger,
allow_variable_horizon=allow_variable_horizon,
query_schedule=query_schedule,
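Behaviorally, the new flag front-loads reward-model training before the agent takes any steps. A sketch of the effective loop (names and structure assumed for illustration; not code from this PR):

    # Reward model training runs for epochs * initial_epoch_multiplier in
    # iteration 0, then for the plain epoch count afterwards.
    for i in range(num_iterations):
        multiplier = initial_epoch_multiplier if i == 0 else 1.0
        reward_trainer.train(preference_dataset, epoch_multiplier=multiplier)
        agent.learn(timesteps_per_iteration)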