[RLlib] Examples folder cleanup vol. 45: Enhance/redo autoregressive action distribution example. #49967

Merged
Changes from 2 commits
12 changes: 9 additions & 3 deletions doc/source/rllib/rllib-examples.rst
@@ -55,6 +55,13 @@ All example sub-folders

Actions
+++++++

.. _rllib-examples-overview-autoregressive-actions:

- `Auto-regressive actions <https://github.com/ray-project/ray/blob/master/rllib/examples/actions/autoregressive_actions.py>`__:
  Configures an RL module that generates actions in an autoregressive manner, where the second component of an action depends on
  the previously sampled first component of the same action.

- `Nested Action Spaces <https://github.com/ray-project/ray/blob/master/rllib/examples/actions/nested_action_spaces.py>`__:
  Sets up an environment with nested action spaces using custom single- or multi-agent
  configurations. This example demonstrates how RLlib manages complex action structures,
@@ -345,9 +352,8 @@ RLModules
  Implements an :py:class:`~ray.rllib.core.rl_module.rl_module.RLModule` with action masking, where certain disallowed actions are
  masked based on parts of the observation dict, useful for environments with conditional action availability.

- `Auto-regressive actions <https://github.com/ray-project/ray/blob/master/rllib/examples/rl_modules/autoregressive_actions_rl_module.py>`__:
  Configures an RL module that generates actions in an autoregressive manner, where the second component of an action depends on
  the previously sampled first component of the same action.
- `Auto-regressive actions <https://github.com/ray-project/ray/blob/master/rllib/examples/actions/autoregressive_actions.py>`__:
  :ref:`See here for more details <rllib-examples-overview-autoregressive-actions>`.

- `Custom CNN-based RLModule <https://github.com/ray-project/ray/blob/master/rllib/examples/rl_modules/custom_cnn_rl_module.py>`__:
  Demonstrates a custom CNN architecture realized as an :py:class:`~ray.rllib.core.rl_module.rl_module.RLModule`, enabling convolutional
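As an illustrative aside (not part of this PR's diff): the "Auto-regressive actions" entry above describes a module whose second action component is conditioned on the previously sampled first component. Below is a minimal, self-contained PyTorch sketch of that sampling pattern. It does not use the RLlib `RLModule` API; the class and attribute names (`AutoregressiveActionHead`, `prior_head`, `posterior_head`) are hypothetical and chosen only for illustration.

import torch
import torch.nn.functional as F
from torch import nn
from torch.distributions import Categorical, Normal


class AutoregressiveActionHead(nn.Module):
    """Toy two-component autoregressive action head (illustration only).

    a1 ~ Categorical(logits = prior_head(obs_embedding))
    a2 ~ Normal(mu, sigma), where (mu, log_sigma) = posterior_head(obs_embedding, a1)
    """

    def __init__(self, embed_dim: int = 64, num_discrete: int = 3):
        super().__init__()
        self.num_discrete = num_discrete
        # Prior over the first (discrete) action component.
        self.prior_head = nn.Linear(embed_dim, num_discrete)
        # Posterior over the second (continuous) component; it additionally
        # receives the one-hot encoded, already-sampled first component.
        self.posterior_head = nn.Linear(embed_dim + num_discrete, 2)

    def forward(self, obs_embedding: torch.Tensor):
        # 1) Sample the first action component from the prior distribution.
        a1_dist = Categorical(logits=self.prior_head(obs_embedding))
        a1 = a1_dist.sample()

        # 2) Condition the second component's distribution on the sampled a1.
        a1_one_hot = F.one_hot(a1, self.num_discrete).float()
        mu, log_sigma = self.posterior_head(
            torch.cat([obs_embedding, a1_one_hot], dim=-1)
        ).chunk(2, dim=-1)
        a2_dist = Normal(mu.squeeze(-1), log_sigma.squeeze(-1).exp())
        a2 = a2_dist.sample()

        # The joint log-prob factorizes as log p(a1) + log p(a2 | a1).
        logp = a1_dist.log_prob(a1) + a2_dist.log_prob(a2)
        return (a1, a2), logp


if __name__ == "__main__":
    # Quick smoke test on a random batch of four observation embeddings.
    (a1, a2), logp = AutoregressiveActionHead()(torch.randn(4, 64))
    print(a1.shape, a2.shape, logp.shape)  # -> torch.Size([4]) for all three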
39 changes: 8 additions & 31 deletions rllib/BUILD
@@ -1959,7 +1959,14 @@

# subdirectory: actions/
# ....................................
# Nested action spaces (flattening obs and learning w/ multi-action distribution).
py_test(
    name = "examples/actions/autoregressive_actions",
    main = "examples/actions/autoregressive_actions.py",
    tags = ["team:rllib", "examples"],
    size = "medium",
    srcs = ["examples/actions/autoregressive_actions.py"],
    args = ["--enable-new-api-stack"],
)
py_test(
    name = "examples/actions/nested_action_spaces_ppo",
    main = "examples/actions/nested_action_spaces.py",
@@ -1968,7 +1975,6 @@ py_test(
srcs = ["examples/actions/nested_action_spaces.py"],
args = ["--enable-new-api-stack", "--as-test", "--framework=torch", "--stop-reward=-500.0", "--algo=PPO"]
)

py_test(
name = "examples/actions/nested_action_spaces_multi_agent_ppo",
main = "examples/actions/nested_action_spaces.py",
@@ -2878,15 +2884,6 @@ py_test(
srcs = ["examples/rl_modules/action_masking_rl_module.py"],
args = ["--enable-new-api-stack", "--stop-iters=5"],
)

py_test(
    name = "examples/rl_modules/autoregressive_actions_rl_module",
    main = "examples/rl_modules/autoregressive_actions_rl_module.py",
    tags = ["team:rllib", "examples"],
    size = "medium",
    srcs = ["examples/rl_modules/autoregressive_actions_rl_module.py"],
    args = ["--enable-new-api-stack"],
)
py_test(
    name = "examples/rl_modules/custom_cnn_rl_module",
    main = "examples/rl_modules/custom_cnn_rl_module.py",
@@ -2934,26 +2931,6 @@ py_test(
args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--stop-reward-pretraining=250.0", "--stop-reward=250.0", "--stop-iters=3"],
)

#@OldAPIStack
py_test(
    name = "examples/autoregressive_action_dist_tf",
    main = "examples/autoregressive_action_dist.py",
    tags = ["team:rllib", "exclusive", "examples"],
    size = "medium",
    srcs = ["examples/autoregressive_action_dist.py"],
    args = ["--as-test", "--framework=tf", "--stop-reward=-0.012", "--num-cpus=4"]
)

#@OldAPIStack
py_test(
    name = "examples/autoregressive_action_dist_torch",
    main = "examples/autoregressive_action_dist.py",
    tags = ["team:rllib", "exclusive", "examples"],
    size = "medium",
    srcs = ["examples/autoregressive_action_dist.py"],
    args = ["--as-test", "--framework=torch", "--stop-reward=-0.012", "--num-cpus=4"]
)

#@OldAPIStack
py_test(
name = "examples/centralized_critic_tf",
109 changes: 109 additions & 0 deletions rllib/examples/actions/autoregressive_actions.py
@@ -0,0 +1,109 @@
"""Example on how to define and run with an RLModule with a dependent action space.

This example:
- Shows how to write a custom RLModule that outputs autoregressive actions.
  The RLModule class used here implements a prior distribution for the first
  action component and then uses the sampled value to compute the parameters of,
  and sample from, a posterior distribution for the second component.
- Shows how to configure a PPO algorithm to use the custom RLModule.
- Stops the training after 2M env steps (the script's default) or when the mean
  episode return exceeds -0.45, i.e. when the agent has learned to synchronize
  its actions.

For details on the environment used, take a look at the `CorrelatedActionsEnv`
class. To achieve an episode return better than -0.5, the agent must learn how to
synchronize its actions.


How to run this script
----------------------
`python [script file name].py --enable-new-api-stack --num-env-runners 2`

Control the number of `EnvRunner`s with the `--num-env-runners` flag. Using more
`EnvRunner`s increases the sampling speed.

For debugging, use the following additional command line options
`--no-tune --num-env-runners=0`
which should allow you to set breakpoints anywhere in the RLlib code and
have the execution stop there for inspection and debugging.

For logging to your WandB account, use:
`--wandb-key=[your WandB API key] --wandb-project=[some project name]
--wandb-run-name=[optional: WandB run name (within the defined project)]`


Results to expect
-----------------
You should quickly reach an episode return better than -0.5 with a simple PPO policy.
The logic behind beating the env is roughly:

OBS:   optimal a1:    r1:    optimal a2:    r2:
-1     2              0      -1.0           0
-0.5   1/2            -0.5   -0.5/-1.5      0
0      1              0      -1.0           0
0.5    0/1            -0.5   -0.5/-1.5      0
1      0              0      -1.0           0

That is, most of the time you receive a reward better than -0.5, but worse than 0.0.

+--------------------------------------+------------+--------+------------------+
| Trial name                           | status     |   iter |   total time (s) |
|                                      |            |        |                  |
|--------------------------------------+------------+--------+------------------+
| PPO_CorrelatedActionsEnv_6660d_00000 | TERMINATED |     76 |          132.438 |
+--------------------------------------+------------+--------+------------------+
+------------------------+------------------------+------------------------+
|    episode_return_mean |   num_env_steps_sample |   ...env_steps_sampled |
|                        |             d_lifetime |   _lifetime_throughput |
|------------------------+------------------------+------------------------|
|                  -0.43 |                 152000 |                1283.48 |
+------------------------+------------------------+------------------------+
"""

from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.core.rl_module.rl_module import RLModuleSpec
from ray.rllib.examples.envs.classes.correlated_actions_env import CorrelatedActionsEnv
from ray.rllib.examples.rl_modules.classes.autoregressive_actions_rlm import (
    AutoregressiveActionsRLM,
)
from ray.rllib.utils.test_utils import (
    add_rllib_example_script_args,
    run_rllib_example_script_experiment,
)


parser = add_rllib_example_script_args(
    default_iters=1000,
    default_timesteps=2000000,
    default_reward=-0.45,
)
parser.set_defaults(enable_new_api_stack=True)


if __name__ == "__main__":
    args = parser.parse_args()

    if args.algo != "PPO":
        raise ValueError(
            "This example script only runs with PPO! Set --algo=PPO on the command "
            "line."
        )

    base_config = (
        PPOConfig()
        .environment(CorrelatedActionsEnv)
        .training(
            train_batch_size_per_learner=2000,
            num_epochs=12,
            minibatch_size=256,
            entropy_coeff=0.005,
            lr=0.0003,
        )
        # Specify the RLModule class to be used.
        .rl_module(
            rl_module_spec=RLModuleSpec(module_class=AutoregressiveActionsRLM),
        )
    )

    run_rllib_example_script_experiment(base_config, args)
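A reading aid for the optimal-action table in the docstring above (again, not part of the PR): the rows are consistent with the simple rule "a1 is the discrete action closest to 1 - obs, and a2 = -(obs + a1)". This is an inferred reading of the table, not necessarily the environment's actual reward definition; the short check below only reproduces the table's rows under that assumption.

# Hypothetical reading of the docstring table above -- NOT taken from
# CorrelatedActionsEnv itself: a1* is the discrete action in {0, 1, 2}
# closest to (1 - obs), and a2* = -(obs + a1).
for obs in (-1.0, -0.5, 0.0, 0.5, 1.0):
    target_a1 = 1.0 - obs
    best_r1 = max(-abs(target_a1 - a1) for a1 in (0, 1, 2))
    optimal_a1 = [a1 for a1 in (0, 1, 2) if -abs(target_a1 - a1) == best_r1]
    optimal_a2 = [-(obs + a1) for a1 in optimal_a1]
    print(f"obs={obs:+.1f}  a1*={optimal_a1}  r1={best_r1:+.1f}  a2*={optimal_a2}")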