From bed55cc92869ce5f1831348b290cb565197c5274 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 3 Jan 2025 22:26:38 +0100 Subject: [PATCH 01/22] wip Signed-off-by: sven1977 --- doc/source/rllib/doc_code/getting_started.py | 37 -- doc/source/rllib/images/rllib-api.svg | 1 - doc/source/rllib/rllib-training.rst | 171 +++--- rllib/algorithms/algorithm.py | 517 ++++++++----------- 4 files changed, 284 insertions(+), 442 deletions(-) delete mode 100644 doc/source/rllib/images/rllib-api.svg diff --git a/doc/source/rllib/doc_code/getting_started.py b/doc/source/rllib/doc_code/getting_started.py index 951b3acee8da..549a506ed043 100644 --- a/doc/source/rllib/doc_code/getting_started.py +++ b/doc/source/rllib/doc_code/getting_started.py @@ -1,44 +1,11 @@ # flake8: noqa -# __rllib-first-config-begin__ -from pprint import pprint - -from ray.rllib.algorithms.ppo import PPOConfig - -config = ( - PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) - .environment("CartPole-v1") - .env_runners(num_env_runners=1) -) - -algo = config.build() - -for i in range(10): - result = algo.train() - result.pop("config") - pprint(result) - - if i % 5 == 0: - checkpoint_dir = algo.save_to_path() - print(f"Checkpoint saved in directory {checkpoint_dir}") -# __rllib-first-config-end__ - -algo.stop() - if False: # __rllib-tune-config-begin__ from ray import train, tune config = ( PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .environment("CartPole-v1") .training( lr=tune.grid_search([0.01, 0.001, 0.0001]), @@ -125,10 +92,6 @@ algo = ( PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .environment("CartPole-v1") .env_runners(num_env_runners=2) ).build() diff --git a/doc/source/rllib/images/rllib-api.svg b/doc/source/rllib/images/rllib-api.svg deleted file mode 100644 index 6eb03dac2e49..000000000000 --- a/doc/source/rllib/images/rllib-api.svg +++ /dev/null @@ -1 +0,0 @@ - diff --git a/doc/source/rllib/rllib-training.rst b/doc/source/rllib/rllib-training.rst index 4a94f17dfb62..fb8617179cc2 100644 --- a/doc/source/rllib/rllib-training.rst +++ b/doc/source/rllib/rllib-training.rst @@ -4,22 +4,24 @@ .. _rllib-getting-started: -Getting Started with RLlib -========================== +Getting Started +=============== All RLlib experiments are run using an ``Algorithm`` class which holds a policy for environment interaction. Through the algorithm's interface, you can train the policy, compute actions, or store your algorithm's state (checkpointing). In multi-agent training, the algorithm manages the querying and optimization of multiple policies at once. -.. image:: images/rllib-api.svg - In this guide, we will explain in detail RLlib's Python API for running learning experiments. -.. _rllib-training-api: +RLlib in 15 minutes +------------------- + -Using the Python API --------------------- +.. _rllib-python-api: + +Python API +~~~~~~~~~~ The Python API provides all the flexibility required for applying RLlib to any type of problem. @@ -31,12 +33,42 @@ the `env_runners` method. After we `build` the `PPO` Algorithm from its configuration, we can `train` it for a number of iterations (here `10`) and `save` the resulting policy periodically (here every `5` iterations). -.. literalinclude:: ./doc_code/getting_started.py - :language: python - :start-after: rllib-first-config-begin - :end-before: rllib-first-config-end + +.. 
testcode:: + + from ray.rllib.algorithms.ppo import PPOConfig + + # Configure the Algorithm (PPO). + config = ( + PPOConfig() + .environment("CartPole-v1") + .env_runners(num_env_runners=1) + ) + # Build the Algorithm (PPO). + ppo = config.build() + + # Train for 10 iterations. + for i in range(10): + result = ppo.train() + result.pop("config") + print(result) + + # Checkpoint every 5 iterations. + if i % 5 == 0: + checkpoint_dir = ppo.save_to_path() + print(f"Algorithm checkpoint saved in: {checkpoint_dir}") + +.. testcode:: + :hide: + + algo.stop() +.. _rllib-with-ray-tune: + +RLlib with Ray Tune +~~~~~~~~~~~~~~~~~~~ + All RLlib algorithms are compatible with the :ref:`Tune API `. This enables them to be easily used in experiments with :ref:`Ray Tune `. For example, the following code performs a simple hyper-parameter sweep of PPO. @@ -84,21 +116,31 @@ To load newer RLlib checkpoints (version >= 1.0), use the following code: .. code-block:: python from ray.rllib.algorithms.algorithm import Algorithm + algo = Algorithm.from_checkpoint(checkpoint_path) -For older RLlib checkpoint versions (version < 1.0), you can -restore an algorithm through: +Customizing your RL environment +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. code-block:: python +In the preceding examples, your RL environment was always "CartPole-v1", however, you would probably like to +run your actual experiments against a different environment or even write your own custom one. + +See here ...blabla + +Customizing your models +~~~~~~~~~~~~~~~~~~~~~~~ + +In the preceding examples, RLlib provided a default neural network model for you, because you didn't specify anything +in your AlgorithmConfig. If you would like to either reconfigure the type and size of RLlib's default models, for example define +the number of hidden layers and their activation functions, or even write your own custom models from scratch using PyTorch, see here +for a detailed guide on how to do so. - from ray.rllib.algorithms.ppo import PPO - algo = PPO(config=config, env=env_class) - algo.restore(checkpoint_path) + +Deploying your models and computing actions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Computing Actions -~~~~~~~~~~~~~~~~~ The simplest way to programmatically compute actions from a trained agent is to use ``Algorithm.compute_single_action()``. @@ -114,6 +156,7 @@ Here is a simple example of testing a trained agent for one episode: For more advanced usage on computing actions and other functionality, you can consult the :ref:`RLlib Algorithm API documentation `. + Accessing Policy State ~~~~~~~~~~~~~~~~~~~~~~ @@ -183,97 +226,47 @@ an algorithm. `custom model classes `__. -.. _rllib-scaling-guide: - -RLlib Scaling Guide -------------------- - -Here are some rules of thumb for scaling training with RLlib. - -1. If the environment is slow and cannot be replicated (e.g., since it requires interaction with physical systems), then you should use a sample-efficient off-policy algorithm such as :ref:`DQN ` or :ref:`SAC `. These algorithms default to ``num_env_runners: 0`` for single-process operation. Make sure to set ``num_gpus: 1`` if you want to use a GPU. Consider also batch RL training with the `offline data `__ API. - -2. If the environment is fast and the model is small (most models for RL are), use time-efficient algorithms such as :ref:`PPO `, or :ref:`IMPALA `. -These can be scaled by increasing ``num_env_runners`` to add rollout workers. It may also make sense to enable `vectorization `__ for -inference. 
Make sure to set ``num_gpus: 1`` if you want to use a GPU. If the learner becomes a bottleneck, you can use multiple GPUs for learning by setting -``num_gpus > 1``. - -3. If the model is compute intensive (e.g., a large deep residual network) and inference is the bottleneck, consider allocating GPUs to workers by setting ``num_gpus_per_env_runner: 1``. If you only have a single GPU, consider ``num_env_runners: 0`` to use the learner GPU for inference. For efficient use of GPU time, use a small number of GPU workers and a large number of `envs per worker `__. - -4. Finally, if both model and environment are compute intensive, then enable `remote worker envs `__ with `async batching `__ by setting ``remote_worker_envs: True`` and optionally ``remote_env_batch_wait_ms``. This batches inference on GPUs in the rollout workers while letting envs run asynchronously in separate actors, similar to the `SEED `__ architecture. The number of workers and number of envs per worker should be tuned to maximize GPU utilization. - -In case you are using lots of workers (``num_env_runners >> 10``) and you observe worker failures for whatever reasons, which normally interrupt your RLlib training runs, consider using -the config settings ``ignore_env_runner_failures=True``, ``restart_failed_env_runners=True``, or ``restart_failed_sub_environments=True``: - -``restart_failed_env_runners``: When set to True (default), your Algorithm will attempt to restart any failed EnvRunner and replace it with a newly created one. This way, your number of workers will never decrease, even if some of them fail from time to time. -``ignore_env_runner_failures``: When set to True, your Algorithm will not crash due to an EnvRunner error, but continue for as long as there is at least one functional worker remaining. This setting is ignored when ``restart_failed_env_runners=True``. -``restart_failed_sub_environments``: When set to True and there is a failure in one of the vectorized sub-environments in one of your EnvRunners, RLlib tries to recreate only the failed sub-environment and re-integrate the newly created one into your vectorized env stack on that EnvRunner. - -Note that only one of ``ignore_env_runner_failures`` or ``restart_failed_env_runners`` should be set to True (they are mutually exclusive settings). However, -you can combine each of these with the ``restart_failed_sub_environments=True`` setting. -Using these options will make your training runs much more stable and more robust against occasional OOM or other similar "once in a while" errors on the EnvRunners -themselves or inside your custom environments. - - -Debugging RLlib Experiments ---------------------------- - -Eager Mode -~~~~~~~~~~ - -Policies built with ``build_tf_policy`` (most of the reference algorithms are) -can be run in eager mode by setting the -``"framework": "tf2"`` / ``"eager_tracing": true`` config options. -This will tell RLlib to execute the model forward pass, action distribution, -loss, and stats functions in eager mode. - -Eager mode makes debugging much easier, since you can now use line-by-line -debugging with breakpoints or Python ``print()`` to inspect -intermediate tensor values. -However, eager can be slower than graph mode unless tracing is enabled. - - -Episode Traces -~~~~~~~~~~~~~~ - -You can use the `data output API `__ to save episode traces -for debugging. For example, the following command will run PPO while saving episode -traces to ``/tmp/debug``. - -.. code-block:: bash - +.. 
Debugging RLlib Experiments + --------------------------- + Eager Mode + ~~~~~~~~~~ + Policies built with ``build_tf_policy`` (most of the reference algorithms are) + can be run in eager mode by setting the + ``"framework": "tf2"`` / ``"eager_tracing": true`` config options. + This will tell RLlib to execute the model forward pass, action distribution, + loss, and stats functions in eager mode. + Eager mode makes debugging much easier, since you can now use line-by-line + debugging with breakpoints or Python ``print()`` to inspect + intermediate tensor values. + However, eager can be slower than graph mode unless tracing is enabled. + Episode Traces + ~~~~~~~~~~~~~~ + You can use the `data output API `__ to save episode traces + for debugging. For example, the following command will run PPO while saving episode + traces to ``/tmp/debug``. + .. code-block:: bash cd rllib/tuned_examples/ppo python cartpole_ppo.py --output /tmp/debug - # episode traces will be saved in /tmp/debug, for example output-2019-02-23_12-02-03_worker-2_0.json output-2019-02-23_12-02-04_worker-1_0.json - Log Verbosity ~~~~~~~~~~~~~ - You can control the log level via the ``"log_level"`` flag. Valid values are "DEBUG", "INFO", "WARN" (default), and "ERROR". This can be used to increase or decrease the verbosity of internal logging. For example: - -.. code-block:: bash - + .. code-block:: bash cd rllib/tuned_examples/ppo - python atari_ppo.py --env ALE/Pong-v5 --log-level INFO python atari_ppo.py --env ALE/Pong-v5 --log-level DEBUG - The default log level is ``WARN``. We strongly recommend using at least ``INFO`` level logging for development. - Stack Traces ~~~~~~~~~~~~ - You can use the ``ray stack`` command to dump the stack traces of all the Python workers on a single node. This can be useful for debugging unexpected hangs or performance issues. - Next Steps ---------- - - To check how your application is doing, you can use the :ref:`Ray dashboard `. diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index 09f2e638d18f..fd52894dc432 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -2087,321 +2087,6 @@ def set_weights(self, weights: Dict[PolicyID, dict]): ) self.env_runner_group.local_env_runner.set_weights(weights) - @OldAPIStack - def compute_single_action( - self, - observation: Optional[TensorStructType] = None, - state: Optional[List[TensorStructType]] = None, - *, - prev_action: Optional[TensorStructType] = None, - prev_reward: Optional[float] = None, - info: Optional[EnvInfoDict] = None, - input_dict: Optional[SampleBatch] = None, - policy_id: PolicyID = DEFAULT_POLICY_ID, - full_fetch: bool = False, - explore: Optional[bool] = None, - timestep: Optional[int] = None, - episode=None, - unsquash_action: Optional[bool] = None, - clip_action: Optional[bool] = None, - # Kwargs placeholder for future compatibility. - **kwargs, - ) -> Union[ - TensorStructType, - Tuple[TensorStructType, List[TensorType], Dict[str, TensorType]], - ]: - """Computes an action for the specified policy on the local worker. - - Note that you can also access the policy object through - self.get_policy(policy_id) and call compute_single_action() on it - directly. - - Args: - observation: Single (unbatched) observation from the - environment. - state: List of all RNN hidden (single, unbatched) state tensors. - prev_action: Single (unbatched) previous action value. - prev_reward: Single (unbatched) previous reward value. - info: Env info dict, if any. 
- input_dict: An optional SampleBatch that holds all the values - for: obs, state, prev_action, and prev_reward, plus maybe - custom defined views of the current env trajectory. Note - that only one of `obs` or `input_dict` must be non-None. - policy_id: Policy to query (only applies to multi-agent). - Default: "default_policy". - full_fetch: Whether to return extra action fetch results. - This is always set to True if `state` is specified. - explore: Whether to apply exploration to the action. - Default: None -> use self.config.explore. - timestep: The current (sampling) time step. - episode: This provides access to all of the internal episodes' - state, which may be useful for model-based or multi-agent - algorithms. - unsquash_action: Should actions be unsquashed according to the - env's/Policy's action space? If None, use the value of - self.config.normalize_actions. - clip_action: Should actions be clipped according to the - env's/Policy's action space? If None, use the value of - self.config.clip_actions. - - Keyword Args: - kwargs: forward compatibility placeholder - - Returns: - The computed action if full_fetch=False, or a tuple of a) the - full output of policy.compute_actions() if full_fetch=True - or we have an RNN-based Policy. - - Raises: - KeyError: If the `policy_id` cannot be found in this Algorithm's local - worker. - """ - # `unsquash_action` is None: Use value of config['normalize_actions']. - if unsquash_action is None: - unsquash_action = self.config.normalize_actions - # `clip_action` is None: Use value of config['clip_actions']. - elif clip_action is None: - clip_action = self.config.clip_actions - - # User provided an input-dict: Assert that `obs`, `prev_a|r`, `state` - # are all None. - err_msg = ( - "Provide either `input_dict` OR [`observation`, ...] as " - "args to `Algorithm.compute_single_action()`!" - ) - if input_dict is not None: - assert ( - observation is None - and prev_action is None - and prev_reward is None - and state is None - ), err_msg - observation = input_dict[Columns.OBS] - else: - assert observation is not None, err_msg - - # Get the policy to compute the action for (in the multi-agent case, - # Algorithm may hold >1 policies). - policy = self.get_policy(policy_id) - if policy is None: - raise KeyError( - f"PolicyID '{policy_id}' not found in PolicyMap of the " - f"Algorithm's local worker!" - ) - # Just preprocess observations, similar to how it used to be done before. - pp = policy.agent_connectors[ObsPreprocessorConnector] - - # convert the observation to array if possible - if not isinstance(observation, (np.ndarray, dict, tuple)): - try: - observation = np.asarray(observation) - except Exception: - raise ValueError( - f"Observation type {type(observation)} cannot be converted to " - f"np.ndarray." - ) - if pp: - assert len(pp) == 1, "Only one preprocessor should be in the pipeline" - pp = pp[0] - - if not pp.is_identity(): - # Note(Kourosh): This call will leave the policy's connector - # in eval mode. would that be a problem? - pp.in_eval() - if observation is not None: - _input_dict = {Columns.OBS: observation} - elif input_dict is not None: - _input_dict = {Columns.OBS: input_dict[Columns.OBS]} - else: - raise ValueError( - "Either observation or input_dict must be provided." - ) - - # TODO (Kourosh): Create a new util method for algorithm that - # computes actions based on raw inputs from env and can keep track - # of its own internal state. 
- acd = AgentConnectorDataType("0", "0", _input_dict) - # make sure the state is reset since we are only applying the - # preprocessor - pp.reset(env_id="0") - ac_o = pp([acd])[0] - observation = ac_o.data[Columns.OBS] - - # Input-dict. - if input_dict is not None: - input_dict[Columns.OBS] = observation - action, state, extra = policy.compute_single_action( - input_dict=input_dict, - explore=explore, - timestep=timestep, - episode=episode, - ) - # Individual args. - else: - action, state, extra = policy.compute_single_action( - obs=observation, - state=state, - prev_action=prev_action, - prev_reward=prev_reward, - info=info, - explore=explore, - timestep=timestep, - episode=episode, - ) - - # If we work in normalized action space (normalize_actions=True), - # we re-translate here into the env's action space. - if unsquash_action: - action = space_utils.unsquash_action(action, policy.action_space_struct) - # Clip, according to env's action space. - elif clip_action: - action = space_utils.clip_action(action, policy.action_space_struct) - - # Return 3-Tuple: Action, states, and extra-action fetches. - if state or full_fetch: - return action, state, extra - # Ensure backward compatibility. - else: - return action - - @OldAPIStack - def compute_actions( - self, - observations: TensorStructType, - state: Optional[List[TensorStructType]] = None, - *, - prev_action: Optional[TensorStructType] = None, - prev_reward: Optional[TensorStructType] = None, - info: Optional[EnvInfoDict] = None, - policy_id: PolicyID = DEFAULT_POLICY_ID, - full_fetch: bool = False, - explore: Optional[bool] = None, - timestep: Optional[int] = None, - episodes=None, - unsquash_actions: Optional[bool] = None, - clip_actions: Optional[bool] = None, - **kwargs, - ): - """Computes an action for the specified policy on the local Worker. - - Note that you can also access the policy object through - self.get_policy(policy_id) and call compute_actions() on it directly. - - Args: - observation: Observation from the environment. - state: RNN hidden state, if any. If state is not None, - then all of compute_single_action(...) is returned - (computed action, rnn state(s), logits dictionary). - Otherwise compute_single_action(...)[0] is returned - (computed action). - prev_action: Previous action value, if any. - prev_reward: Previous reward, if any. - info: Env info dict, if any. - policy_id: Policy to query (only applies to multi-agent). - full_fetch: Whether to return extra action fetch results. - This is always set to True if RNN state is specified. - explore: Whether to pick an exploitation or exploration - action (default: None -> use self.config.explore). - timestep: The current (sampling) time step. - episodes: This provides access to all of the internal episodes' - state, which may be useful for model-based or multi-agent - algorithms. - unsquash_actions: Should actions be unsquashed according - to the env's/Policy's action space? If None, use - self.config.normalize_actions. - clip_actions: Should actions be clipped according to the - env's/Policy's action space? If None, use - self.config.clip_actions. - - Keyword Args: - kwargs: forward compatibility placeholder - - Returns: - The computed action if full_fetch=False, or a tuple consisting of - the full output of policy.compute_actions_from_input_dict() if - full_fetch=True or we have an RNN-based Policy. - """ - # `unsquash_actions` is None: Use value of config['normalize_actions']. 
- if unsquash_actions is None: - unsquash_actions = self.config.normalize_actions - # `clip_actions` is None: Use value of config['clip_actions']. - elif clip_actions is None: - clip_actions = self.config.clip_actions - - # Preprocess obs and states. - state_defined = state is not None - policy = self.get_policy(policy_id) - filtered_obs, filtered_state = [], [] - for agent_id, ob in observations.items(): - worker = self.env_runner_group.local_env_runner - if worker.preprocessors.get(policy_id) is not None: - preprocessed = worker.preprocessors[policy_id].transform(ob) - else: - preprocessed = ob - filtered = worker.filters[policy_id](preprocessed, update=False) - filtered_obs.append(filtered) - if state is None: - continue - elif agent_id in state: - filtered_state.append(state[agent_id]) - else: - filtered_state.append(policy.get_initial_state()) - - # Batch obs and states - obs_batch = np.stack(filtered_obs) - if state is None: - state = [] - else: - state = list(zip(*filtered_state)) - state = [np.stack(s) for s in state] - - input_dict = {Columns.OBS: obs_batch} - - # prev_action and prev_reward can be None, np.ndarray, or tensor-like structure. - # Explicitly check for None here to avoid the error message "The truth value of - # an array with more than one element is ambiguous.", when np arrays are passed - # as arguments. - if prev_action is not None: - input_dict[SampleBatch.PREV_ACTIONS] = prev_action - if prev_reward is not None: - input_dict[SampleBatch.PREV_REWARDS] = prev_reward - if info: - input_dict[Columns.INFOS] = info - for i, s in enumerate(state): - input_dict[f"state_in_{i}"] = s - - # Batch compute actions - actions, states, infos = policy.compute_actions_from_input_dict( - input_dict=input_dict, - explore=explore, - timestep=timestep, - episodes=episodes, - ) - - # Unbatch actions for the environment into a multi-agent dict. - single_actions = space_utils.unbatch(actions) - actions = {} - for key, a in zip(observations, single_actions): - # If we work in normalized action space (normalize_actions=True), - # we re-translate here into the env's action space. - if unsquash_actions: - a = space_utils.unsquash_action(a, policy.action_space_struct) - # Clip, according to env's action space. - elif clip_actions: - a = space_utils.clip_action(a, policy.action_space_struct) - actions[key] = a - - # Unbatch states into a multi-agent dict. - unbatched_states = {} - for idx, agent_id in enumerate(observations): - unbatched_states[agent_id] = [s[idx] for s in states] - - # Return only actions or full tuple - if state_defined or full_fetch: - return actions, unbatched_states, infos - else: - return actions - @OldAPIStack def add_policy( self, @@ -4143,6 +3828,208 @@ def _compile_iteration_results_old_api_stack( return results + @OldAPIStack + @Deprecated( + help="`Algorithm.compute_single_action` should no longer be used. 
Get the " + "RLModule instance through `Algorithm.get_module([module ID])`, then compute " + "actions through `RLModule.forward_inference({'obs': [obs batch]})`.", + error=False, + ) + def compute_single_action( + self, + observation: Optional[TensorStructType] = None, + state: Optional[List[TensorStructType]] = None, + *, + prev_action: Optional[TensorStructType] = None, + prev_reward: Optional[float] = None, + info: Optional[EnvInfoDict] = None, + input_dict: Optional[SampleBatch] = None, + policy_id: PolicyID = DEFAULT_POLICY_ID, + full_fetch: bool = False, + explore: Optional[bool] = None, + timestep: Optional[int] = None, + episode=None, + unsquash_action: Optional[bool] = None, + clip_action: Optional[bool] = None, + ) -> Union[ + TensorStructType, + Tuple[TensorStructType, List[TensorType], Dict[str, TensorType]], + ]: + if unsquash_action is None: + unsquash_action = self.config.normalize_actions + elif clip_action is None: + clip_action = self.config.clip_actions + + err_msg = ( + "Provide either `input_dict` OR [`observation`, ...] as " + "args to `Algorithm.compute_single_action()`!" + ) + if input_dict is not None: + assert ( + observation is None + and prev_action is None + and prev_reward is None + and state is None + ), err_msg + observation = input_dict[Columns.OBS] + else: + assert observation is not None, err_msg + + policy = self.get_policy(policy_id) + if policy is None: + raise KeyError( + f"PolicyID '{policy_id}' not found in PolicyMap of the " + f"Algorithm's local worker!" + ) + pp = policy.agent_connectors[ObsPreprocessorConnector] + + if not isinstance(observation, (np.ndarray, dict, tuple)): + try: + observation = np.asarray(observation) + except Exception: + raise ValueError( + f"Observation type {type(observation)} cannot be converted to " + f"np.ndarray." + ) + if pp: + assert len(pp) == 1, "Only one preprocessor should be in the pipeline" + pp = pp[0] + + if not pp.is_identity(): + pp.in_eval() + if observation is not None: + _input_dict = {Columns.OBS: observation} + elif input_dict is not None: + _input_dict = {Columns.OBS: input_dict[Columns.OBS]} + else: + raise ValueError( + "Either observation or input_dict must be provided." + ) + + acd = AgentConnectorDataType("0", "0", _input_dict) + pp.reset(env_id="0") + ac_o = pp([acd])[0] + observation = ac_o.data[Columns.OBS] + + if input_dict is not None: + input_dict[Columns.OBS] = observation + action, state, extra = policy.compute_single_action( + input_dict=input_dict, + explore=explore, + timestep=timestep, + episode=episode, + ) + else: + action, state, extra = policy.compute_single_action( + obs=observation, + state=state, + prev_action=prev_action, + prev_reward=prev_reward, + info=info, + explore=explore, + timestep=timestep, + episode=episode, + ) + + if unsquash_action: + action = space_utils.unsquash_action(action, policy.action_space_struct) + elif clip_action: + action = space_utils.clip_action(action, policy.action_space_struct) + + if state or full_fetch: + return action, state, extra + else: + return action + + @OldAPIStack + @Deprecated( + help="`Algorithm.compute_actions` should no longer be used. 
Get the RLModule " + "instance through `Algorithm.get_module([module ID])`, then compute actions " + "through `RLModule.forward_inference({'obs': [obs batch]})`.", + error=False, + ) + def compute_actions( + self, + observations: TensorStructType, + state: Optional[List[TensorStructType]] = None, + *, + prev_action: Optional[TensorStructType] = None, + prev_reward: Optional[TensorStructType] = None, + info: Optional[EnvInfoDict] = None, + policy_id: PolicyID = DEFAULT_POLICY_ID, + full_fetch: bool = False, + explore: Optional[bool] = None, + timestep: Optional[int] = None, + episodes=None, + unsquash_actions: Optional[bool] = None, + clip_actions: Optional[bool] = None, + ): + if unsquash_actions is None: + unsquash_actions = self.config.normalize_actions + elif clip_actions is None: + clip_actions = self.config.clip_actions + + state_defined = state is not None + policy = self.get_policy(policy_id) + filtered_obs, filtered_state = [], [] + for agent_id, ob in observations.items(): + worker = self.env_runner_group.local_env_runner + if worker.preprocessors.get(policy_id) is not None: + preprocessed = worker.preprocessors[policy_id].transform(ob) + else: + preprocessed = ob + filtered = worker.filters[policy_id](preprocessed, update=False) + filtered_obs.append(filtered) + if state is None: + continue + elif agent_id in state: + filtered_state.append(state[agent_id]) + else: + filtered_state.append(policy.get_initial_state()) + + obs_batch = np.stack(filtered_obs) + if state is None: + state = [] + else: + state = list(zip(*filtered_state)) + state = [np.stack(s) for s in state] + + input_dict = {Columns.OBS: obs_batch} + + if prev_action is not None: + input_dict[SampleBatch.PREV_ACTIONS] = prev_action + if prev_reward is not None: + input_dict[SampleBatch.PREV_REWARDS] = prev_reward + if info: + input_dict[Columns.INFOS] = info + for i, s in enumerate(state): + input_dict[f"state_in_{i}"] = s + + actions, states, infos = policy.compute_actions_from_input_dict( + input_dict=input_dict, + explore=explore, + timestep=timestep, + episodes=episodes, + ) + + single_actions = space_utils.unbatch(actions) + actions = {} + for key, a in zip(observations, single_actions): + if unsquash_actions: + a = space_utils.unsquash_action(a, policy.action_space_struct) + elif clip_actions: + a = space_utils.clip_action(a, policy.action_space_struct) + actions[key] = a + + unbatched_states = {} + for idx, agent_id in enumerate(observations): + unbatched_states[agent_id] = [s[idx] for s in states] + + if state_defined or full_fetch: + return actions, unbatched_states, infos + else: + return actions + @Deprecated( new="Algorithm.env_runner_group", error=False, From e33624ce14a5eff616866067e7d94e3513b0d2a0 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sun, 5 Jan 2025 16:33:51 +0100 Subject: [PATCH 02/22] Merge branch 'master' of https://github.com/ray-project/ray into docs_redo_getting_started Signed-off-by: sven1977 # Conflicts: # doc/source/rllib/rllib-training.rst --- doc/source/rllib/doc_code/getting_started.py | 33 ---- doc/source/rllib/rllib-training.rst | 154 +++++++++++++++--- rllib/algorithms/algorithm.py | 120 +++++++------- rllib/examples/_docs/rllib_on_rllib_readme.py | 6 +- 4 files changed, 191 insertions(+), 122 deletions(-) diff --git a/doc/source/rllib/doc_code/getting_started.py b/doc/source/rllib/doc_code/getting_started.py index 549a506ed043..95236f14aa41 100644 --- a/doc/source/rllib/doc_code/getting_started.py +++ b/doc/source/rllib/doc_code/getting_started.py @@ -49,39 +49,6 @@ # 
__rllib-tuner-end__ -# __rllib-compute-action-begin__ -import pathlib -import gymnasium as gym -import numpy as np -import torch -from ray.rllib.core.rl_module import RLModule - -env = gym.make("CartPole-v1") - -# Create only the neural network (RLModule) from our checkpoint. -rl_module = RLModule.from_checkpoint( - pathlib.Path(best_checkpoint.path) / "learner_group" / "learner" / "rl_module" -)["default_policy"] - -episode_return = 0 -terminated = truncated = False - -obs, info = env.reset() - -while not terminated and not truncated: - # Compute the next action from a batch (B=1) of observations. - torch_obs_batch = torch.from_numpy(np.array([obs])) - action_logits = rl_module.forward_inference({"obs": torch_obs_batch})[ - "action_dist_inputs" - ] - # The default RLModule used here produces action logits (from which - # we'll have to sample an action or use the max-likelihood one). - action = torch.argmax(action_logits[0]).numpy() - obs, reward, terminated, truncated, info = env.step(action) - episode_return += reward - -print(f"Reached episode return of {episode_return}.") -# __rllib-compute-action-end__ del rl_module diff --git a/doc/source/rllib/rllib-training.rst b/doc/source/rllib/rllib-training.rst index fb8617179cc2..6ddf077307c2 100644 --- a/doc/source/rllib/rllib-training.rst +++ b/doc/source/rllib/rllib-training.rst @@ -7,22 +7,30 @@ Getting Started =============== -All RLlib experiments are run using an ``Algorithm`` class which holds a policy for environment interaction. -Through the algorithm's interface, you can train the policy, compute actions, or store your algorithm's state (checkpointing). -In multi-agent training, the algorithm manages the querying and optimization of multiple policies at once. +In this tutorial, you learn how to design, customize, and run an RLlib learning experiment from scratch. -In this guide, we will explain in detail RLlib's Python API for running learning experiments. +.. _rllib-in-15min: RLlib in 15 minutes ------------------- - .. _rllib-python-api: Python API ~~~~~~~~~~ +You manage experiments in RLlib through an instance of the :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` class. An +Algorithm typically holds a neural network for computing actions, called "policy", the :ref:`RL environment ` +you want to optimize against, a loss function, an optimizer, and some code describing the algorithm's execution logic, like determining when to +take which particular steps. + +Through the algorithm's interface, you can train the policy, compute actions, or store your algorithm's state (checkpointing). +In multi-agent training, the algorithm manages the querying and optimization of multiple policies at once. + + + + The Python API provides all the flexibility required for applying RLlib to any type of problem. Let's start with an example of the API's basic usage. @@ -58,10 +66,6 @@ iterations (here `10`) and `save` the resulting policy periodically (here every checkpoint_dir = ppo.save_to_path() print(f"Algorithm checkpoint saved in: {checkpoint_dir}") -.. testcode:: - :hide: - - algo.stop() .. _rllib-with-ray-tune: @@ -123,10 +127,72 @@ To load newer RLlib checkpoints (version >= 1.0), use the following code: Customizing your RL environment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -In the preceding examples, your RL environment was always "CartPole-v1", however, you would probably like to -run your actual experiments against a different environment or even write your own custom one. 
- -See here ...blabla +In the preceding examples, your :ref:`RL environment ` was a `Farama gymnasium `__ +pre-registered one, like ``CartPole-v1``. However, if you would like to run your experiments against a different environment or even write a custom one, +see this tab below for a less-than-50-lines example of a custom ``gym.Env`` class. + +See here for an :ref:`in-depth guide on how to setup RL environments in RLlib ` and how to customize them. + +.. dropdown:: Quickstart: Custom RL environment + :animate: fade-in-slide-down + + .. testcode:: + + import gymnasium as gym + from ray.rllib.algorithms.ppo import PPOConfig + + # 1) Define your custom env class: + + class ParrotEnv(gym.Env): + """Environment in which the agent learns to repeat the seen observations. + + Observations are float numbers indicating the to-be-repeated values, + e.g. -1.0, 5.1, or 3.2. + The action space is the same as the observation space. + Rewards are `r=-abs([observation] - [action])`, for all steps. + """ + def __init__(self, config=None): + # Since actions should repeat observations, their spaces must be the same. + self.observation_space = gym.spaces.Box(-1.0, 1.0, (1,), np.float32) + self.action_space = self.observation_space + self._cur_obs = None + self._episode_len = 0 + + def reset(self, *, seed=None, options=None): + """Resets the environment, starting a new episode.""" + # Reset the episode len. + self._episode_len = 0 + # Sample a random number from our observation space. + self._cur_obs = self.observation_space.sample() + # Return initial observation. + return self._cur_obs, {} + + def step(self, action): + """Takes a single step in the episode given `action`.""" + # Set `terminated` and `truncated` flags to True after 10 steps. + self._episode_len += 1 + terminated = truncated = self._episode_len >= 10 + # Compute the reward: `r = -abs([obs] - [action])` + reward = -sum(abs(self._cur_obs - action)) + # Set a new observation (random sample). + self._cur_obs = self.observation_space.sample() + return self._cur_obs, reward, terminated, truncated, {} + + # 2) Configure it through RLlib's algorithm configs: + config = ( + PPOConfig() + .environment(ParrotEnv) # add `env_config=[some Box space] to customize the env + ) + + # 3) Build the PPO and train + ppo_w_custom_env = config.build() + + .. testcode:: + :hide: + + # Test that our setup is working. + ppo_w_custom_env.train() + ppo_w_custom_env.stop() Customizing your models ~~~~~~~~~~~~~~~~~~~~~~~ @@ -140,21 +206,61 @@ for a detailed guide on how to do so. Deploying your models and computing actions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The simplest way to programmatically compute actions from a trained :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` +is to get the :py:class:`~ray.rllib.core.rl_module.rl_module.RLModule` through :py:meth:`~ray.rllib.algorithms.algorithm.Algorithm.get_module`, +then call the module's :py:meth:`~ray.rllib.core.rl_module.rl_module.forward_inference` method. + +Here is an example of how to test a trained agent for one episode: +.. testcode:: -The simplest way to programmatically compute actions from a trained agent is to -use ``Algorithm.compute_single_action()``. -This method preprocesses and filters the observation before passing it to the agent -policy. -Here is a simple example of testing a trained agent for one episode: + import gymnasium as gym + import numpy as np + import torch + from ray.rllib.core.rl_module import RLModule -.. 
literalinclude:: ./doc_code/getting_started.py - :language: python - :start-after: rllib-compute-action-begin - :end-before: rllib-compute-action-end + env = gym.make("CartPole-v1") + + # Get the RLModule from the up and running Algorithm instance: + rl_module = ppo.get_module() + + episode_return = 0 + terminated = truncated = False + + obs, info = env.reset() + + while not terminated and not truncated: + # Compute the next action from a batch (B=1) of observations. + obs_batch = torch.from_numpy(obs).unsqueeze(0) # add batch B=1 dimension + # Extract the logits from the output and dissolve batch again. + action_logits = rl_module.forward_inference({"obs": obs_batch})[ + "action_dist_inputs" + ][0] + # PPO's default RLModule produces action logits (from which + # you have to sample an action or use the max-likelihood one). + action = numpy.argmax(action_logits.numpy()) + # Send the action to the environment for the next step. + obs, reward, terminated, truncated, info = env.step(action) + episode_return += reward + + print(f"Reached episode return of {episode_return}.") + + +If you don't have your Algorithm instance up and running anymore and would like to create the trained RLModule +from a checkpoint, you can do the following instead. +Note that `best_checkpoint` is the highest performing Algorithm checkpoint you created +in the preceding experiment. To learn more about checkpoints and their structure, see this :ref:`checkpointing guide `. + +.. testcode:: + + from pathlib import Path + + # Create only the neural network (RLModule) from our checkpoint. + rl_module = RLModule.from_checkpoint( + Path(best_checkpoint.path) / "learner_group" / "learner" / "rl_module" + )["default_policy"] -For more advanced usage on computing actions and other functionality, -you can consult the :ref:`RLlib Algorithm API documentation `. + # Do the same computations with `rl_module` as in the preceding code snippet. Accessing Policy State diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index fd52894dc432..dc1123942a84 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -230,32 +230,33 @@ def _get_learner_bundles(cf: AlgorithmConfig) -> List[Dict[str, int]]: class Algorithm(Checkpointable, Trainable, AlgorithmBase): """An RLlib algorithm responsible for optimizing one or more Policies. - Algorithms contain a EnvRunnerGroup under `self.env_runner_group`. An EnvRunnerGroup - is composed of a single local EnvRunner (`self.env_runner_group.local_env_runner`), + Algorithms contain a EnvRunnerGroup under `self.env_runner_group`, which is + composed of one local EnvRunner (`self.env_runner_group.local_env_runner`), serving as the reference copy of the NeuralNetwork(s) to be trained and optionally - one or more remote EnvRunners used to generate environment samples in parallel. + one or more remote EnvRunner actors used to generate environment samples. EnvRunnerGroup is fault-tolerant and elastic. It tracks health states for all the managed remote EnvRunner actors. As a result, Algorithm should never access the underlying actor handles directly. Instead, always access them via all the foreach APIs with assigned IDs of the underlying EnvRunners. - Each EnvRunners (remotes or local) contains a PolicyMap, which itself - may contain either one policy for single-agent training or one or more - policies for multi-agent training. 
Policies are synchronized + Each EnvRunners (remotes or local) contains an RLModule, which + contains either one policy network for single-agent training or one or more + policy networks for multi-agent training. RLModules are synchronized automatically from time to time using ray.remote calls. The exact - synchronization logic depends on the specific algorithm used, - but this usually happens from local worker to all remote workers and - after each training update. + synchronization logic depends on the specific algorithm used and typically + happens from local worker to all remote workers and after each training update. You can write your own Algorithm classes by sub-classing from `Algorithm` or any of its built-in sub-classes. - This allows you to override the `training_step` method to implement - your own algorithm logic. You can find the different built-in - algorithms' `training_step()` methods in their respective main .py files, - e.g. rllib.algorithms.dqn.dqn.py or rllib.algorithms.impala.impala.py. - - The most important API methods a Algorithm exposes are `train()`, - `evaluate()`, `save_to_path()` and `restore_from_path()`. + Override the `training_step` method to implement your own algorithm logic. You can + find the different built-in algorithms' `training_step()` methods in their + respective [algorithm name].py files, + e.g. rllib.algorithms.ppo.ppo.py or rllib.algorithms.dqn.dqn.py. + + The most important API methods an Algorithm exposes are `train()` for running a + single training iteration, `evaluate()` for running a single round of evaluation, + `save_to_path()` for creating a checkpoint, and `restore_from_path()` for loading a + state from an existing checkpoint. """ # Whether to allow unknown top-level config keys. @@ -307,7 +308,7 @@ class Algorithm(Checkpointable, Trainable, AlgorithmBase): @override(Checkpointable) def from_checkpoint( cls, - path: Optional[Union[str, Checkpoint]] = None, + path: Union[str, Checkpoint], filesystem: Optional["pyarrow.fs.FileSystem"] = None, *, # @OldAPIStack @@ -349,19 +350,17 @@ def from_checkpoint( deprecation_warning( old="Algorithm.from_checkpoint(checkpoint=...)", new="Algorithm.from_checkpoint(path=...)", - error=False, - ) - path = checkpoint - if path is None: - raise ValueError( - "`path` not provided in call to Algorithm.from_checkpoint()!" + error=True, ) - checkpoint_info = get_checkpoint_info(path) + # New API stack -> Use Checkpointable's default implementation. + if checkpoint_info["checkpoint_version"] >= version.Version("2.0"): + return super().from_checkpoint(path, filesystem=filesystem, **kwargs) + # Not possible for (v0.1) (algo class and config information missing # or very hard to retrieve). - if checkpoint_info["checkpoint_version"] == version.Version("0.1"): + elif checkpoint_info["checkpoint_version"] == version.Version("0.1"): raise ValueError( "Cannot restore a v0 checkpoint using `Algorithm.from_checkpoint()`!" "In this case, do the following:\n" @@ -375,9 +374,6 @@ def from_checkpoint( "()` must be 1.0 or later! You are using a checkpoint with " f"version v{checkpoint_info['checkpoint_version']}." ) - # New API stack -> Use Checkpointable's default implementation. - elif checkpoint_info["checkpoint_version"] >= version.Version("2.0"): - return super().from_checkpoint(path, filesystem=filesystem, **kwargs) # This is a msgpack checkpoint. 
if checkpoint_info["format"] == "msgpack": @@ -411,40 +407,6 @@ def from_checkpoint( return Algorithm.from_state(state) - @OldAPIStack - @staticmethod - def from_state(state: Dict) -> "Algorithm": - """Recovers an Algorithm from a state object. - - The `state` of an instantiated Algorithm can be retrieved by calling its - `get_state` method. It contains all information necessary - to create the Algorithm from scratch. No access to the original code (e.g. - configs, knowledge of the Algorithm's class, etc..) is needed. - - Args: - state: The state to recover a new Algorithm instance from. - - Returns: - A new Algorithm instance. - """ - algorithm_class: Type[Algorithm] = state.get("algorithm_class") - if algorithm_class is None: - raise ValueError( - "No `algorithm_class` key was found in given `state`! " - "Cannot create new Algorithm." - ) - # algo_class = get_trainable_cls(algo_class_name) - # Create the new algo. - config = state.get("config") - if not config: - raise ValueError("No `config` found in given Algorithm state!") - new_algo = algorithm_class(config=config) - # Set the new algo's state. - new_algo.__setstate__(state) - - # Return the new algo. - return new_algo - @PublicAPI def __init__( self, @@ -2279,6 +2241,40 @@ def fn(worker): if remove_from_eval_env_runners and self.eval_env_runner_group is not None: self.eval_env_runner_group.foreach_env_runner(fn, local_env_runner=True) + @OldAPIStack + @staticmethod + def from_state(state: Dict) -> "Algorithm": + """Recovers an Algorithm from a state object. + + The `state` of an instantiated Algorithm can be retrieved by calling its + `get_state` method. It contains all information necessary + to create the Algorithm from scratch. No access to the original code (e.g. + configs, knowledge of the Algorithm's class, etc..) is needed. + + Args: + state: The state to recover a new Algorithm instance from. + + Returns: + A new Algorithm instance. + """ + algorithm_class: Type[Algorithm] = state.get("algorithm_class") + if algorithm_class is None: + raise ValueError( + "No `algorithm_class` key was found in given `state`! " + "Cannot create new Algorithm." + ) + # algo_class = get_trainable_cls(algo_class_name) + # Create the new algo. + config = state.get("config") + if not config: + raise ValueError("No `config` found in given Algorithm state!") + new_algo = algorithm_class(config=config) + # Set the new algo's state. + new_algo.__setstate__(state) + + # Return the new algo. + return new_algo + @OldAPIStack def export_policy_model( self, diff --git a/rllib/examples/_docs/rllib_on_rllib_readme.py b/rllib/examples/_docs/rllib_on_rllib_readme.py index 4463eba4ce85..be63d2da2c78 100644 --- a/rllib/examples/_docs/rllib_on_rllib_readme.py +++ b/rllib/examples/_docs/rllib_on_rllib_readme.py @@ -46,7 +46,7 @@ def step(self, action): Returns: New observation, reward, done-flag, info-dict (empty). """ - # Set `done` and `truncated` flags after 10 steps. + # Set `terminated` and `truncated` flags to True after 10 steps. self.episode_len += 1 terminated = truncated = self.episode_len >= 10 # r = -abs(obs - action) @@ -60,9 +60,9 @@ def step(self, action): # act in the above environment. config = ( PPOConfig().environment( - # Env class to use (here: our gym.Env sub-class from above). + # Env class to use (your gym.Env subclass from above). env=ParrotEnv, - # Config dict to be passed to our custom env's constructor. + # Config dict to be passed to your custom env's constructor. 
env_config={"parrot_shriek_range": gym.spaces.Box(-5.0, 5.0, (1,))}, ) # Parallelize environment rollouts. From 042909da64a2a02eb662441d3af8c291f67d1735 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 8 Jan 2025 20:08:01 +0100 Subject: [PATCH 03/22] wip Signed-off-by: sven1977 --- doc/source/rllib/doc_code/rllib_in_60s.py | 25 -------- doc/source/rllib/index.rst | 72 ++++++++++++++++++----- doc/source/rllib/rllib-training.rst | 62 +++++++++++++------ 3 files changed, 99 insertions(+), 60 deletions(-) delete mode 100644 doc/source/rllib/doc_code/rllib_in_60s.py diff --git a/doc/source/rllib/doc_code/rllib_in_60s.py b/doc/source/rllib/doc_code/rllib_in_60s.py deleted file mode 100644 index 6d214504f15d..000000000000 --- a/doc/source/rllib/doc_code/rllib_in_60s.py +++ /dev/null @@ -1,25 +0,0 @@ -# flake8: noqa - -# __rllib-in-60s-begin__ -from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.connectors.env_to_module import FlattenObservations - -# 1. Configure the algorithm, -config = ( - PPOConfig() - .environment("Taxi-v3") - .env_runners( - num_env_runners=2, - # Observations are discrete (ints) -> We need to flatten (one-hot) them. - env_to_module_connector=lambda env: FlattenObservations(), - ) - .evaluation(evaluation_num_env_runners=1) -) -# 2. build the algorithm .. -algo = config.build() -# 3. .. train it .. -for _ in range(5): - print(algo.train()) -# 4. .. and evaluate it. -algo.evaluate() -# __rllib-in-60s-end__ diff --git a/doc/source/rllib/index.rst b/doc/source/rllib/index.rst index 1d7c05099141..d42a5c272583 100644 --- a/doc/source/rllib/index.rst +++ b/doc/source/rllib/index.rst @@ -101,32 +101,72 @@ Install RLlib and `PyTorch `__, as shown below: .. note:: To be able to run the Atari or MuJoCo examples, you also need to do: - `pip install "gymnasium[atari,accept-rom-license,mujoco]"`. -This is all. You can now start coding against RLlib. Here is an example for running the PPO Algorithm on the + .. code-block:: bash + + `pip install "gymnasium[atari,accept-rom-license,mujoco]"`. + +This is all, you can now start coding against RLlib. Here is an example for running the :ref:`PPO Algorithm ` on the `Taxi domain `__. -You first create a `config` for the algorithm, which defines the RL environment and -any other needed settings and parameters. +You first create a `config` for the algorithm, which defines the :ref:`RL environment ` and any other needed settings and parameters. + +.. testcode:: + + from ray.rllib.algorithms.ppo import PPOConfig + from ray.rllib.connectors.env_to_module import FlattenObservations + + # Configure the algorithm. + config = ( + PPOConfig() + .environment("Taxi-v3") + .env_runners( + num_env_runners=2, + # Observations are discrete (ints) -> We need to flatten (one-hot) them. + env_to_module_connector=lambda env: FlattenObservations(), + ) + .evaluation(evaluation_num_env_runners=1) + ) + + +Next, ``build`` the algorithm and ``train`` it for a total of five iterations. +One training iteration includes parallel, distributed sample collection by the +:py:class:`~ray.rllib.env.env_runner.EnvRunner` actors, followed by loss calculation +on the collected data, and a model update step. + +.. testcode:: + + from pprint import pprint + + # Build the algorithm. + algo = config.build_algo() + + # Train it for 5 iterations ... + for _ in range(5): + pprint(algo.train()) + +At the end of your script, you evaluate the trained Algorithm: + +.. testcode:: + + # ... and evaluate it. 
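+    # Note: `evaluate()` runs the separate evaluation EnvRunner configured above through
+    # `.evaluation(evaluation_num_env_runners=1)` and returns a results dict, similar to
+    # what `train()` returns.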
+ pprint(algo.evaluate()) + -Next, `build` the algorithm and `train` it for a total of five iterations. -One training iteration includes parallel, distributed sample collection by the :py:class:`~ray.rllib.env.env_runner.EnvRunner` actors, -followed by loss calculation on the collected data, and a model update step. +.. testcode:: + :hide: -At the end of your script, RLlib evaluates the trained Algorithm: + algo.stop() -.. literalinclude:: doc_code/rllib_in_60s.py - :language: python - :start-after: __rllib-in-60s-begin__ - :end-before: __rllib-in-60s-end__ You can use any `Farama-Foundation Gymnasium `__ registered environment -with the `env` argument. +with the ``env`` argument. -In `config.env_runners()` you can specify - amongst many other things - the number of parallel +In ``config.env_runners()`` you can specify - amongst many other things - the number of parallel :py:class:`~ray.rllib.env.env_runner.EnvRunner` actors to collect samples from the environment. -You can also tweak the NN architecture used by tweaking RLlib's `DefaultModelConfig`, as well as, set up a separate -config for the evaluation :py:class:`~ray.rllib.env.env_runner.EnvRunner` actors through the `config.evaluation()` method. +You can also tweak the NN architecture used by tweaking RLlib's :py:class:`~ray.rllib.core.rl_module.default_model_cnofig.DefaultModelConfig`, +as well as, set up a separate config for the evaluation +:py:class:`~ray.rllib.env.env_runner.EnvRunner` actors through the ``config.evaluation()`` method. `See here `_, if you want to learn more about the RLlib training APIs. Also, `see here `__ diff --git a/doc/source/rllib/rllib-training.rst b/doc/source/rllib/rllib-training.rst index 6ddf077307c2..692f3507bf90 100644 --- a/doc/source/rllib/rllib-training.rst +++ b/doc/source/rllib/rllib-training.rst @@ -7,31 +7,54 @@ Getting Started =============== -In this tutorial, you learn how to design, customize, and run an RLlib learning experiment from scratch. +.. _rllib-in-60min: +RLlib in 60 minutes +------------------- -.. _rllib-in-15min: +.. figure:: images/rllib-index-header.svg + +In this tutorial, you learn how to design, customize, and run an end-to-end RLlib learning experiment +from scratch. This includes picking and configuring an Algorithm, running a couple of training iterations, +saving the state of your Algorithm from time to time, running a separate evaluation loop, +and finally utilizing one of the checkpoints to deploy your trained model in an environment outside of RLlib +and compute actions through it. + +You also learn how to optionally customize your RL environment and your neural network model. + +Installation +~~~~~~~~~~~~ + +First, install RLlib and `PyTorch `__, as shown below: + +.. code-block:: bash + + pip install "ray[rllib]" "gymnasium[atari,accept-rom-license,mujoco]" torch -RLlib in 15 minutes -------------------- .. _rllib-python-api: Python API ~~~~~~~~~~ -You manage experiments in RLlib through an instance of the :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` class. An -Algorithm typically holds a neural network for computing actions, called "policy", the :ref:`RL environment ` -you want to optimize against, a loss function, an optimizer, and some code describing the algorithm's execution logic, like determining when to -take which particular steps. +RLlib's Python API provides all the flexibility required for applying the library to any +type of RL problem. 
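+For example, a single :py:class:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig` object chains
+together environment setup, scaling, training hyper-parameters, and evaluation options. The following
+sketch only combines settings that appear individually elsewhere in this guide; the concrete values are
+placeholders, not tuned recommendations:
+
+.. testcode::
+
+    from ray.rllib.algorithms.ppo import PPOConfig
+
+    sketch_config = (
+        PPOConfig()
+        # The RL environment to train against.
+        .environment("CartPole-v1")
+        # How many EnvRunner actors collect samples in parallel.
+        .env_runners(num_env_runners=2)
+        # Core training hyper-parameters (placeholder values).
+        .training(lr=0.0002, train_batch_size_per_learner=2000, num_epochs=10)
+        # A separate EnvRunner setup used only for evaluation.
+        .evaluation(evaluation_num_env_runners=1)
+    )
+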
+ +You manage experiments in RLlib through an instance of the :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` +class. An :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` typically holds a neural +network for computing actions, called "policy", the :ref:`RL environment ` +you want to optimize against, a loss function, an optimizer, and some code describing the +algorithm's execution logic, like determining when to take which particular steps. + +In multi-agent training, :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` +manages the querying and optimization of multiple policies at once. -Through the algorithm's interface, you can train the policy, compute actions, or store your algorithm's state (checkpointing). -In multi-agent training, the algorithm manages the querying and optimization of multiple policies at once. +Through the algorithm's interface, you can train the policy, compute actions, or store your +algorithm's state through checkpointing. -The Python API provides all the flexibility required for applying RLlib to any type of problem. Let's start with an example of the API's basic usage. We first create a `PPOConfig` instance and set some properties through the config class' various methods. @@ -114,7 +137,7 @@ of the training results and retrieving the checkpoint(s) of the trained agent. Loading and restoring a trained algorithm from a checkpoint is simple. Let's assume you have a local checkpoint directory called ``checkpoint_path``. -To load newer RLlib checkpoints (version >= 1.0), use the following code: +To load newer RLlib checkpoints (version >= 2.1), use the following code: .. code-block:: python @@ -128,8 +151,8 @@ Customizing your RL environment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In the preceding examples, your :ref:`RL environment ` was a `Farama gymnasium `__ -pre-registered one, like ``CartPole-v1``. However, if you would like to run your experiments against a different environment or even write a custom one, -see this tab below for a less-than-50-lines example of a custom ``gym.Env`` class. +pre-registered one, like ``CartPole-v1``. However, if you would like to run your experiments against a custom one, +see this tab below for a less-than-50-lines example. See here for an :ref:`in-depth guide on how to setup RL environments in RLlib ` and how to customize them. @@ -141,7 +164,7 @@ See here for an :ref:`in-depth guide on how to setup RL environments in RLlib Date: Thu, 9 Jan 2025 10:05:20 +0100 Subject: [PATCH 04/22] wip Signed-off-by: sven1977 --- doc/source/rllib/rllib-training.rst | 88 ++++++++++++++++++++++++----- 1 file changed, 73 insertions(+), 15 deletions(-) diff --git a/doc/source/rllib/rllib-training.rst b/doc/source/rllib/rllib-training.rst index 692f3507bf90..c86ac015b137 100644 --- a/doc/source/rllib/rllib-training.rst +++ b/doc/source/rllib/rllib-training.rst @@ -1,12 +1,12 @@ .. include:: /_includes/rllib/we_are_hiring.rst -.. include:: /_includes/rllib/new_api_stack.rst - .. _rllib-getting-started: Getting Started =============== +.. include:: /_includes/rllib/new_api_stack.rst + .. _rllib-in-60min: RLlib in 60 minutes @@ -20,7 +20,7 @@ saving the state of your Algorithm from time to time, running a separate evaluat and finally utilizing one of the checkpoints to deploy your trained model in an environment outside of RLlib and compute actions through it. -You also learn how to optionally customize your RL environment and your neural network model. +You also learn how to customize your RL environment and your neural network model. 
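+
+As a preview, the sketch below condenses this workflow into a few lines; names such as ``quick_algo`` are
+placeholders, and each step is explained in detail in the sections that follow:
+
+.. code-block:: python
+
+    from ray.rllib.algorithms.ppo import PPOConfig
+
+    # Configure and build an Algorithm ...
+    quick_algo = (
+        PPOConfig()
+        .environment("CartPole-v1")
+        .evaluation(evaluation_num_env_runners=1)
+        .build_algo()
+    )
+    # ... train it for one iteration and run one round of evaluation ...
+    quick_algo.train()
+    quick_algo.evaluate()
+    # ... then save a checkpoint you can later restore the trained model from.
+    quick_ckpt_path = quick_algo.save_to_path()
+    quick_algo.stop()
+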
Installation ~~~~~~~~~~~~ @@ -46,24 +46,28 @@ network for computing actions, called "policy", the :ref:`RL environment `, +:py:class:`~ray.rllib.algorithms.algorithm.Algorithm` manages the querying and optimization of multiple policies at once. Through the algorithm's interface, you can train the policy, compute actions, or store your algorithm's state through checkpointing. +Configure and build the algorithm ++++++++++++++++++++++++++++++++++ +You first create an :py:class:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig` instance +and change some default settings through the config object's various methods. +For example, we can set the RL environment we want to use by calling the config's +:py:meth:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig.environment` method. -Let's start with an example of the API's basic usage. -We first create a `PPOConfig` instance and set some properties through the config class' various methods. -For example, we can set the RL environment we want to use by calling the config's `environment` method. -To scale our algorithm and define, how many environment workers (EnvRunners) we want to leverage, we can call -the `env_runners` method. -After we `build` the `PPO` Algorithm from its configuration, we can `train` it for a number of -iterations (here `10`) and `save` the resulting policy periodically (here every `5` iterations). +To scale our setup and define, how many EnvRunner actors you want to leverage, +you can call the :py:meth:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig.env_runners` method. +Finally, you build the actual :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` instance +through calling the :py:meth:`~ray.rllib.algorithm.algorithm_config.AlgorithmConfig.build_algo` +method. .. testcode:: @@ -72,11 +76,65 @@ iterations (here `10`) and `save` the resulting policy periodically (here every # Configure the Algorithm (PPO). config = ( PPOConfig() - .environment("CartPole-v1") - .env_runners(num_env_runners=1) + .environment("Pendulum-v1") + .env_runners(num_env_runners=3) + .training( + lr=0.0002, + train_batch_size_per_learner=2000, + num_epochs=10, + ) ) + # Build the Algorithm (PPO). - ppo = config.build() + ppo = config.build_algo() + + +.. note:: + + See here to learn, which config methods you can use to configure your Algorithm, see here. + + +Run the algorithm ++++++++++++++++++ + +After you have built your :ref:`PPO ` from its configuration, you can ``train`` it for a number of +iterations through calling its :py:meth:`~ray.rllib.algorithms.algorithm.Algorithm.train` method. + +It returns a result dictionary that you can pretty-print for debugging purposes: + +.. testcode:: + + from pprint import pprint + + for _ in range(5): + pprint(ppo.train()) + + +Checkpoint the algorithm +++++++++++++++++++++++++ + +To save your Algorithm's current state, create a so-called ``checkpoint`` through +calling its `save_to_path` method. It returns the location of the saved checkpoint. + +Alternatively to not passing any arguments and letting the algorithm decide, where to save +the checkpoint, you can provide a checkpoint directory yourself: + +.. testcode:: + + checkpoint_path = ppo.save_to_path() + + # OR: + # ppo.save_to_path([a checkpoint location of your choice]) + + +Evaluate the algorithm +++++++++++++++++++++++ + + +Restore the model + + +Let's start with an example of the API's basic usage. # Train for 10 iterations. 
for i in range(10): From e0d6ce65d09361b5290a544ec8a6f22b8649833b Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 17 Jan 2025 21:03:28 +0100 Subject: [PATCH 05/22] wip Signed-off-by: sven1977 --- doc/source/rllib/doc_code/getting_started.py | 81 --- doc/source/rllib/doc_code/training.py | 3 +- doc/source/rllib/getting-started.rst | 487 +++++++++++++++++++ doc/source/rllib/index.rst | 2 +- doc/source/rllib/rllib-training.rst | 462 +----------------- 5 files changed, 493 insertions(+), 542 deletions(-) delete mode 100644 doc/source/rllib/doc_code/getting_started.py create mode 100644 doc/source/rllib/getting-started.rst diff --git a/doc/source/rllib/doc_code/getting_started.py b/doc/source/rllib/doc_code/getting_started.py deleted file mode 100644 index 95236f14aa41..000000000000 --- a/doc/source/rllib/doc_code/getting_started.py +++ /dev/null @@ -1,81 +0,0 @@ -# flake8: noqa - -if False: - # __rllib-tune-config-begin__ - from ray import train, tune - - config = ( - PPOConfig() - .environment("CartPole-v1") - .training( - lr=tune.grid_search([0.01, 0.001, 0.0001]), - ) - ) - - tuner = tune.Tuner( - "PPO", - param_space=config, - run_config=train.RunConfig( - stop={"env_runners/episode_return_mean": 150.0}, - ), - ) - - tuner.fit() - # __rllib-tune-config-end__ - - -# __rllib-tuner-begin__ -from ray import train, tune - -# Tuner.fit() allows setting a custom log directory (other than ~/ray-results). -tuner = tune.Tuner( - "PPO", - param_space=config, - run_config=train.RunConfig( - stop={"num_env_steps_sampled_lifetime": 20000}, - checkpoint_config=train.CheckpointConfig(checkpoint_at_end=True), - ), -) - -results = tuner.fit() - -# Get the best result based on a particular metric. -best_result = results.get_best_result( - metric="env_runners/episode_return_mean", mode="max" -) - -# Get the best checkpoint corresponding to the best result. -best_checkpoint = best_result.checkpoint -# __rllib-tuner-end__ - - - - -del rl_module - - -# __rllib-get-state-begin__ -from ray.rllib.algorithms.ppo import PPOConfig - -algo = ( - PPOConfig() - .environment("CartPole-v1") - .env_runners(num_env_runners=2) -).build() - -# Get weights of the algo's RLModule. -algo.get_module().get_state() - -# Same as above -algo.env_runner.module.get_state() - -# Get list of weights of each EnvRunner, including remote replicas. -algo.env_runner_group.foreach_worker(lambda env_runner: env_runner.module.get_state()) - -# Same as above, but with index. -algo.env_runner_group.foreach_worker_with_id( - lambda _id, env_runner: env_runner.module.get_state() -) -# __rllib-get-state-end__ - -algo.stop() diff --git a/doc/source/rllib/doc_code/training.py b/doc/source/rllib/doc_code/training.py index 75bf8a48f18c..2e50cd6e0425 100644 --- a/doc/source/rllib/doc_code/training.py +++ b/doc/source/rllib/doc_code/training.py @@ -36,7 +36,8 @@ algo = ( DQNConfig() .api_stack( - enable_rl_module_and_learner=False, enable_env_runner_and_connector_v2=False + enable_rl_module_and_learner=False, + enable_env_runner_and_connector_v2=False, ) .framework("torch") .environment("CartPole-v1") diff --git a/doc/source/rllib/getting-started.rst b/doc/source/rllib/getting-started.rst new file mode 100644 index 000000000000..f280185cc4e6 --- /dev/null +++ b/doc/source/rllib/getting-started.rst @@ -0,0 +1,487 @@ +.. include:: /_includes/rllib/we_are_hiring.rst + +.. _rllib-getting-started: + +Getting Started +=============== + +.. include:: /_includes/rllib/new_api_stack.rst + +.. _rllib-in-60min: + +RLlib in 60 minutes +------------------- + +.. 
figure:: images/rllib-index-header.svg
+
+In this tutorial, you learn how to design, customize, and run an end-to-end RLlib learning experiment
+from scratch. This includes picking and configuring an :py:class:`~ray.rllib.algorithms.algorithm.Algorithm`,
+running a couple of training iterations, saving the state of your
+:py:class:`~ray.rllib.algorithms.algorithm.Algorithm` from time to time, running a separate
+evaluation loop, and finally utilizing one of the checkpoints to deploy your trained model
+to an environment outside of RLlib and compute actions.
+
+You also learn how to customize your :ref:`RL environment `
+and your :ref:`neural network model `.
+
+Installation
+~~~~~~~~~~~~
+
+First, install RLlib, `PyTorch `__, and `Farama Gymnasium `__ as shown below:
+
+.. code-block:: bash
+
+    pip install "ray[rllib]" torch "gymnasium[atari,accept-rom-license,mujoco]"
+
+
+.. _rllib-python-api:
+
+Python API
+~~~~~~~~~~
+
+RLlib's Python API provides all the flexibility required for applying the library to any
+type of RL problem.
+
+You manage RLlib experiments through an instance of the :py:class:`~ray.rllib.algorithms.algorithm.Algorithm`
+class. An :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` typically holds a neural
+network for computing actions, called the ``policy``, the :ref:`RL environment `
+that you want to optimize against, a loss function, an optimizer, and some code describing the
+algorithm's execution logic, for example when to collect samples and when to update your model.
+
+In :ref:`multi-agent training `,
+:py:class:`~ray.rllib.algorithms.algorithm.Algorithm` manages the querying and optimization of multiple policies at once.
+
+Through the algorithm's interface, you can train the policy, compute actions, or store your
+algorithm's state through checkpointing.
+
+
+Configure and build the algorithm
++++++++++++++++++++++++++++++++++
+
+You first create an :py:class:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig` instance
+and change some default settings through the config object's various methods.
+
+For example, you can set the :ref:`RL environment ` you want to train against
+by calling the config's :py:meth:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig.environment`
+method:
+
+.. testcode::
+
+    from ray.rllib.algorithms.ppo import PPOConfig
+
+    # Create a config instance for the PPO algorithm.
+    config = (
+        PPOConfig()
+        .environment("Pendulum-v1")
+    )
+
+
+To scale your setup and define how many EnvRunner actors you want to use for parallel sample
+collection, call the :py:meth:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig.env_runners` method:
+
+.. testcode::
+
+    config.env_runners(num_env_runners=3)
+
+For training-related settings or any algorithm-specific settings, use the
+:py:meth:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig.training` method:
+
+.. testcode::
+
+    config.training(
+        lr=0.0002,
+        train_batch_size_per_learner=2000,
+        num_epochs=10,
+    )
+
+Finally, you build the actual :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` instance
+by calling your config's :py:meth:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig.build_algo`
+method:
+
+.. testcode::
+
+    # Build the Algorithm (PPO).
+    ppo = config.build_algo()
+
+
+.. note::
+
+    See here to learn about the :ref:`methods you can use to configure your Algorithm `.
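+
+As a quick sketch of how the config object behaves: each of these methods mutates and returns the
+same :py:class:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig` object, so you can equally
+write the whole configuration as one chained expression and read settings back as plain attributes.
+The attribute names below are assumed to mirror the keyword arguments used above; verify them
+against your RLlib version.
+
+.. testcode::
+
+    from ray.rllib.algorithms.ppo import PPOConfig
+
+    # Same settings as above, written as a single chained expression.
+    chained_config = (
+        PPOConfig()
+        .environment("Pendulum-v1")
+        .env_runners(num_env_runners=3)
+        .training(
+            lr=0.0002,
+            train_batch_size_per_learner=2000,
+            num_epochs=10,
+        )
+    )
+
+    # Settings can be read back as attributes of the config object.
+    assert chained_config.num_env_runners == 3
+    assert chained_config.lr == 0.0002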
+ + +Run the algorithm ++++++++++++++++++ + +After you built your :ref:`PPO ` from its configuration, you can ``train`` it for a number of +iterations through calling the :py:meth:`~ray.rllib.algorithms.algorithm.Algorithm.train` method, +which returns a result dictionary that you can pretty-print for debugging purposes: + +.. testcode:: + + from pprint import pprint + + for _ in range(5): + pprint(ppo.train()) + + +Checkpoint the algorithm +++++++++++++++++++++++++ + +To save the current state of your :py:class:`~ray.rllib.algorithms.algorithm.Algorithm`, +create a ``checkpoint`` through calling its :py:meth:`~ray.rllib.algorithms.algorithm.Algorithm.save_to_path` method, +which returns the directory of the saved checkpoint. + +Instead of not passing any arguments to this call and letting the algorithm decide where to save +the checkpoint, you can also provide a checkpoint directory yourself: + +.. testcode:: + + checkpoint_path = ppo.save_to_path() + + # OR: + # ppo.save_to_path([a checkpoint location of your choice]) + + +Evaluate the algorithm +++++++++++++++++++++++ + +RLlib supports setting up a separate :py:class:`~ray.rllib.env.env_runner_group.EnvRunnerGroup` +for the sole purpose of evaluating your model from time to time on the RL environment. + +Use your config's :py:meth:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig.evaluation` method +to set up the details. By default, RLlib doesn't perform evaluation during training and only reports the +results of collecting training samples with its "regular" :py:class:`~ray.rllib.env.env_runner_group.EnvRunnerGroup`. + + +.. testcode:: + :hide: + + ppo.stop() + + +.. testcode:: + + config.evaluation( + # Run one evaluation round every iteration. + evaluation_interval=1, + + # Create 2 eval EnvRunners in the extra EnvRunnerGroup. + evaluation_num_env_runners=2, + + # Run evaluation for exactly 10 episodes. Note that because you have + # 2 EnvRunners, each one runs through 5 episodes. + evaluation_duration_unit="episodes", + evaluation_duration=10, + ) + + # Rebuild the PPO, but with the extra evaluation EnvRunnerGroup + ppo_with_evaluation = config.build() + + for _ in range(3): + pprint(ppo_with_evaluation.train()) + + +.. _rllib-with-ray-tune: + +RLlib with Ray Tune ++++++++++++++++++++ + +All RLlib :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` classes are compatible with +the :ref:`Ray Tune API `. + +This allows for easy utilization of your configured :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` in +:ref:`Ray Tune ` experiments. + +For example, the following code performs a simple hyper-parameter sweep of your :ref:`PPO ` +through creating three ``Trials``, one for each configured learning rate: + +.. testcode:: + + from ray import train, tune + from ray.rllib.algorithms.ppo import PPOConfig + + config = ( + PPOConfig() + .environment("Pendulum-v1") + # Specify a simple tune hyperparameter sweep. + .training( + lr=tune.grid_search([0.001, 0.0005, 0.0001]), + ) + ) + + # Create a Tuner instance to manage the trials. + tuner = tune.Tuner( + config.algo_class, + param_space=config, + # Specify a stopping criterion. Note that the criterion has to match one of the + # pretty printed result metrics from the results returned previously by + # ``.train()``. + run_config=train.RunConfig( + stop={"env_runners/episode_return_mean": -1000.0}, + ), + ) + # Run the Tuner and capture the results. 
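+    # Note: if you want each trial to write a final checkpoint to restore from later,
+    # you can pass, for example,
+    # `checkpoint_config=train.CheckpointConfig(checkpoint_at_end=True)` into the
+    # RunConfig above. The best-checkpoint retrieval further below assumes that each
+    # trial has saved at least one checkpoint.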
+    results = tuner.fit()
+
+Note that Tune creates a separate :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` instance
+for each :py:class:`~ray.tune.trial.Trial` as a :ref:`Ray actor `, assigns compute resources
+to each trial, and runs the trials in parallel, if possible, on your Ray cluster:
+
+.. code-block:: text
+
+    Trial status: 3 RUNNING
+    Current time: 2025-01-17 18:47:33. Total running time: 3min 0s
+    Logical resource usage: 9.0/12 CPUs, 0/0 GPUs
+    ╭──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
+    │ Trial name                    status        lr    iter    total time (s)    episode_return_mean    ..._sampled_lifetime   │
+    ├──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤
+    │ PPO_Pendulum-v1_b5c41_00000   RUNNING   0.01        29           86.2426               -998.449                  108000   │
+    │ PPO_Pendulum-v1_b5c41_00001   RUNNING   0.001       25           74.4335               -997.079                  100000   │
+    │ PPO_Pendulum-v1_b5c41_00002   RUNNING   0.0001      20           60.0421               -960.293                   80000   │
+    ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
+
+``Tuner.fit()`` returns a ``ResultGrid`` object that allows for a detailed analysis of the
+training process and for retrieving the :ref:`checkpoints ` of the trained
+algorithms and their models:
+
+.. testcode::
+
+    # Get the best result based on a particular metric.
+    best_result = results.get_best_result(
+        metric="env_runners/episode_return_mean", mode="max"
+    )
+
+    # Get the best checkpoint corresponding to the best result
+    # from the preceding experiment.
+    best_checkpoint = best_result.checkpoint
+
+
+Deploy a trained model for production inference
++++++++++++++++++++++++++++++++++++++++++++++++
+
+After training, you might want to deploy your models into a new environment, for example
+to run inference in production. You can do so using the checkpoint directory created in the
+preceding example. To read more about checkpoints, model deployments, and restoring algorithm state,
+see this :ref:`page on checkpointing `.
+
+.. testcode::
+
+    from pathlib import Path
+    import gymnasium as gym
+    import numpy as np
+    import torch
+    from ray.rllib.core.rl_module import RLModule
+
+    # Create only the neural network (RLModule) from our checkpoint.
+    rl_module = RLModule.from_checkpoint(
+        Path(best_checkpoint.path) / "learner_group" / "learner" / "rl_module"
+    )["default_policy"]
+
+    # Create the RL environment to test against (same as was used for
+    # training earlier).
+    env = gym.make("Pendulum-v1")
+
+    episode_return = 0.0
+    done = False
+
+    # Reset the env to get the initial observation.
+    obs, info = env.reset()
+
+    while not done:
+        # Compute the next action from a batch (B=1) of observations.
+        obs_batch = torch.from_numpy(obs).unsqueeze(0)  # add batch B=1 dimension
+
+        # Extract the action distribution inputs from the output and dissolve the batch again.
+        action_logits = rl_module.forward_inference({"obs": obs_batch})[
+            "action_dist_inputs"
+        ][0]
+
+        # For Pendulum-v1's continuous (Box) action space, PPO's default RLModule
+        # outputs the parameters of a diagonal Gaussian: [mean, log_std].
+        # For greedy (deterministic) inference, act with the mean.
+        action = action_logits[:1].numpy()
+
+        # Send the action to the environment for the next step.
+        obs, reward, terminated, truncated, info = env.step(action)
+
+        # Perform env-loop bookkeeping.
+        episode_return += reward
+        done = terminated or truncated
+
+    print(f"Reached episode return of {episode_return}.")
+
+
+If you still have an :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` instance up and running
+in your script, you can also get the :py:class:`~ray.rllib.core.rl_module.rl_module.RLModule` through the
+:py:meth:`~ray.rllib.algorithms.algorithm.Algorithm.get_module` method:
+
+.. testcode::
+
+    rl_module = ppo_with_evaluation.get_module("default_policy")
+
+
+Customizing your RL environment
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In the preceding examples, your :ref:`RL environment ` was
+a `Farama gymnasium `__ pre-registered one,
+like ``Pendulum-v1`` or ``CartPole-v1``. However, if you would like to run your
+experiments against a custom one, see the dropdown below for a less-than-50-lines example.
+
+See here for an :ref:`in-depth guide on how to set up RL environments in RLlib ` and how to customize them.
+
+.. dropdown:: Quickstart: Custom RL environment
+    :animate: fade-in-slide-down
+
+    .. testcode::
+
+        import gymnasium as gym
+        import numpy as np
+        from ray.rllib.algorithms.ppo import PPOConfig
+
+        # Define your custom env class by subclassing gymnasium.Env:
+
+        class ParrotEnv(gym.Env):
+            """Environment in which the agent learns to repeat the seen observations.
+
+            Observations are float numbers indicating the to-be-repeated values,
+            e.g. -1.0, 5.1, or 3.2.
+            The action space is the same as the observation space.
+            Rewards are `r=-abs([observation] - [action])` for all steps.
+            """
+            def __init__(self, config=None):
+                # Since actions should repeat observations, their spaces must be the same.
+                self.observation_space = gym.spaces.Box(-1.0, 1.0, (1,), np.float32)
+                self.action_space = self.observation_space
+                self._cur_obs = None
+                self._episode_len = 0
+
+            def reset(self, *, seed=None, options=None):
+                """Resets the environment, starting a new episode."""
+                # Reset the episode len.
+                self._episode_len = 0
+                # Sample a random number from our observation space.
+                self._cur_obs = self.observation_space.sample()
+                # Return initial observation.
+                return self._cur_obs, {}
+
+            def step(self, action):
+                """Takes a single step in the episode given `action`."""
+                # Set `terminated` and `truncated` flags to True after 10 steps.
+                self._episode_len += 1
+                terminated = truncated = self._episode_len >= 10
+                # Compute the reward: `r = -abs([obs] - [action])`
+                reward = -sum(abs(self._cur_obs - action))
+                # Set a new observation (random sample).
+                self._cur_obs = self.observation_space.sample()
+                return self._cur_obs, reward, terminated, truncated, {}
+
+        # Point your config to your custom env class:
+        config = (
+            PPOConfig()
+            .environment(ParrotEnv)  # Pass `env_config={...}` here to further customize the env.
+        )
+
+        # Build a PPO algorithm and train it.
+        ppo_w_custom_env = config.build_algo()
+        ppo_w_custom_env.train()
+
+    .. testcode::
+        :hide:
+
+        # Clean up after the test run.
+        ppo_w_custom_env.stop()
+
+
+Customizing your models
+~~~~~~~~~~~~~~~~~~~~~~~
+
+In the preceding examples, RLlib provided a default neural network model for you because you didn't
+specify any model settings in your AlgorithmConfig. To reconfigure the type and size of RLlib's default
+models, for example the number of hidden layers and their activation functions, or to write your own
+custom models from scratch using PyTorch, see here for a detailed guide on how to do so.
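+
+As a minimal sketch of the first option, reconfiguring the default model, you can pass a
+:py:class:`~ray.rllib.core.rl_module.default_model_config.DefaultModelConfig` to your config's
+``rl_module()`` method. Treat the exact field names below, ``fcnet_hiddens`` and
+``fcnet_activation``, as assumptions to verify against your RLlib version:
+
+.. testcode::
+
+    from ray.rllib.algorithms.ppo import PPOConfig
+    from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig
+
+    config = (
+        PPOConfig()
+        .environment("Pendulum-v1")
+        # Swap the default fully connected network for a wider one with
+        # two hidden layers of 128 units each and ReLU activations.
+        .rl_module(
+            model_config=DefaultModelConfig(
+                fcnet_hiddens=[128, 128],
+                fcnet_activation="relu",
+            )
+        )
+    )
+
+    ppo_with_custom_model_config = config.build_algo()
+    ppo_with_custom_model_config.stop()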
+ + + + +Accessing Model State +~~~~~~~~~~~~~~~~~~~~~ + +Similar to accessing policy state, you may want to get a reference to the +underlying neural network model being trained. For example, you may want to +pre-train it separately, or otherwise update its weights outside of RLlib. +This can be done by accessing the ``model`` of the policy. + +Below you find three explicit examples showing how to access the model state of +an algorithm. + +.. dropdown:: **Example: Preprocessing observations for feeding into a model** + + + Then for the code: + + .. literalinclude:: doc_code/training.py + :language: python + :start-after: __preprocessing_observations_start__ + :end-before: __preprocessing_observations_end__ + +.. dropdown:: **Example: Querying a policy's action distribution** + + .. literalinclude:: doc_code/training.py + :language: python + :start-after: __query_action_dist_start__ + :end-before: __query_action_dist_end__ + +.. dropdown:: **Example: Getting Q values from a DQN model** + + .. literalinclude:: doc_code/training.py + :language: python + :start-after: __get_q_values_dqn_start__ + :end-before: __get_q_values_dqn_end__ + + This is especially useful when used with + `custom model classes `__. + + +.. Debugging RLlib Experiments + --------------------------- + Eager Mode + ~~~~~~~~~~ + Policies built with ``build_tf_policy`` (most of the reference algorithms are) + can be run in eager mode by setting the + ``"framework": "tf2"`` / ``"eager_tracing": true`` config options. + This will tell RLlib to execute the model forward pass, action distribution, + loss, and stats functions in eager mode. + Eager mode makes debugging much easier, since you can now use line-by-line + debugging with breakpoints or Python ``print()`` to inspect + intermediate tensor values. + However, eager can be slower than graph mode unless tracing is enabled. + Episode Traces + ~~~~~~~~~~~~~~ + You can use the `data output API `__ to save episode traces + for debugging. For example, the following command will run PPO while saving episode + traces to ``/tmp/debug``. + .. code-block:: bash + cd rllib/tuned_examples/ppo + python cartpole_ppo.py --output /tmp/debug + # episode traces will be saved in /tmp/debug, for example + output-2019-02-23_12-02-03_worker-2_0.json + output-2019-02-23_12-02-04_worker-1_0.json +Log Verbosity +~~~~~~~~~~~~~ +You can control the log level via the ``"log_level"`` flag. Valid values are "DEBUG", +"INFO", "WARN" (default), and "ERROR". This can be used to increase or decrease the +verbosity of internal logging. +For example: + .. code-block:: bash + cd rllib/tuned_examples/ppo + python atari_ppo.py --env ALE/Pong-v5 --log-level INFO + python atari_ppo.py --env ALE/Pong-v5 --log-level DEBUG +The default log level is ``WARN``. We strongly recommend using at least ``INFO`` +level logging for development. +Stack Traces +~~~~~~~~~~~~ +You can use the ``ray stack`` command to dump the stack traces of all the +Python workers on a single node. This can be useful for debugging unexpected +hangs or performance issues. +Next Steps +---------- +- To check how your application is doing, you can use the :ref:`Ray dashboard `. diff --git a/doc/source/rllib/index.rst b/doc/source/rllib/index.rst index 107ddbc02728..2bee34ef0931 100644 --- a/doc/source/rllib/index.rst +++ b/doc/source/rllib/index.rst @@ -102,7 +102,7 @@ Install RLlib and `PyTorch `__, as shown below: .. code-block:: bash - `pip install "gymnasium[atari,accept-rom-license,mujoco]"`. 
+ pip install "gymnasium[atari,accept-rom-license,mujoco]" This is all, you can now start coding against RLlib. Here is an example for running the :ref:`PPO Algorithm ` on the `Taxi domain `__. diff --git a/doc/source/rllib/rllib-training.rst b/doc/source/rllib/rllib-training.rst index c86ac015b137..22ad7993cebe 100644 --- a/doc/source/rllib/rllib-training.rst +++ b/doc/source/rllib/rllib-training.rst @@ -1,460 +1,4 @@ -.. include:: /_includes/rllib/we_are_hiring.rst +.. raw:: html -.. _rllib-getting-started: - -Getting Started -=============== - -.. include:: /_includes/rllib/new_api_stack.rst - -.. _rllib-in-60min: - -RLlib in 60 minutes -------------------- - -.. figure:: images/rllib-index-header.svg - -In this tutorial, you learn how to design, customize, and run an end-to-end RLlib learning experiment -from scratch. This includes picking and configuring an Algorithm, running a couple of training iterations, -saving the state of your Algorithm from time to time, running a separate evaluation loop, -and finally utilizing one of the checkpoints to deploy your trained model in an environment outside of RLlib -and compute actions through it. - -You also learn how to customize your RL environment and your neural network model. - -Installation -~~~~~~~~~~~~ - -First, install RLlib and `PyTorch `__, as shown below: - -.. code-block:: bash - - pip install "ray[rllib]" "gymnasium[atari,accept-rom-license,mujoco]" torch - - -.. _rllib-python-api: - -Python API -~~~~~~~~~~ - -RLlib's Python API provides all the flexibility required for applying the library to any -type of RL problem. - -You manage experiments in RLlib through an instance of the :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` -class. An :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` typically holds a neural -network for computing actions, called "policy", the :ref:`RL environment ` -you want to optimize against, a loss function, an optimizer, and some code describing the -algorithm's execution logic, like determining when to take which particular steps. - -In :ref:`multi-agent training `, -:py:class:`~ray.rllib.algorithms.algorithm.Algorithm` manages the querying and optimization of multiple policies at once. - -Through the algorithm's interface, you can train the policy, compute actions, or store your -algorithm's state through checkpointing. - - -Configure and build the algorithm -+++++++++++++++++++++++++++++++++ - -You first create an :py:class:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig` instance -and change some default settings through the config object's various methods. - -For example, we can set the RL environment we want to use by calling the config's -:py:meth:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig.environment` method. - -To scale our setup and define, how many EnvRunner actors you want to leverage, -you can call the :py:meth:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig.env_runners` method. - -Finally, you build the actual :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` instance -through calling the :py:meth:`~ray.rllib.algorithm.algorithm_config.AlgorithmConfig.build_algo` -method. - -.. testcode:: - - from ray.rllib.algorithms.ppo import PPOConfig - - # Configure the Algorithm (PPO). - config = ( - PPOConfig() - .environment("Pendulum-v1") - .env_runners(num_env_runners=3) - .training( - lr=0.0002, - train_batch_size_per_learner=2000, - num_epochs=10, - ) - ) - - # Build the Algorithm (PPO). - ppo = config.build_algo() - - -.. 
note:: - - See here to learn, which config methods you can use to configure your Algorithm, see here. - - -Run the algorithm -+++++++++++++++++ - -After you have built your :ref:`PPO ` from its configuration, you can ``train`` it for a number of -iterations through calling its :py:meth:`~ray.rllib.algorithms.algorithm.Algorithm.train` method. - -It returns a result dictionary that you can pretty-print for debugging purposes: - -.. testcode:: - - from pprint import pprint - - for _ in range(5): - pprint(ppo.train()) - - -Checkpoint the algorithm -++++++++++++++++++++++++ - -To save your Algorithm's current state, create a so-called ``checkpoint`` through -calling its `save_to_path` method. It returns the location of the saved checkpoint. - -Alternatively to not passing any arguments and letting the algorithm decide, where to save -the checkpoint, you can provide a checkpoint directory yourself: - -.. testcode:: - - checkpoint_path = ppo.save_to_path() - - # OR: - # ppo.save_to_path([a checkpoint location of your choice]) - - -Evaluate the algorithm -++++++++++++++++++++++ - - -Restore the model - - -Let's start with an example of the API's basic usage. - - # Train for 10 iterations. - for i in range(10): - result = ppo.train() - result.pop("config") - print(result) - - # Checkpoint every 5 iterations. - if i % 5 == 0: - checkpoint_dir = ppo.save_to_path() - print(f"Algorithm checkpoint saved in: {checkpoint_dir}") - - - -.. _rllib-with-ray-tune: - -RLlib with Ray Tune -~~~~~~~~~~~~~~~~~~~ - -All RLlib algorithms are compatible with the :ref:`Tune API `. -This enables them to be easily used in experiments with :ref:`Ray Tune `. -For example, the following code performs a simple hyper-parameter sweep of PPO. - - -.. literalinclude:: ./doc_code/getting_started.py - :dedent: 4 - :language: python - :start-after: rllib-tune-config-begin - :end-before: rllib-tune-config-end - -Tune will schedule the trials to run in parallel on your Ray cluster: - -:: - - == Status == - Using FIFO scheduling algorithm. - Resources requested: 4/4 CPUs, 0/0 GPUs - Result logdir: ~/ray_results/my_experiment - PENDING trials: - - PPO_CartPole-v1_2_lr=0.0001: PENDING - RUNNING trials: - - PPO_CartPole-v1_0_lr=0.01: RUNNING [pid=21940], 16 s, 4013 ts, 22 rew - - PPO_CartPole-v1_1_lr=0.001: RUNNING [pid=21942], 27 s, 8111 ts, 54.7 rew - -``Tuner.fit()`` returns an ``ResultGrid`` object that allows further analysis -of the training results and retrieving the checkpoint(s) of the trained agent. - -.. literalinclude:: ./doc_code/getting_started.py - :dedent: 0 - :language: python - :start-after: rllib-tuner-begin - :end-before: rllib-tuner-end - -.. note:: - - You can find your checkpoint's version by - looking into the ``rllib_checkpoint.json`` file inside your checkpoint directory. - -Loading and restoring a trained algorithm from a checkpoint is simple. -Let's assume you have a local checkpoint directory called ``checkpoint_path``. -To load newer RLlib checkpoints (version >= 2.1), use the following code: - - -.. code-block:: python - - from ray.rllib.algorithms.algorithm import Algorithm - - algo = Algorithm.from_checkpoint(checkpoint_path) - - -Customizing your RL environment -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -In the preceding examples, your :ref:`RL environment ` was a `Farama gymnasium `__ -pre-registered one, like ``CartPole-v1``. However, if you would like to run your experiments against a custom one, -see this tab below for a less-than-50-lines example. 
- -See here for an :ref:`in-depth guide on how to setup RL environments in RLlib ` and how to customize them. - -.. dropdown:: Quickstart: Custom RL environment - :animate: fade-in-slide-down - - .. testcode:: - - import gymnasium as gym - from ray.rllib.algorithms.ppo import PPOConfig - - # Define your custom env class by subclassing gymnasium.Env: - - class ParrotEnv(gym.Env): - """Environment in which the agent learns to repeat the seen observations. - - Observations are float numbers indicating the to-be-repeated values, - e.g. -1.0, 5.1, or 3.2. - The action space is the same as the observation space. - Rewards are `r=-abs([observation] - [action])`, for all steps. - """ - def __init__(self, config=None): - # Since actions should repeat observations, their spaces must be the same. - self.observation_space = gym.spaces.Box(-1.0, 1.0, (1,), np.float32) - self.action_space = self.observation_space - self._cur_obs = None - self._episode_len = 0 - - def reset(self, *, seed=None, options=None): - """Resets the environment, starting a new episode.""" - # Reset the episode len. - self._episode_len = 0 - # Sample a random number from our observation space. - self._cur_obs = self.observation_space.sample() - # Return initial observation. - return self._cur_obs, {} - - def step(self, action): - """Takes a single step in the episode given `action`.""" - # Set `terminated` and `truncated` flags to True after 10 steps. - self._episode_len += 1 - terminated = truncated = self._episode_len >= 10 - # Compute the reward: `r = -abs([obs] - [action])` - reward = -sum(abs(self._cur_obs - action)) - # Set a new observation (random sample). - self._cur_obs = self.observation_space.sample() - return self._cur_obs, reward, terminated, truncated, {} - - # Point your config to your custom env class: - config = ( - PPOConfig() - .environment(ParrotEnv) # add `env_config=[some Box space] to customize the env - ) - - # Build a PPO algorithm and train it. - ppo_w_custom_env = config.build_algo() - ppo_w_custom_env.train() - - .. testcode:: - :hide: - - # Test that our setup is working. - ppo_w_custom_env.stop() - - -Customizing your models -~~~~~~~~~~~~~~~~~~~~~~~ - -In the preceding examples, RLlib provided a default neural network model for you, because you didn't specify anything -in your AlgorithmConfig. If you would like to either reconfigure the type and size of RLlib's default models, for example define -the number of hidden layers and their activation functions, or even write your own custom models from scratch using PyTorch, see here -for a detailed guide on how to do so. - - -Deploying your models and computing actions -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The simplest way to programmatically compute actions from a trained :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` -is to get the :py:class:`~ray.rllib.core.rl_module.rl_module.RLModule` through :py:meth:`~ray.rllib.algorithms.algorithm.Algorithm.get_module`, -then call the module's :py:meth:`~ray.rllib.core.rl_module.rl_module.forward_inference` method. - -Here is an example of how to test a trained agent for one episode: - -.. 
testcode:: - - import gymnasium as gym - import numpy as np - import torch - from ray.rllib.core.rl_module import RLModule - - env = gym.make("CartPole-v1") - - # Get the RLModule from the up and running Algorithm instance: - rl_module = ppo.get_module() - - episode_return = 0 - terminated = truncated = False - - obs, info = env.reset() - - while not terminated and not truncated: - # Compute the next action from a batch (B=1) of observations. - obs_batch = torch.from_numpy(obs).unsqueeze(0) # add batch B=1 dimension - # Extract the logits from the output and dissolve batch again. - action_logits = rl_module.forward_inference({"obs": obs_batch})[ - "action_dist_inputs" - ][0] - # PPO's default RLModule produces action logits (from which - # you have to sample an action or use the max-likelihood one). - action = numpy.argmax(action_logits.numpy()) - # Send the action to the environment for the next step. - obs, reward, terminated, truncated, info = env.step(action) - episode_return += reward - - print(f"Reached episode return of {episode_return}.") - - -If you don't have your Algorithm instance up and running anymore and would like to create the trained RLModule -from a checkpoint, you can do the following instead. -Note that `best_checkpoint` is the highest performing Algorithm checkpoint you created -in the preceding experiment. To learn more about checkpoints and their structure, see this :ref:`checkpointing guide `. - -.. testcode:: - - from pathlib import Path - - # Create only the neural network (RLModule) from our checkpoint. - rl_module = RLModule.from_checkpoint( - Path(best_checkpoint.path) / "learner_group" / "learner" / "rl_module" - )["default_policy"] - - # Do the same computations with `rl_module` as in the preceding code snippet. - - -Accessing Policy State -~~~~~~~~~~~~~~~~~~~~~~ - -It is common to need to access a algorithm's internal state, for instance to set -or get model weights. - -In RLlib algorithm state is replicated across multiple *rollout workers* (Ray actors) -in the cluster. -However, you can easily get and update this state between calls to ``train()`` -via ``Algorithm.env_runner_group.foreach_worker()`` -or ``Algorithm.env_runner_group.foreach_worker_with_index()``. -These functions take a lambda function that is applied with the worker as an argument. -These functions return values for each worker as a list. - -You can also access just the "master" copy of the algorithm state through -``Algorithm.get_policy()`` or ``Algorithm.env_runner``, -but note that updates here may not be immediately reflected in -your rollout workers (if you have configured ``num_env_runners > 0``). -Here's a quick example of how to access state of a model: - -.. literalinclude:: ./doc_code/getting_started.py - :language: python - :start-after: rllib-get-state-begin - :end-before: rllib-get-state-end - -Accessing Model State -~~~~~~~~~~~~~~~~~~~~~ - -Similar to accessing policy state, you may want to get a reference to the -underlying neural network model being trained. For example, you may want to -pre-train it separately, or otherwise update its weights outside of RLlib. -This can be done by accessing the ``model`` of the policy. - -.. note:: - - To run these examples, you need to install a few extra dependencies, namely - `pip install "gym[atari]" "gym[accept-rom-license]" atari_py`. - -Below you find three explicit examples showing how to access the model state of -an algorithm. - -.. 
dropdown:: **Example: Preprocessing observations for feeding into a model** - - - Then for the code: - - .. literalinclude:: doc_code/training.py - :language: python - :start-after: __preprocessing_observations_start__ - :end-before: __preprocessing_observations_end__ - -.. dropdown:: **Example: Querying a policy's action distribution** - - .. literalinclude:: doc_code/training.py - :language: python - :start-after: __query_action_dist_start__ - :end-before: __query_action_dist_end__ - -.. dropdown:: **Example: Getting Q values from a DQN model** - - .. literalinclude:: doc_code/training.py - :language: python - :start-after: __get_q_values_dqn_start__ - :end-before: __get_q_values_dqn_end__ - - This is especially useful when used with - `custom model classes `__. - - -.. Debugging RLlib Experiments - --------------------------- - Eager Mode - ~~~~~~~~~~ - Policies built with ``build_tf_policy`` (most of the reference algorithms are) - can be run in eager mode by setting the - ``"framework": "tf2"`` / ``"eager_tracing": true`` config options. - This will tell RLlib to execute the model forward pass, action distribution, - loss, and stats functions in eager mode. - Eager mode makes debugging much easier, since you can now use line-by-line - debugging with breakpoints or Python ``print()`` to inspect - intermediate tensor values. - However, eager can be slower than graph mode unless tracing is enabled. - Episode Traces - ~~~~~~~~~~~~~~ - You can use the `data output API `__ to save episode traces - for debugging. For example, the following command will run PPO while saving episode - traces to ``/tmp/debug``. - .. code-block:: bash - cd rllib/tuned_examples/ppo - python cartpole_ppo.py --output /tmp/debug - # episode traces will be saved in /tmp/debug, for example - output-2019-02-23_12-02-03_worker-2_0.json - output-2019-02-23_12-02-04_worker-1_0.json -Log Verbosity -~~~~~~~~~~~~~ -You can control the log level via the ``"log_level"`` flag. Valid values are "DEBUG", -"INFO", "WARN" (default), and "ERROR". This can be used to increase or decrease the -verbosity of internal logging. -For example: - .. code-block:: bash - cd rllib/tuned_examples/ppo - python atari_ppo.py --env ALE/Pong-v5 --log-level INFO - python atari_ppo.py --env ALE/Pong-v5 --log-level DEBUG -The default log level is ``WARN``. We strongly recommend using at least ``INFO`` -level logging for development. -Stack Traces -~~~~~~~~~~~~ -You can use the ``ray stack`` command to dump the stack traces of all the -Python workers on a single node. This can be useful for debugging unexpected -hangs or performance issues. -Next Steps ----------- -- To check how your application is doing, you can use the :ref:`Ray dashboard `. + +

If you are not redirected automatically, follow this link.

From bcd594c44b4d85d1655b46bc16b2c5cb4a7de1ff Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sat, 18 Jan 2025 18:58:24 +0100 Subject: [PATCH 06/22] wip Signed-off-by: sven1977 --- doc/source/rllib/getting-started.rst | 121 +++++++++--------- .../rl_modules/classes/vpg_torch_rlm.py | 2 +- 2 files changed, 65 insertions(+), 58 deletions(-) diff --git a/doc/source/rllib/getting-started.rst b/doc/source/rllib/getting-started.rst index f280185cc4e6..d7f909f954e2 100644 --- a/doc/source/rllib/getting-started.rst +++ b/doc/source/rllib/getting-started.rst @@ -256,14 +256,22 @@ algorithms and their models: best_checkpoint = best_result.checkpoint -Deploy a trained model for production inference -+++++++++++++++++++++++++++++++++++++++++++++++ +Deploy a trained model for inference +++++++++++++++++++++++++++++++++++++ After training, you might want to deploy your models into a new environment, for example -to run inference in production. You can do so using the checkpoint directory created in the -preceding example. To read more about checkpoints, model deployments, and algorithm state restoration, +to run inference in production. For this purpose, you can use the checkpoint directory created +in the preceding example. To read more about checkpoints, model deployments, and restoring algorithm state, see this :ref:`page on checkpointing ` here. +Here is how you would create a new model instance from the checkpoint and run inference through +a single episode of your RL environment. Note in particular the use of the +:py:meth:`~ray.rllib.utils.checkpoints.Checkpointable.from_checkpoint` method to create +the model and the +:py:meth:`~ray.rllib.core.rl_module.rl_module.RLModule.forward_inference` +method to compute actions: + + .. testcode:: from pathlib import Path @@ -272,14 +280,13 @@ see this :ref:`page on checkpointing ` here. import torch from ray.rllib.core.rl_module import RLModule - # Create only the neural network (RLModule) from our checkpoint. + # Create only the neural network (RLModule) from our algorithm checkpoint. rl_module = RLModule.from_checkpoint( Path(best_checkpoint.path) / "learner_group" / "learner" / "rl_module" )["default_policy"] - # Create the RL environment to test against (same as was used for - # training earlier). - env = gym.make("Pendulum-v1") + # Create the RL environment to test against (same as was used for training earlier). + env = gym.make("Pendulum-v1", render_mode="human") episode_return = 0.0 done = False @@ -288,14 +295,14 @@ see this :ref:`page on checkpointing ` here. obs, info = env.reset() while not done: + # Render the env. + env.render() + # Compute the next action from a batch (B=1) of observations. obs_batch = torch.from_numpy(obs).unsqueeze(0) # add batch B=1 dimension - + model_outputs = rl_module.forward_inference({"obs": obs_batch}) # Extract the logits from the output and dissolve batch again. - action_logits = rl_module.forward_inference({"obs": obs_batch})[ - "action_dist_inputs" - ][0] - + action_logits = model_outputs["action_dist_inputs"][0] # PPO's default RLModule produces action logits (from which # you have to sample an action or use the max-likelihood one). action = numpy.argmax(action_logits.numpy()) @@ -310,8 +317,8 @@ see this :ref:`page on checkpointing ` here. 
print(f"Reached episode return of {episode_return}.") -If you still have an :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` instance up and running -in your script, you can also get the :py:class:`~ray.rllib.core.rl_module.rl_module.RLModule` through the +Alternatively, if you still have an :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` instance up and running +in your script, you can get the :py:class:`~ray.rllib.core.rl_module.rl_module.RLModule` through the :py:meth:`~ray.rllib.algorithms.algorithm.Algorithm.get_module` method: .. testcode:: @@ -327,7 +334,8 @@ a `Farama gymnasium `__ pre-registered one, like ``Pendulum-v1`` or ``CartPole-v1``. However, if you would like to run your experiments against a custom one, see this tab below for a less-than-50-lines example. -See here for an :ref:`in-depth guide on how to setup RL environments in RLlib ` and how to customize them. +See here for an :ref:`in-depth guide on how to setup RL environments in RLlib ` +and how to customize them. .. dropdown:: Quickstart: Custom RL environment :animate: fade-in-slide-down @@ -394,51 +402,50 @@ See here for an :ref:`in-depth guide on how to setup RL environments in RLlib