From bed55cc92869ce5f1831348b290cb565197c5274 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 3 Jan 2025 22:26:38 +0100 Subject: [PATCH 01/22] wip Signed-off-by: sven1977 --- doc/source/rllib/doc_code/getting_started.py | 37 -- doc/source/rllib/images/rllib-api.svg | 1 - doc/source/rllib/rllib-training.rst | 171 +++--- rllib/algorithms/algorithm.py | 517 ++++++++----------- 4 files changed, 284 insertions(+), 442 deletions(-) delete mode 100644 doc/source/rllib/images/rllib-api.svg diff --git a/doc/source/rllib/doc_code/getting_started.py b/doc/source/rllib/doc_code/getting_started.py index 951b3acee8da..549a506ed043 100644 --- a/doc/source/rllib/doc_code/getting_started.py +++ b/doc/source/rllib/doc_code/getting_started.py @@ -1,44 +1,11 @@ # flake8: noqa -# __rllib-first-config-begin__ -from pprint import pprint - -from ray.rllib.algorithms.ppo import PPOConfig - -config = ( - PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) - .environment("CartPole-v1") - .env_runners(num_env_runners=1) -) - -algo = config.build() - -for i in range(10): - result = algo.train() - result.pop("config") - pprint(result) - - if i % 5 == 0: - checkpoint_dir = algo.save_to_path() - print(f"Checkpoint saved in directory {checkpoint_dir}") -# __rllib-first-config-end__ - -algo.stop() - if False: # __rllib-tune-config-begin__ from ray import train, tune config = ( PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .environment("CartPole-v1") .training( lr=tune.grid_search([0.01, 0.001, 0.0001]), @@ -125,10 +92,6 @@ algo = ( PPOConfig() - .api_stack( - enable_rl_module_and_learner=True, - enable_env_runner_and_connector_v2=True, - ) .environment("CartPole-v1") .env_runners(num_env_runners=2) ).build() diff --git a/doc/source/rllib/images/rllib-api.svg b/doc/source/rllib/images/rllib-api.svg deleted file mode 100644 index 6eb03dac2e49..000000000000 --- a/doc/source/rllib/images/rllib-api.svg +++ /dev/null @@ -1 +0,0 @@ - diff --git a/doc/source/rllib/rllib-training.rst b/doc/source/rllib/rllib-training.rst index 4a94f17dfb62..fb8617179cc2 100644 --- a/doc/source/rllib/rllib-training.rst +++ b/doc/source/rllib/rllib-training.rst @@ -4,22 +4,24 @@ .. _rllib-getting-started: -Getting Started with RLlib -========================== +Getting Started +=============== All RLlib experiments are run using an ``Algorithm`` class which holds a policy for environment interaction. Through the algorithm's interface, you can train the policy, compute actions, or store your algorithm's state (checkpointing). In multi-agent training, the algorithm manages the querying and optimization of multiple policies at once. -.. image:: images/rllib-api.svg - In this guide, we will explain in detail RLlib's Python API for running learning experiments. -.. _rllib-training-api: +RLlib in 15 minutes +------------------- + -Using the Python API --------------------- +.. _rllib-python-api: + +Python API +~~~~~~~~~~ The Python API provides all the flexibility required for applying RLlib to any type of problem. @@ -31,12 +33,42 @@ the `env_runners` method. After we `build` the `PPO` Algorithm from its configuration, we can `train` it for a number of iterations (here `10`) and `save` the resulting policy periodically (here every `5` iterations). -.. literalinclude:: ./doc_code/getting_started.py - :language: python - :start-after: rllib-first-config-begin - :end-before: rllib-first-config-end + +.. 
testcode:: + + from ray.rllib.algorithms.ppo import PPOConfig + + # Configure the Algorithm (PPO). + config = ( + PPOConfig() + .environment("CartPole-v1") + .env_runners(num_env_runners=1) + ) + # Build the Algorithm (PPO). + ppo = config.build() + + # Train for 10 iterations. + for i in range(10): + result = ppo.train() + result.pop("config") + print(result) + + # Checkpoint every 5 iterations. + if i % 5 == 0: + checkpoint_dir = ppo.save_to_path() + print(f"Algorithm checkpoint saved in: {checkpoint_dir}") + +.. testcode:: + :hide: + + algo.stop() +.. _rllib-with-ray-tune: + +RLlib with Ray Tune +~~~~~~~~~~~~~~~~~~~ + All RLlib algorithms are compatible with the :ref:`Tune API `. This enables them to be easily used in experiments with :ref:`Ray Tune `. For example, the following code performs a simple hyper-parameter sweep of PPO. @@ -84,21 +116,31 @@ To load newer RLlib checkpoints (version >= 1.0), use the following code: .. code-block:: python from ray.rllib.algorithms.algorithm import Algorithm + algo = Algorithm.from_checkpoint(checkpoint_path) -For older RLlib checkpoint versions (version < 1.0), you can -restore an algorithm through: +Customizing your RL environment +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. code-block:: python +In the preceding examples, your RL environment was always "CartPole-v1", however, you would probably like to +run your actual experiments against a different environment or even write your own custom one. + +See here ...blabla + +Customizing your models +~~~~~~~~~~~~~~~~~~~~~~~ + +In the preceding examples, RLlib provided a default neural network model for you, because you didn't specify anything +in your AlgorithmConfig. If you would like to either reconfigure the type and size of RLlib's default models, for example define +the number of hidden layers and their activation functions, or even write your own custom models from scratch using PyTorch, see here +for a detailed guide on how to do so. - from ray.rllib.algorithms.ppo import PPO - algo = PPO(config=config, env=env_class) - algo.restore(checkpoint_path) + +Deploying your models and computing actions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Computing Actions -~~~~~~~~~~~~~~~~~ The simplest way to programmatically compute actions from a trained agent is to use ``Algorithm.compute_single_action()``. @@ -114,6 +156,7 @@ Here is a simple example of testing a trained agent for one episode: For more advanced usage on computing actions and other functionality, you can consult the :ref:`RLlib Algorithm API documentation `. + Accessing Policy State ~~~~~~~~~~~~~~~~~~~~~~ @@ -183,97 +226,47 @@ an algorithm. `custom model classes `__. -.. _rllib-scaling-guide: - -RLlib Scaling Guide -------------------- - -Here are some rules of thumb for scaling training with RLlib. - -1. If the environment is slow and cannot be replicated (e.g., since it requires interaction with physical systems), then you should use a sample-efficient off-policy algorithm such as :ref:`DQN ` or :ref:`SAC `. These algorithms default to ``num_env_runners: 0`` for single-process operation. Make sure to set ``num_gpus: 1`` if you want to use a GPU. Consider also batch RL training with the `offline data `__ API. - -2. If the environment is fast and the model is small (most models for RL are), use time-efficient algorithms such as :ref:`PPO `, or :ref:`IMPALA `. -These can be scaled by increasing ``num_env_runners`` to add rollout workers. It may also make sense to enable `vectorization `__ for -inference. 
Make sure to set ``num_gpus: 1`` if you want to use a GPU. If the learner becomes a bottleneck, you can use multiple GPUs for learning by setting -``num_gpus > 1``. - -3. If the model is compute intensive (e.g., a large deep residual network) and inference is the bottleneck, consider allocating GPUs to workers by setting ``num_gpus_per_env_runner: 1``. If you only have a single GPU, consider ``num_env_runners: 0`` to use the learner GPU for inference. For efficient use of GPU time, use a small number of GPU workers and a large number of `envs per worker `__. - -4. Finally, if both model and environment are compute intensive, then enable `remote worker envs `__ with `async batching `__ by setting ``remote_worker_envs: True`` and optionally ``remote_env_batch_wait_ms``. This batches inference on GPUs in the rollout workers while letting envs run asynchronously in separate actors, similar to the `SEED `__ architecture. The number of workers and number of envs per worker should be tuned to maximize GPU utilization. - -In case you are using lots of workers (``num_env_runners >> 10``) and you observe worker failures for whatever reasons, which normally interrupt your RLlib training runs, consider using -the config settings ``ignore_env_runner_failures=True``, ``restart_failed_env_runners=True``, or ``restart_failed_sub_environments=True``: - -``restart_failed_env_runners``: When set to True (default), your Algorithm will attempt to restart any failed EnvRunner and replace it with a newly created one. This way, your number of workers will never decrease, even if some of them fail from time to time. -``ignore_env_runner_failures``: When set to True, your Algorithm will not crash due to an EnvRunner error, but continue for as long as there is at least one functional worker remaining. This setting is ignored when ``restart_failed_env_runners=True``. -``restart_failed_sub_environments``: When set to True and there is a failure in one of the vectorized sub-environments in one of your EnvRunners, RLlib tries to recreate only the failed sub-environment and re-integrate the newly created one into your vectorized env stack on that EnvRunner. - -Note that only one of ``ignore_env_runner_failures`` or ``restart_failed_env_runners`` should be set to True (they are mutually exclusive settings). However, -you can combine each of these with the ``restart_failed_sub_environments=True`` setting. -Using these options will make your training runs much more stable and more robust against occasional OOM or other similar "once in a while" errors on the EnvRunners -themselves or inside your custom environments. - - -Debugging RLlib Experiments ---------------------------- - -Eager Mode -~~~~~~~~~~ - -Policies built with ``build_tf_policy`` (most of the reference algorithms are) -can be run in eager mode by setting the -``"framework": "tf2"`` / ``"eager_tracing": true`` config options. -This will tell RLlib to execute the model forward pass, action distribution, -loss, and stats functions in eager mode. - -Eager mode makes debugging much easier, since you can now use line-by-line -debugging with breakpoints or Python ``print()`` to inspect -intermediate tensor values. -However, eager can be slower than graph mode unless tracing is enabled. - - -Episode Traces -~~~~~~~~~~~~~~ - -You can use the `data output API `__ to save episode traces -for debugging. For example, the following command will run PPO while saving episode -traces to ``/tmp/debug``. - -.. code-block:: bash - +.. 
Debugging RLlib Experiments + --------------------------- + Eager Mode + ~~~~~~~~~~ + Policies built with ``build_tf_policy`` (most of the reference algorithms are) + can be run in eager mode by setting the + ``"framework": "tf2"`` / ``"eager_tracing": true`` config options. + This will tell RLlib to execute the model forward pass, action distribution, + loss, and stats functions in eager mode. + Eager mode makes debugging much easier, since you can now use line-by-line + debugging with breakpoints or Python ``print()`` to inspect + intermediate tensor values. + However, eager can be slower than graph mode unless tracing is enabled. + Episode Traces + ~~~~~~~~~~~~~~ + You can use the `data output API `__ to save episode traces + for debugging. For example, the following command will run PPO while saving episode + traces to ``/tmp/debug``. + .. code-block:: bash cd rllib/tuned_examples/ppo python cartpole_ppo.py --output /tmp/debug - # episode traces will be saved in /tmp/debug, for example output-2019-02-23_12-02-03_worker-2_0.json output-2019-02-23_12-02-04_worker-1_0.json - Log Verbosity ~~~~~~~~~~~~~ - You can control the log level via the ``"log_level"`` flag. Valid values are "DEBUG", "INFO", "WARN" (default), and "ERROR". This can be used to increase or decrease the verbosity of internal logging. For example: - -.. code-block:: bash - + .. code-block:: bash cd rllib/tuned_examples/ppo - python atari_ppo.py --env ALE/Pong-v5 --log-level INFO python atari_ppo.py --env ALE/Pong-v5 --log-level DEBUG - The default log level is ``WARN``. We strongly recommend using at least ``INFO`` level logging for development. - Stack Traces ~~~~~~~~~~~~ - You can use the ``ray stack`` command to dump the stack traces of all the Python workers on a single node. This can be useful for debugging unexpected hangs or performance issues. - Next Steps ---------- - - To check how your application is doing, you can use the :ref:`Ray dashboard `. diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index 09f2e638d18f..fd52894dc432 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -2087,321 +2087,6 @@ def set_weights(self, weights: Dict[PolicyID, dict]): ) self.env_runner_group.local_env_runner.set_weights(weights) - @OldAPIStack - def compute_single_action( - self, - observation: Optional[TensorStructType] = None, - state: Optional[List[TensorStructType]] = None, - *, - prev_action: Optional[TensorStructType] = None, - prev_reward: Optional[float] = None, - info: Optional[EnvInfoDict] = None, - input_dict: Optional[SampleBatch] = None, - policy_id: PolicyID = DEFAULT_POLICY_ID, - full_fetch: bool = False, - explore: Optional[bool] = None, - timestep: Optional[int] = None, - episode=None, - unsquash_action: Optional[bool] = None, - clip_action: Optional[bool] = None, - # Kwargs placeholder for future compatibility. - **kwargs, - ) -> Union[ - TensorStructType, - Tuple[TensorStructType, List[TensorType], Dict[str, TensorType]], - ]: - """Computes an action for the specified policy on the local worker. - - Note that you can also access the policy object through - self.get_policy(policy_id) and call compute_single_action() on it - directly. - - Args: - observation: Single (unbatched) observation from the - environment. - state: List of all RNN hidden (single, unbatched) state tensors. - prev_action: Single (unbatched) previous action value. - prev_reward: Single (unbatched) previous reward value. - info: Env info dict, if any. 
- input_dict: An optional SampleBatch that holds all the values - for: obs, state, prev_action, and prev_reward, plus maybe - custom defined views of the current env trajectory. Note - that only one of `obs` or `input_dict` must be non-None. - policy_id: Policy to query (only applies to multi-agent). - Default: "default_policy". - full_fetch: Whether to return extra action fetch results. - This is always set to True if `state` is specified. - explore: Whether to apply exploration to the action. - Default: None -> use self.config.explore. - timestep: The current (sampling) time step. - episode: This provides access to all of the internal episodes' - state, which may be useful for model-based or multi-agent - algorithms. - unsquash_action: Should actions be unsquashed according to the - env's/Policy's action space? If None, use the value of - self.config.normalize_actions. - clip_action: Should actions be clipped according to the - env's/Policy's action space? If None, use the value of - self.config.clip_actions. - - Keyword Args: - kwargs: forward compatibility placeholder - - Returns: - The computed action if full_fetch=False, or a tuple of a) the - full output of policy.compute_actions() if full_fetch=True - or we have an RNN-based Policy. - - Raises: - KeyError: If the `policy_id` cannot be found in this Algorithm's local - worker. - """ - # `unsquash_action` is None: Use value of config['normalize_actions']. - if unsquash_action is None: - unsquash_action = self.config.normalize_actions - # `clip_action` is None: Use value of config['clip_actions']. - elif clip_action is None: - clip_action = self.config.clip_actions - - # User provided an input-dict: Assert that `obs`, `prev_a|r`, `state` - # are all None. - err_msg = ( - "Provide either `input_dict` OR [`observation`, ...] as " - "args to `Algorithm.compute_single_action()`!" - ) - if input_dict is not None: - assert ( - observation is None - and prev_action is None - and prev_reward is None - and state is None - ), err_msg - observation = input_dict[Columns.OBS] - else: - assert observation is not None, err_msg - - # Get the policy to compute the action for (in the multi-agent case, - # Algorithm may hold >1 policies). - policy = self.get_policy(policy_id) - if policy is None: - raise KeyError( - f"PolicyID '{policy_id}' not found in PolicyMap of the " - f"Algorithm's local worker!" - ) - # Just preprocess observations, similar to how it used to be done before. - pp = policy.agent_connectors[ObsPreprocessorConnector] - - # convert the observation to array if possible - if not isinstance(observation, (np.ndarray, dict, tuple)): - try: - observation = np.asarray(observation) - except Exception: - raise ValueError( - f"Observation type {type(observation)} cannot be converted to " - f"np.ndarray." - ) - if pp: - assert len(pp) == 1, "Only one preprocessor should be in the pipeline" - pp = pp[0] - - if not pp.is_identity(): - # Note(Kourosh): This call will leave the policy's connector - # in eval mode. would that be a problem? - pp.in_eval() - if observation is not None: - _input_dict = {Columns.OBS: observation} - elif input_dict is not None: - _input_dict = {Columns.OBS: input_dict[Columns.OBS]} - else: - raise ValueError( - "Either observation or input_dict must be provided." - ) - - # TODO (Kourosh): Create a new util method for algorithm that - # computes actions based on raw inputs from env and can keep track - # of its own internal state. 
- acd = AgentConnectorDataType("0", "0", _input_dict) - # make sure the state is reset since we are only applying the - # preprocessor - pp.reset(env_id="0") - ac_o = pp([acd])[0] - observation = ac_o.data[Columns.OBS] - - # Input-dict. - if input_dict is not None: - input_dict[Columns.OBS] = observation - action, state, extra = policy.compute_single_action( - input_dict=input_dict, - explore=explore, - timestep=timestep, - episode=episode, - ) - # Individual args. - else: - action, state, extra = policy.compute_single_action( - obs=observation, - state=state, - prev_action=prev_action, - prev_reward=prev_reward, - info=info, - explore=explore, - timestep=timestep, - episode=episode, - ) - - # If we work in normalized action space (normalize_actions=True), - # we re-translate here into the env's action space. - if unsquash_action: - action = space_utils.unsquash_action(action, policy.action_space_struct) - # Clip, according to env's action space. - elif clip_action: - action = space_utils.clip_action(action, policy.action_space_struct) - - # Return 3-Tuple: Action, states, and extra-action fetches. - if state or full_fetch: - return action, state, extra - # Ensure backward compatibility. - else: - return action - - @OldAPIStack - def compute_actions( - self, - observations: TensorStructType, - state: Optional[List[TensorStructType]] = None, - *, - prev_action: Optional[TensorStructType] = None, - prev_reward: Optional[TensorStructType] = None, - info: Optional[EnvInfoDict] = None, - policy_id: PolicyID = DEFAULT_POLICY_ID, - full_fetch: bool = False, - explore: Optional[bool] = None, - timestep: Optional[int] = None, - episodes=None, - unsquash_actions: Optional[bool] = None, - clip_actions: Optional[bool] = None, - **kwargs, - ): - """Computes an action for the specified policy on the local Worker. - - Note that you can also access the policy object through - self.get_policy(policy_id) and call compute_actions() on it directly. - - Args: - observation: Observation from the environment. - state: RNN hidden state, if any. If state is not None, - then all of compute_single_action(...) is returned - (computed action, rnn state(s), logits dictionary). - Otherwise compute_single_action(...)[0] is returned - (computed action). - prev_action: Previous action value, if any. - prev_reward: Previous reward, if any. - info: Env info dict, if any. - policy_id: Policy to query (only applies to multi-agent). - full_fetch: Whether to return extra action fetch results. - This is always set to True if RNN state is specified. - explore: Whether to pick an exploitation or exploration - action (default: None -> use self.config.explore). - timestep: The current (sampling) time step. - episodes: This provides access to all of the internal episodes' - state, which may be useful for model-based or multi-agent - algorithms. - unsquash_actions: Should actions be unsquashed according - to the env's/Policy's action space? If None, use - self.config.normalize_actions. - clip_actions: Should actions be clipped according to the - env's/Policy's action space? If None, use - self.config.clip_actions. - - Keyword Args: - kwargs: forward compatibility placeholder - - Returns: - The computed action if full_fetch=False, or a tuple consisting of - the full output of policy.compute_actions_from_input_dict() if - full_fetch=True or we have an RNN-based Policy. - """ - # `unsquash_actions` is None: Use value of config['normalize_actions']. 
- if unsquash_actions is None: - unsquash_actions = self.config.normalize_actions - # `clip_actions` is None: Use value of config['clip_actions']. - elif clip_actions is None: - clip_actions = self.config.clip_actions - - # Preprocess obs and states. - state_defined = state is not None - policy = self.get_policy(policy_id) - filtered_obs, filtered_state = [], [] - for agent_id, ob in observations.items(): - worker = self.env_runner_group.local_env_runner - if worker.preprocessors.get(policy_id) is not None: - preprocessed = worker.preprocessors[policy_id].transform(ob) - else: - preprocessed = ob - filtered = worker.filters[policy_id](preprocessed, update=False) - filtered_obs.append(filtered) - if state is None: - continue - elif agent_id in state: - filtered_state.append(state[agent_id]) - else: - filtered_state.append(policy.get_initial_state()) - - # Batch obs and states - obs_batch = np.stack(filtered_obs) - if state is None: - state = [] - else: - state = list(zip(*filtered_state)) - state = [np.stack(s) for s in state] - - input_dict = {Columns.OBS: obs_batch} - - # prev_action and prev_reward can be None, np.ndarray, or tensor-like structure. - # Explicitly check for None here to avoid the error message "The truth value of - # an array with more than one element is ambiguous.", when np arrays are passed - # as arguments. - if prev_action is not None: - input_dict[SampleBatch.PREV_ACTIONS] = prev_action - if prev_reward is not None: - input_dict[SampleBatch.PREV_REWARDS] = prev_reward - if info: - input_dict[Columns.INFOS] = info - for i, s in enumerate(state): - input_dict[f"state_in_{i}"] = s - - # Batch compute actions - actions, states, infos = policy.compute_actions_from_input_dict( - input_dict=input_dict, - explore=explore, - timestep=timestep, - episodes=episodes, - ) - - # Unbatch actions for the environment into a multi-agent dict. - single_actions = space_utils.unbatch(actions) - actions = {} - for key, a in zip(observations, single_actions): - # If we work in normalized action space (normalize_actions=True), - # we re-translate here into the env's action space. - if unsquash_actions: - a = space_utils.unsquash_action(a, policy.action_space_struct) - # Clip, according to env's action space. - elif clip_actions: - a = space_utils.clip_action(a, policy.action_space_struct) - actions[key] = a - - # Unbatch states into a multi-agent dict. - unbatched_states = {} - for idx, agent_id in enumerate(observations): - unbatched_states[agent_id] = [s[idx] for s in states] - - # Return only actions or full tuple - if state_defined or full_fetch: - return actions, unbatched_states, infos - else: - return actions - @OldAPIStack def add_policy( self, @@ -4143,6 +3828,208 @@ def _compile_iteration_results_old_api_stack( return results + @OldAPIStack + @Deprecated( + help="`Algorithm.compute_single_action` should no longer be used. 
Get the " + "RLModule instance through `Algorithm.get_module([module ID])`, then compute " + "actions through `RLModule.forward_inference({'obs': [obs batch]})`.", + error=False, + ) + def compute_single_action( + self, + observation: Optional[TensorStructType] = None, + state: Optional[List[TensorStructType]] = None, + *, + prev_action: Optional[TensorStructType] = None, + prev_reward: Optional[float] = None, + info: Optional[EnvInfoDict] = None, + input_dict: Optional[SampleBatch] = None, + policy_id: PolicyID = DEFAULT_POLICY_ID, + full_fetch: bool = False, + explore: Optional[bool] = None, + timestep: Optional[int] = None, + episode=None, + unsquash_action: Optional[bool] = None, + clip_action: Optional[bool] = None, + ) -> Union[ + TensorStructType, + Tuple[TensorStructType, List[TensorType], Dict[str, TensorType]], + ]: + if unsquash_action is None: + unsquash_action = self.config.normalize_actions + elif clip_action is None: + clip_action = self.config.clip_actions + + err_msg = ( + "Provide either `input_dict` OR [`observation`, ...] as " + "args to `Algorithm.compute_single_action()`!" + ) + if input_dict is not None: + assert ( + observation is None + and prev_action is None + and prev_reward is None + and state is None + ), err_msg + observation = input_dict[Columns.OBS] + else: + assert observation is not None, err_msg + + policy = self.get_policy(policy_id) + if policy is None: + raise KeyError( + f"PolicyID '{policy_id}' not found in PolicyMap of the " + f"Algorithm's local worker!" + ) + pp = policy.agent_connectors[ObsPreprocessorConnector] + + if not isinstance(observation, (np.ndarray, dict, tuple)): + try: + observation = np.asarray(observation) + except Exception: + raise ValueError( + f"Observation type {type(observation)} cannot be converted to " + f"np.ndarray." + ) + if pp: + assert len(pp) == 1, "Only one preprocessor should be in the pipeline" + pp = pp[0] + + if not pp.is_identity(): + pp.in_eval() + if observation is not None: + _input_dict = {Columns.OBS: observation} + elif input_dict is not None: + _input_dict = {Columns.OBS: input_dict[Columns.OBS]} + else: + raise ValueError( + "Either observation or input_dict must be provided." + ) + + acd = AgentConnectorDataType("0", "0", _input_dict) + pp.reset(env_id="0") + ac_o = pp([acd])[0] + observation = ac_o.data[Columns.OBS] + + if input_dict is not None: + input_dict[Columns.OBS] = observation + action, state, extra = policy.compute_single_action( + input_dict=input_dict, + explore=explore, + timestep=timestep, + episode=episode, + ) + else: + action, state, extra = policy.compute_single_action( + obs=observation, + state=state, + prev_action=prev_action, + prev_reward=prev_reward, + info=info, + explore=explore, + timestep=timestep, + episode=episode, + ) + + if unsquash_action: + action = space_utils.unsquash_action(action, policy.action_space_struct) + elif clip_action: + action = space_utils.clip_action(action, policy.action_space_struct) + + if state or full_fetch: + return action, state, extra + else: + return action + + @OldAPIStack + @Deprecated( + help="`Algorithm.compute_actions` should no longer be used. 
Get the RLModule " + "instance through `Algorithm.get_module([module ID])`, then compute actions " + "through `RLModule.forward_inference({'obs': [obs batch]})`.", + error=False, + ) + def compute_actions( + self, + observations: TensorStructType, + state: Optional[List[TensorStructType]] = None, + *, + prev_action: Optional[TensorStructType] = None, + prev_reward: Optional[TensorStructType] = None, + info: Optional[EnvInfoDict] = None, + policy_id: PolicyID = DEFAULT_POLICY_ID, + full_fetch: bool = False, + explore: Optional[bool] = None, + timestep: Optional[int] = None, + episodes=None, + unsquash_actions: Optional[bool] = None, + clip_actions: Optional[bool] = None, + ): + if unsquash_actions is None: + unsquash_actions = self.config.normalize_actions + elif clip_actions is None: + clip_actions = self.config.clip_actions + + state_defined = state is not None + policy = self.get_policy(policy_id) + filtered_obs, filtered_state = [], [] + for agent_id, ob in observations.items(): + worker = self.env_runner_group.local_env_runner + if worker.preprocessors.get(policy_id) is not None: + preprocessed = worker.preprocessors[policy_id].transform(ob) + else: + preprocessed = ob + filtered = worker.filters[policy_id](preprocessed, update=False) + filtered_obs.append(filtered) + if state is None: + continue + elif agent_id in state: + filtered_state.append(state[agent_id]) + else: + filtered_state.append(policy.get_initial_state()) + + obs_batch = np.stack(filtered_obs) + if state is None: + state = [] + else: + state = list(zip(*filtered_state)) + state = [np.stack(s) for s in state] + + input_dict = {Columns.OBS: obs_batch} + + if prev_action is not None: + input_dict[SampleBatch.PREV_ACTIONS] = prev_action + if prev_reward is not None: + input_dict[SampleBatch.PREV_REWARDS] = prev_reward + if info: + input_dict[Columns.INFOS] = info + for i, s in enumerate(state): + input_dict[f"state_in_{i}"] = s + + actions, states, infos = policy.compute_actions_from_input_dict( + input_dict=input_dict, + explore=explore, + timestep=timestep, + episodes=episodes, + ) + + single_actions = space_utils.unbatch(actions) + actions = {} + for key, a in zip(observations, single_actions): + if unsquash_actions: + a = space_utils.unsquash_action(a, policy.action_space_struct) + elif clip_actions: + a = space_utils.clip_action(a, policy.action_space_struct) + actions[key] = a + + unbatched_states = {} + for idx, agent_id in enumerate(observations): + unbatched_states[agent_id] = [s[idx] for s in states] + + if state_defined or full_fetch: + return actions, unbatched_states, infos + else: + return actions + @Deprecated( new="Algorithm.env_runner_group", error=False, From e33624ce14a5eff616866067e7d94e3513b0d2a0 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sun, 5 Jan 2025 16:33:51 +0100 Subject: [PATCH 02/22] Merge branch 'master' of https://github.com/ray-project/ray into docs_redo_getting_started Signed-off-by: sven1977 # Conflicts: # doc/source/rllib/rllib-training.rst --- doc/source/rllib/doc_code/getting_started.py | 33 ---- doc/source/rllib/rllib-training.rst | 154 +++++++++++++++--- rllib/algorithms/algorithm.py | 120 +++++++------- rllib/examples/_docs/rllib_on_rllib_readme.py | 6 +- 4 files changed, 191 insertions(+), 122 deletions(-) diff --git a/doc/source/rllib/doc_code/getting_started.py b/doc/source/rllib/doc_code/getting_started.py index 549a506ed043..95236f14aa41 100644 --- a/doc/source/rllib/doc_code/getting_started.py +++ b/doc/source/rllib/doc_code/getting_started.py @@ -49,39 +49,6 @@ # 
__rllib-tuner-end__ -# __rllib-compute-action-begin__ -import pathlib -import gymnasium as gym -import numpy as np -import torch -from ray.rllib.core.rl_module import RLModule - -env = gym.make("CartPole-v1") - -# Create only the neural network (RLModule) from our checkpoint. -rl_module = RLModule.from_checkpoint( - pathlib.Path(best_checkpoint.path) / "learner_group" / "learner" / "rl_module" -)["default_policy"] - -episode_return = 0 -terminated = truncated = False - -obs, info = env.reset() - -while not terminated and not truncated: - # Compute the next action from a batch (B=1) of observations. - torch_obs_batch = torch.from_numpy(np.array([obs])) - action_logits = rl_module.forward_inference({"obs": torch_obs_batch})[ - "action_dist_inputs" - ] - # The default RLModule used here produces action logits (from which - # we'll have to sample an action or use the max-likelihood one). - action = torch.argmax(action_logits[0]).numpy() - obs, reward, terminated, truncated, info = env.step(action) - episode_return += reward - -print(f"Reached episode return of {episode_return}.") -# __rllib-compute-action-end__ del rl_module diff --git a/doc/source/rllib/rllib-training.rst b/doc/source/rllib/rllib-training.rst index fb8617179cc2..6ddf077307c2 100644 --- a/doc/source/rllib/rllib-training.rst +++ b/doc/source/rllib/rllib-training.rst @@ -7,22 +7,30 @@ Getting Started =============== -All RLlib experiments are run using an ``Algorithm`` class which holds a policy for environment interaction. -Through the algorithm's interface, you can train the policy, compute actions, or store your algorithm's state (checkpointing). -In multi-agent training, the algorithm manages the querying and optimization of multiple policies at once. +In this tutorial, you learn how to design, customize, and run an RLlib learning experiment from scratch. -In this guide, we will explain in detail RLlib's Python API for running learning experiments. +.. _rllib-in-15min: RLlib in 15 minutes ------------------- - .. _rllib-python-api: Python API ~~~~~~~~~~ +You manage experiments in RLlib through an instance of the :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` class. An +Algorithm typically holds a neural network for computing actions, called "policy", the :ref:`RL environment ` +you want to optimize against, a loss function, an optimizer, and some code describing the algorithm's execution logic, like determining when to +take which particular steps. + +Through the algorithm's interface, you can train the policy, compute actions, or store your algorithm's state (checkpointing). +In multi-agent training, the algorithm manages the querying and optimization of multiple policies at once. + + + + The Python API provides all the flexibility required for applying RLlib to any type of problem. Let's start with an example of the API's basic usage. @@ -58,10 +66,6 @@ iterations (here `10`) and `save` the resulting policy periodically (here every checkpoint_dir = ppo.save_to_path() print(f"Algorithm checkpoint saved in: {checkpoint_dir}") -.. testcode:: - :hide: - - algo.stop() .. _rllib-with-ray-tune: @@ -123,10 +127,72 @@ To load newer RLlib checkpoints (version >= 1.0), use the following code: Customizing your RL environment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -In the preceding examples, your RL environment was always "CartPole-v1", however, you would probably like to -run your actual experiments against a different environment or even write your own custom one. 
- -See here ...blabla +In the preceding examples, your :ref:`RL environment ` was a `Farama gymnasium `__ +pre-registered one, like ``CartPole-v1``. However, if you would like to run your experiments against a different environment or even write a custom one, +see this tab below for a less-than-50-lines example of a custom ``gym.Env`` class. + +See here for an :ref:`in-depth guide on how to setup RL environments in RLlib ` and how to customize them. + +.. dropdown:: Quickstart: Custom RL environment + :animate: fade-in-slide-down + + .. testcode:: + + import gymnasium as gym + from ray.rllib.algorithms.ppo import PPOConfig + + # 1) Define your custom env class: + + class ParrotEnv(gym.Env): + """Environment in which the agent learns to repeat the seen observations. + + Observations are float numbers indicating the to-be-repeated values, + e.g. -1.0, 5.1, or 3.2. + The action space is the same as the observation space. + Rewards are `r=-abs([observation] - [action])`, for all steps. + """ + def __init__(self, config=None): + # Since actions should repeat observations, their spaces must be the same. + self.observation_space = gym.spaces.Box(-1.0, 1.0, (1,), np.float32) + self.action_space = self.observation_space + self._cur_obs = None + self._episode_len = 0 + + def reset(self, *, seed=None, options=None): + """Resets the environment, starting a new episode.""" + # Reset the episode len. + self._episode_len = 0 + # Sample a random number from our observation space. + self._cur_obs = self.observation_space.sample() + # Return initial observation. + return self._cur_obs, {} + + def step(self, action): + """Takes a single step in the episode given `action`.""" + # Set `terminated` and `truncated` flags to True after 10 steps. + self._episode_len += 1 + terminated = truncated = self._episode_len >= 10 + # Compute the reward: `r = -abs([obs] - [action])` + reward = -sum(abs(self._cur_obs - action)) + # Set a new observation (random sample). + self._cur_obs = self.observation_space.sample() + return self._cur_obs, reward, terminated, truncated, {} + + # 2) Configure it through RLlib's algorithm configs: + config = ( + PPOConfig() + .environment(ParrotEnv) # add `env_config=[some Box space] to customize the env + ) + + # 3) Build the PPO and train + ppo_w_custom_env = config.build() + + .. testcode:: + :hide: + + # Test that our setup is working. + ppo_w_custom_env.train() + ppo_w_custom_env.stop() Customizing your models ~~~~~~~~~~~~~~~~~~~~~~~ @@ -140,21 +206,61 @@ for a detailed guide on how to do so. Deploying your models and computing actions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The simplest way to programmatically compute actions from a trained :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` +is to get the :py:class:`~ray.rllib.core.rl_module.rl_module.RLModule` through :py:meth:`~ray.rllib.algorithms.algorithm.Algorithm.get_module`, +then call the module's :py:meth:`~ray.rllib.core.rl_module.rl_module.forward_inference` method. + +Here is an example of how to test a trained agent for one episode: +.. testcode:: -The simplest way to programmatically compute actions from a trained agent is to -use ``Algorithm.compute_single_action()``. -This method preprocesses and filters the observation before passing it to the agent -policy. -Here is a simple example of testing a trained agent for one episode: + import gymnasium as gym + import numpy as np + import torch + from ray.rllib.core.rl_module import RLModule -.. 
literalinclude:: ./doc_code/getting_started.py - :language: python - :start-after: rllib-compute-action-begin - :end-before: rllib-compute-action-end + env = gym.make("CartPole-v1") + + # Get the RLModule from the up and running Algorithm instance: + rl_module = ppo.get_module() + + episode_return = 0 + terminated = truncated = False + + obs, info = env.reset() + + while not terminated and not truncated: + # Compute the next action from a batch (B=1) of observations. + obs_batch = torch.from_numpy(obs).unsqueeze(0) # add batch B=1 dimension + # Extract the logits from the output and dissolve batch again. + action_logits = rl_module.forward_inference({"obs": obs_batch})[ + "action_dist_inputs" + ][0] + # PPO's default RLModule produces action logits (from which + # you have to sample an action or use the max-likelihood one). + action = numpy.argmax(action_logits.numpy()) + # Send the action to the environment for the next step. + obs, reward, terminated, truncated, info = env.step(action) + episode_return += reward + + print(f"Reached episode return of {episode_return}.") + + +If you don't have your Algorithm instance up and running anymore and would like to create the trained RLModule +from a checkpoint, you can do the following instead. +Note that `best_checkpoint` is the highest performing Algorithm checkpoint you created +in the preceding experiment. To learn more about checkpoints and their structure, see this :ref:`checkpointing guide `. + +.. testcode:: + + from pathlib import Path + + # Create only the neural network (RLModule) from our checkpoint. + rl_module = RLModule.from_checkpoint( + Path(best_checkpoint.path) / "learner_group" / "learner" / "rl_module" + )["default_policy"] -For more advanced usage on computing actions and other functionality, -you can consult the :ref:`RLlib Algorithm API documentation `. + # Do the same computations with `rl_module` as in the preceding code snippet. Accessing Policy State diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index fd52894dc432..dc1123942a84 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -230,32 +230,33 @@ def _get_learner_bundles(cf: AlgorithmConfig) -> List[Dict[str, int]]: class Algorithm(Checkpointable, Trainable, AlgorithmBase): """An RLlib algorithm responsible for optimizing one or more Policies. - Algorithms contain a EnvRunnerGroup under `self.env_runner_group`. An EnvRunnerGroup - is composed of a single local EnvRunner (`self.env_runner_group.local_env_runner`), + Algorithms contain a EnvRunnerGroup under `self.env_runner_group`, which is + composed of one local EnvRunner (`self.env_runner_group.local_env_runner`), serving as the reference copy of the NeuralNetwork(s) to be trained and optionally - one or more remote EnvRunners used to generate environment samples in parallel. + one or more remote EnvRunner actors used to generate environment samples. EnvRunnerGroup is fault-tolerant and elastic. It tracks health states for all the managed remote EnvRunner actors. As a result, Algorithm should never access the underlying actor handles directly. Instead, always access them via all the foreach APIs with assigned IDs of the underlying EnvRunners. - Each EnvRunners (remotes or local) contains a PolicyMap, which itself - may contain either one policy for single-agent training or one or more - policies for multi-agent training. 
Policies are synchronized + Each EnvRunners (remotes or local) contains an RLModule, which + contains either one policy network for single-agent training or one or more + policy networks for multi-agent training. RLModules are synchronized automatically from time to time using ray.remote calls. The exact - synchronization logic depends on the specific algorithm used, - but this usually happens from local worker to all remote workers and - after each training update. + synchronization logic depends on the specific algorithm used and typically + happens from local worker to all remote workers and after each training update. You can write your own Algorithm classes by sub-classing from `Algorithm` or any of its built-in sub-classes. - This allows you to override the `training_step` method to implement - your own algorithm logic. You can find the different built-in - algorithms' `training_step()` methods in their respective main .py files, - e.g. rllib.algorithms.dqn.dqn.py or rllib.algorithms.impala.impala.py. - - The most important API methods a Algorithm exposes are `train()`, - `evaluate()`, `save_to_path()` and `restore_from_path()`. + Override the `training_step` method to implement your own algorithm logic. You can + find the different built-in algorithms' `training_step()` methods in their + respective [algorithm name].py files, + e.g. rllib.algorithms.ppo.ppo.py or rllib.algorithms.dqn.dqn.py. + + The most important API methods an Algorithm exposes are `train()` for running a + single training iteration, `evaluate()` for running a single round of evaluation, + `save_to_path()` for creating a checkpoint, and `restore_from_path()` for loading a + state from an existing checkpoint. """ # Whether to allow unknown top-level config keys. @@ -307,7 +308,7 @@ class Algorithm(Checkpointable, Trainable, AlgorithmBase): @override(Checkpointable) def from_checkpoint( cls, - path: Optional[Union[str, Checkpoint]] = None, + path: Union[str, Checkpoint], filesystem: Optional["pyarrow.fs.FileSystem"] = None, *, # @OldAPIStack @@ -349,19 +350,17 @@ def from_checkpoint( deprecation_warning( old="Algorithm.from_checkpoint(checkpoint=...)", new="Algorithm.from_checkpoint(path=...)", - error=False, - ) - path = checkpoint - if path is None: - raise ValueError( - "`path` not provided in call to Algorithm.from_checkpoint()!" + error=True, ) - checkpoint_info = get_checkpoint_info(path) + # New API stack -> Use Checkpointable's default implementation. + if checkpoint_info["checkpoint_version"] >= version.Version("2.0"): + return super().from_checkpoint(path, filesystem=filesystem, **kwargs) + # Not possible for (v0.1) (algo class and config information missing # or very hard to retrieve). - if checkpoint_info["checkpoint_version"] == version.Version("0.1"): + elif checkpoint_info["checkpoint_version"] == version.Version("0.1"): raise ValueError( "Cannot restore a v0 checkpoint using `Algorithm.from_checkpoint()`!" "In this case, do the following:\n" @@ -375,9 +374,6 @@ def from_checkpoint( "()` must be 1.0 or later! You are using a checkpoint with " f"version v{checkpoint_info['checkpoint_version']}." ) - # New API stack -> Use Checkpointable's default implementation. - elif checkpoint_info["checkpoint_version"] >= version.Version("2.0"): - return super().from_checkpoint(path, filesystem=filesystem, **kwargs) # This is a msgpack checkpoint. 
if checkpoint_info["format"] == "msgpack": @@ -411,40 +407,6 @@ def from_checkpoint( return Algorithm.from_state(state) - @OldAPIStack - @staticmethod - def from_state(state: Dict) -> "Algorithm": - """Recovers an Algorithm from a state object. - - The `state` of an instantiated Algorithm can be retrieved by calling its - `get_state` method. It contains all information necessary - to create the Algorithm from scratch. No access to the original code (e.g. - configs, knowledge of the Algorithm's class, etc..) is needed. - - Args: - state: The state to recover a new Algorithm instance from. - - Returns: - A new Algorithm instance. - """ - algorithm_class: Type[Algorithm] = state.get("algorithm_class") - if algorithm_class is None: - raise ValueError( - "No `algorithm_class` key was found in given `state`! " - "Cannot create new Algorithm." - ) - # algo_class = get_trainable_cls(algo_class_name) - # Create the new algo. - config = state.get("config") - if not config: - raise ValueError("No `config` found in given Algorithm state!") - new_algo = algorithm_class(config=config) - # Set the new algo's state. - new_algo.__setstate__(state) - - # Return the new algo. - return new_algo - @PublicAPI def __init__( self, @@ -2279,6 +2241,40 @@ def fn(worker): if remove_from_eval_env_runners and self.eval_env_runner_group is not None: self.eval_env_runner_group.foreach_env_runner(fn, local_env_runner=True) + @OldAPIStack + @staticmethod + def from_state(state: Dict) -> "Algorithm": + """Recovers an Algorithm from a state object. + + The `state` of an instantiated Algorithm can be retrieved by calling its + `get_state` method. It contains all information necessary + to create the Algorithm from scratch. No access to the original code (e.g. + configs, knowledge of the Algorithm's class, etc..) is needed. + + Args: + state: The state to recover a new Algorithm instance from. + + Returns: + A new Algorithm instance. + """ + algorithm_class: Type[Algorithm] = state.get("algorithm_class") + if algorithm_class is None: + raise ValueError( + "No `algorithm_class` key was found in given `state`! " + "Cannot create new Algorithm." + ) + # algo_class = get_trainable_cls(algo_class_name) + # Create the new algo. + config = state.get("config") + if not config: + raise ValueError("No `config` found in given Algorithm state!") + new_algo = algorithm_class(config=config) + # Set the new algo's state. + new_algo.__setstate__(state) + + # Return the new algo. + return new_algo + @OldAPIStack def export_policy_model( self, diff --git a/rllib/examples/_docs/rllib_on_rllib_readme.py b/rllib/examples/_docs/rllib_on_rllib_readme.py index 4463eba4ce85..be63d2da2c78 100644 --- a/rllib/examples/_docs/rllib_on_rllib_readme.py +++ b/rllib/examples/_docs/rllib_on_rllib_readme.py @@ -46,7 +46,7 @@ def step(self, action): Returns: New observation, reward, done-flag, info-dict (empty). """ - # Set `done` and `truncated` flags after 10 steps. + # Set `terminated` and `truncated` flags to True after 10 steps. self.episode_len += 1 terminated = truncated = self.episode_len >= 10 # r = -abs(obs - action) @@ -60,9 +60,9 @@ def step(self, action): # act in the above environment. config = ( PPOConfig().environment( - # Env class to use (here: our gym.Env sub-class from above). + # Env class to use (your gym.Env subclass from above). env=ParrotEnv, - # Config dict to be passed to our custom env's constructor. + # Config dict to be passed to your custom env's constructor. 
env_config={"parrot_shriek_range": gym.spaces.Box(-5.0, 5.0, (1,))}, ) # Parallelize environment rollouts. From 042909da64a2a02eb662441d3af8c291f67d1735 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 8 Jan 2025 20:08:01 +0100 Subject: [PATCH 03/22] wip Signed-off-by: sven1977 --- doc/source/rllib/doc_code/rllib_in_60s.py | 25 -------- doc/source/rllib/index.rst | 72 ++++++++++++++++++----- doc/source/rllib/rllib-training.rst | 62 +++++++++++++------ 3 files changed, 99 insertions(+), 60 deletions(-) delete mode 100644 doc/source/rllib/doc_code/rllib_in_60s.py diff --git a/doc/source/rllib/doc_code/rllib_in_60s.py b/doc/source/rllib/doc_code/rllib_in_60s.py deleted file mode 100644 index 6d214504f15d..000000000000 --- a/doc/source/rllib/doc_code/rllib_in_60s.py +++ /dev/null @@ -1,25 +0,0 @@ -# flake8: noqa - -# __rllib-in-60s-begin__ -from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.connectors.env_to_module import FlattenObservations - -# 1. Configure the algorithm, -config = ( - PPOConfig() - .environment("Taxi-v3") - .env_runners( - num_env_runners=2, - # Observations are discrete (ints) -> We need to flatten (one-hot) them. - env_to_module_connector=lambda env: FlattenObservations(), - ) - .evaluation(evaluation_num_env_runners=1) -) -# 2. build the algorithm .. -algo = config.build() -# 3. .. train it .. -for _ in range(5): - print(algo.train()) -# 4. .. and evaluate it. -algo.evaluate() -# __rllib-in-60s-end__ diff --git a/doc/source/rllib/index.rst b/doc/source/rllib/index.rst index 1d7c05099141..d42a5c272583 100644 --- a/doc/source/rllib/index.rst +++ b/doc/source/rllib/index.rst @@ -101,32 +101,72 @@ Install RLlib and `PyTorch `__, as shown below: .. note:: To be able to run the Atari or MuJoCo examples, you also need to do: - `pip install "gymnasium[atari,accept-rom-license,mujoco]"`. -This is all. You can now start coding against RLlib. Here is an example for running the PPO Algorithm on the + .. code-block:: bash + + `pip install "gymnasium[atari,accept-rom-license,mujoco]"`. + +This is all, you can now start coding against RLlib. Here is an example for running the :ref:`PPO Algorithm ` on the `Taxi domain `__. -You first create a `config` for the algorithm, which defines the RL environment and -any other needed settings and parameters. +You first create a `config` for the algorithm, which defines the :ref:`RL environment ` and any other needed settings and parameters. + +.. testcode:: + + from ray.rllib.algorithms.ppo import PPOConfig + from ray.rllib.connectors.env_to_module import FlattenObservations + + # Configure the algorithm. + config = ( + PPOConfig() + .environment("Taxi-v3") + .env_runners( + num_env_runners=2, + # Observations are discrete (ints) -> We need to flatten (one-hot) them. + env_to_module_connector=lambda env: FlattenObservations(), + ) + .evaluation(evaluation_num_env_runners=1) + ) + + +Next, ``build`` the algorithm and ``train`` it for a total of five iterations. +One training iteration includes parallel, distributed sample collection by the +:py:class:`~ray.rllib.env.env_runner.EnvRunner` actors, followed by loss calculation +on the collected data, and a model update step. + +.. testcode:: + + from pprint import pprint + + # Build the algorithm. + algo = config.build_algo() + + # Train it for 5 iterations ... + for _ in range(5): + pprint(algo.train()) + +At the end of your script, you evaluate the trained Algorithm: + +.. testcode:: + + # ... and evaluate it. 
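+    # Note: `evaluate()` runs the separate evaluation EnvRunner configured above through
+    # `.evaluation(evaluation_num_env_runners=1)` and returns a results dict, similar to
+    # what `train()` returns.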
+ pprint(algo.evaluate()) + -Next, `build` the algorithm and `train` it for a total of five iterations. -One training iteration includes parallel, distributed sample collection by the :py:class:`~ray.rllib.env.env_runner.EnvRunner` actors, -followed by loss calculation on the collected data, and a model update step. +.. testcode:: + :hide: -At the end of your script, RLlib evaluates the trained Algorithm: + algo.stop() -.. literalinclude:: doc_code/rllib_in_60s.py - :language: python - :start-after: __rllib-in-60s-begin__ - :end-before: __rllib-in-60s-end__ You can use any `Farama-Foundation Gymnasium `__ registered environment -with the `env` argument. +with the ``env`` argument. -In `config.env_runners()` you can specify - amongst many other things - the number of parallel +In ``config.env_runners()`` you can specify - amongst many other things - the number of parallel :py:class:`~ray.rllib.env.env_runner.EnvRunner` actors to collect samples from the environment. -You can also tweak the NN architecture used by tweaking RLlib's `DefaultModelConfig`, as well as, set up a separate -config for the evaluation :py:class:`~ray.rllib.env.env_runner.EnvRunner` actors through the `config.evaluation()` method. +You can also tweak the NN architecture used by tweaking RLlib's :py:class:`~ray.rllib.core.rl_module.default_model_cnofig.DefaultModelConfig`, +as well as, set up a separate config for the evaluation +:py:class:`~ray.rllib.env.env_runner.EnvRunner` actors through the ``config.evaluation()`` method. `See here `_, if you want to learn more about the RLlib training APIs. Also, `see here `__ diff --git a/doc/source/rllib/rllib-training.rst b/doc/source/rllib/rllib-training.rst index 6ddf077307c2..692f3507bf90 100644 --- a/doc/source/rllib/rllib-training.rst +++ b/doc/source/rllib/rllib-training.rst @@ -7,31 +7,54 @@ Getting Started =============== -In this tutorial, you learn how to design, customize, and run an RLlib learning experiment from scratch. +.. _rllib-in-60min: +RLlib in 60 minutes +------------------- -.. _rllib-in-15min: +.. figure:: images/rllib-index-header.svg + +In this tutorial, you learn how to design, customize, and run an end-to-end RLlib learning experiment +from scratch. This includes picking and configuring an Algorithm, running a couple of training iterations, +saving the state of your Algorithm from time to time, running a separate evaluation loop, +and finally utilizing one of the checkpoints to deploy your trained model in an environment outside of RLlib +and compute actions through it. + +You also learn how to optionally customize your RL environment and your neural network model. + +Installation +~~~~~~~~~~~~ + +First, install RLlib and `PyTorch `__, as shown below: + +.. code-block:: bash + + pip install "ray[rllib]" "gymnasium[atari,accept-rom-license,mujoco]" torch -RLlib in 15 minutes -------------------- .. _rllib-python-api: Python API ~~~~~~~~~~ -You manage experiments in RLlib through an instance of the :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` class. An -Algorithm typically holds a neural network for computing actions, called "policy", the :ref:`RL environment ` -you want to optimize against, a loss function, an optimizer, and some code describing the algorithm's execution logic, like determining when to -take which particular steps. +RLlib's Python API provides all the flexibility required for applying the library to any +type of RL problem. 
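+For example, a single :py:class:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig` object chains
+together environment setup, scaling, training hyper-parameters, and evaluation options. The following
+sketch only combines settings that appear individually elsewhere in this guide; the concrete values are
+placeholders, not tuned recommendations:
+
+.. testcode::
+
+    from ray.rllib.algorithms.ppo import PPOConfig
+
+    sketch_config = (
+        PPOConfig()
+        # The RL environment to train against.
+        .environment("CartPole-v1")
+        # How many EnvRunner actors collect samples in parallel.
+        .env_runners(num_env_runners=2)
+        # Core training hyper-parameters (placeholder values).
+        .training(lr=0.0002, train_batch_size_per_learner=2000, num_epochs=10)
+        # A separate EnvRunner setup used only for evaluation.
+        .evaluation(evaluation_num_env_runners=1)
+    )
+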
+ +You manage experiments in RLlib through an instance of the :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` +class. An :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` typically holds a neural +network for computing actions, called "policy", the :ref:`RL environment ` +you want to optimize against, a loss function, an optimizer, and some code describing the +algorithm's execution logic, like determining when to take which particular steps. + +In multi-agent training, :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` +manages the querying and optimization of multiple policies at once. -Through the algorithm's interface, you can train the policy, compute actions, or store your algorithm's state (checkpointing). -In multi-agent training, the algorithm manages the querying and optimization of multiple policies at once. +Through the algorithm's interface, you can train the policy, compute actions, or store your +algorithm's state through checkpointing. -The Python API provides all the flexibility required for applying RLlib to any type of problem. Let's start with an example of the API's basic usage. We first create a `PPOConfig` instance and set some properties through the config class' various methods. @@ -114,7 +137,7 @@ of the training results and retrieving the checkpoint(s) of the trained agent. Loading and restoring a trained algorithm from a checkpoint is simple. Let's assume you have a local checkpoint directory called ``checkpoint_path``. -To load newer RLlib checkpoints (version >= 1.0), use the following code: +To load newer RLlib checkpoints (version >= 2.1), use the following code: .. code-block:: python @@ -128,8 +151,8 @@ Customizing your RL environment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In the preceding examples, your :ref:`RL environment ` was a `Farama gymnasium `__ -pre-registered one, like ``CartPole-v1``. However, if you would like to run your experiments against a different environment or even write a custom one, -see this tab below for a less-than-50-lines example of a custom ``gym.Env`` class. +pre-registered one, like ``CartPole-v1``. However, if you would like to run your experiments against a custom one, +see this tab below for a less-than-50-lines example. See here for an :ref:`in-depth guide on how to setup RL environments in RLlib ` and how to customize them. @@ -141,7 +164,7 @@ See here for an :ref:`in-depth guide on how to setup RL environments in RLlib Date: Thu, 9 Jan 2025 10:05:20 +0100 Subject: [PATCH 04/22] wip Signed-off-by: sven1977 --- doc/source/rllib/rllib-training.rst | 88 ++++++++++++++++++++++++----- 1 file changed, 73 insertions(+), 15 deletions(-) diff --git a/doc/source/rllib/rllib-training.rst b/doc/source/rllib/rllib-training.rst index 692f3507bf90..c86ac015b137 100644 --- a/doc/source/rllib/rllib-training.rst +++ b/doc/source/rllib/rllib-training.rst @@ -1,12 +1,12 @@ .. include:: /_includes/rllib/we_are_hiring.rst -.. include:: /_includes/rllib/new_api_stack.rst - .. _rllib-getting-started: Getting Started =============== +.. include:: /_includes/rllib/new_api_stack.rst + .. _rllib-in-60min: RLlib in 60 minutes @@ -20,7 +20,7 @@ saving the state of your Algorithm from time to time, running a separate evaluat and finally utilizing one of the checkpoints to deploy your trained model in an environment outside of RLlib and compute actions through it. -You also learn how to optionally customize your RL environment and your neural network model. +You also learn how to customize your RL environment and your neural network model. 
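+
+As a preview, the sketch below condenses this workflow into a few lines; names such as ``quick_algo`` are
+placeholders, and each step is explained in detail in the sections that follow:
+
+.. code-block:: python
+
+    from ray.rllib.algorithms.ppo import PPOConfig
+
+    # Configure and build an Algorithm ...
+    quick_algo = (
+        PPOConfig()
+        .environment("CartPole-v1")
+        .evaluation(evaluation_num_env_runners=1)
+        .build_algo()
+    )
+    # ... train it for one iteration and run one round of evaluation ...
+    quick_algo.train()
+    quick_algo.evaluate()
+    # ... then save a checkpoint you can later restore the trained model from.
+    quick_ckpt_path = quick_algo.save_to_path()
+    quick_algo.stop()
+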
Installation ~~~~~~~~~~~~ @@ -46,24 +46,28 @@ network for computing actions, called "policy", the :ref:`RL environment `, +:py:class:`~ray.rllib.algorithms.algorithm.Algorithm` manages the querying and optimization of multiple policies at once. Through the algorithm's interface, you can train the policy, compute actions, or store your algorithm's state through checkpointing. +Configure and build the algorithm ++++++++++++++++++++++++++++++++++ +You first create an :py:class:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig` instance +and change some default settings through the config object's various methods. +For example, we can set the RL environment we want to use by calling the config's +:py:meth:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig.environment` method. -Let's start with an example of the API's basic usage. -We first create a `PPOConfig` instance and set some properties through the config class' various methods. -For example, we can set the RL environment we want to use by calling the config's `environment` method. -To scale our algorithm and define, how many environment workers (EnvRunners) we want to leverage, we can call -the `env_runners` method. -After we `build` the `PPO` Algorithm from its configuration, we can `train` it for a number of -iterations (here `10`) and `save` the resulting policy periodically (here every `5` iterations). +To scale our setup and define, how many EnvRunner actors you want to leverage, +you can call the :py:meth:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig.env_runners` method. +Finally, you build the actual :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` instance +through calling the :py:meth:`~ray.rllib.algorithm.algorithm_config.AlgorithmConfig.build_algo` +method. .. testcode:: @@ -72,11 +76,65 @@ iterations (here `10`) and `save` the resulting policy periodically (here every # Configure the Algorithm (PPO). config = ( PPOConfig() - .environment("CartPole-v1") - .env_runners(num_env_runners=1) + .environment("Pendulum-v1") + .env_runners(num_env_runners=3) + .training( + lr=0.0002, + train_batch_size_per_learner=2000, + num_epochs=10, + ) ) + # Build the Algorithm (PPO). - ppo = config.build() + ppo = config.build_algo() + + +.. note:: + + See here to learn, which config methods you can use to configure your Algorithm, see here. + + +Run the algorithm ++++++++++++++++++ + +After you have built your :ref:`PPO ` from its configuration, you can ``train`` it for a number of +iterations through calling its :py:meth:`~ray.rllib.algorithms.algorithm.Algorithm.train` method. + +It returns a result dictionary that you can pretty-print for debugging purposes: + +.. testcode:: + + from pprint import pprint + + for _ in range(5): + pprint(ppo.train()) + + +Checkpoint the algorithm +++++++++++++++++++++++++ + +To save your Algorithm's current state, create a so-called ``checkpoint`` through +calling its `save_to_path` method. It returns the location of the saved checkpoint. + +Alternatively to not passing any arguments and letting the algorithm decide, where to save +the checkpoint, you can provide a checkpoint directory yourself: + +.. testcode:: + + checkpoint_path = ppo.save_to_path() + + # OR: + # ppo.save_to_path([a checkpoint location of your choice]) + + +Evaluate the algorithm +++++++++++++++++++++++ + + +Restore the model + + +Let's start with an example of the API's basic usage. # Train for 10 iterations. 
for i in range(10): From e0d6ce65d09361b5290a544ec8a6f22b8649833b Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 17 Jan 2025 21:03:28 +0100 Subject: [PATCH 05/22] wip Signed-off-by: sven1977 --- doc/source/rllib/doc_code/getting_started.py | 81 --- doc/source/rllib/doc_code/training.py | 3 +- doc/source/rllib/getting-started.rst | 487 +++++++++++++++++++ doc/source/rllib/index.rst | 2 +- doc/source/rllib/rllib-training.rst | 462 +----------------- 5 files changed, 493 insertions(+), 542 deletions(-) delete mode 100644 doc/source/rllib/doc_code/getting_started.py create mode 100644 doc/source/rllib/getting-started.rst diff --git a/doc/source/rllib/doc_code/getting_started.py b/doc/source/rllib/doc_code/getting_started.py deleted file mode 100644 index 95236f14aa41..000000000000 --- a/doc/source/rllib/doc_code/getting_started.py +++ /dev/null @@ -1,81 +0,0 @@ -# flake8: noqa - -if False: - # __rllib-tune-config-begin__ - from ray import train, tune - - config = ( - PPOConfig() - .environment("CartPole-v1") - .training( - lr=tune.grid_search([0.01, 0.001, 0.0001]), - ) - ) - - tuner = tune.Tuner( - "PPO", - param_space=config, - run_config=train.RunConfig( - stop={"env_runners/episode_return_mean": 150.0}, - ), - ) - - tuner.fit() - # __rllib-tune-config-end__ - - -# __rllib-tuner-begin__ -from ray import train, tune - -# Tuner.fit() allows setting a custom log directory (other than ~/ray-results). -tuner = tune.Tuner( - "PPO", - param_space=config, - run_config=train.RunConfig( - stop={"num_env_steps_sampled_lifetime": 20000}, - checkpoint_config=train.CheckpointConfig(checkpoint_at_end=True), - ), -) - -results = tuner.fit() - -# Get the best result based on a particular metric. -best_result = results.get_best_result( - metric="env_runners/episode_return_mean", mode="max" -) - -# Get the best checkpoint corresponding to the best result. -best_checkpoint = best_result.checkpoint -# __rllib-tuner-end__ - - - - -del rl_module - - -# __rllib-get-state-begin__ -from ray.rllib.algorithms.ppo import PPOConfig - -algo = ( - PPOConfig() - .environment("CartPole-v1") - .env_runners(num_env_runners=2) -).build() - -# Get weights of the algo's RLModule. -algo.get_module().get_state() - -# Same as above -algo.env_runner.module.get_state() - -# Get list of weights of each EnvRunner, including remote replicas. -algo.env_runner_group.foreach_worker(lambda env_runner: env_runner.module.get_state()) - -# Same as above, but with index. -algo.env_runner_group.foreach_worker_with_id( - lambda _id, env_runner: env_runner.module.get_state() -) -# __rllib-get-state-end__ - -algo.stop() diff --git a/doc/source/rllib/doc_code/training.py b/doc/source/rllib/doc_code/training.py index 75bf8a48f18c..2e50cd6e0425 100644 --- a/doc/source/rllib/doc_code/training.py +++ b/doc/source/rllib/doc_code/training.py @@ -36,7 +36,8 @@ algo = ( DQNConfig() .api_stack( - enable_rl_module_and_learner=False, enable_env_runner_and_connector_v2=False + enable_rl_module_and_learner=False, + enable_env_runner_and_connector_v2=False, ) .framework("torch") .environment("CartPole-v1") diff --git a/doc/source/rllib/getting-started.rst b/doc/source/rllib/getting-started.rst new file mode 100644 index 000000000000..f280185cc4e6 --- /dev/null +++ b/doc/source/rllib/getting-started.rst @@ -0,0 +1,487 @@ +.. include:: /_includes/rllib/we_are_hiring.rst + +.. _rllib-getting-started: + +Getting Started +=============== + +.. include:: /_includes/rllib/new_api_stack.rst + +.. _rllib-in-60min: + +RLlib in 60 minutes +------------------- + +.. 
figure:: images/rllib-index-header.svg
+
+In this tutorial, you learn how to design, customize, and run an end-to-end RLlib learning experiment
+from scratch. This includes picking and configuring an :py:class:`~ray.rllib.algorithms.algorithm.Algorithm`,
+running a couple of training iterations, saving the state of your
+:py:class:`~ray.rllib.algorithms.algorithm.Algorithm` from time to time, running a separate
+evaluation loop, and finally utilizing one of the checkpoints to deploy your trained model
+to an environment outside of RLlib and compute actions.
+
+You also learn how to customize your :ref:`RL environment `
+and your :ref:`neural network model `.
+
+Installation
+~~~~~~~~~~~~
+
+First, install RLlib, `PyTorch `__, and `Farama Gymnasium `__ as shown below:
+
+.. code-block:: bash
+
+    pip install "ray[rllib]" torch "gymnasium[atari,accept-rom-license,mujoco]"
+
+
+.. _rllib-python-api:
+
+Python API
+~~~~~~~~~~
+
+RLlib's Python API provides all the flexibility required for applying the library to any
+type of RL problem.
+
+You manage RLlib experiments through an instance of the :py:class:`~ray.rllib.algorithms.algorithm.Algorithm`
+class. An :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` typically holds a neural
+network for computing actions, called the ``policy``, the :ref:`RL environment `
+that you want to optimize against, a loss function, an optimizer, and some code describing the
+algorithm's execution logic, for example when to collect samples and when to update your model.
+
+In :ref:`multi-agent training `,
+:py:class:`~ray.rllib.algorithms.algorithm.Algorithm` manages the querying and optimization of multiple policies at once.
+
+Through the algorithm's interface, you can train the policy, compute actions, or store your
+algorithm's state through checkpointing.
+
+
+Configure and build the algorithm
++++++++++++++++++++++++++++++++++
+
+You first create an :py:class:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig` instance
+and change some default settings through the config object's various methods.
+
+For example, you can set the :ref:`RL environment ` you want to train against
+by calling the config's :py:meth:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig.environment`
+method:
+
+.. testcode::
+
+    from ray.rllib.algorithms.ppo import PPOConfig
+
+    # Create a config instance for the PPO algorithm.
+    config = (
+        PPOConfig()
+        .environment("Pendulum-v1")
+    )
+
+
+To scale your setup and define how many EnvRunner actors you want to use for parallel sample
+collection, call the :py:meth:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig.env_runners` method:
+
+.. testcode::
+
+    config.env_runners(num_env_runners=3)
+
+For training-related settings or any algorithm-specific settings, use the
+:py:meth:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig.training` method:
+
+.. testcode::
+
+    config.training(
+        lr=0.0002,
+        train_batch_size_per_learner=2000,
+        num_epochs=10,
+    )
+
+Finally, you build the actual :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` instance
+by calling your config's :py:meth:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig.build_algo`
+method:
+
+.. testcode::
+
+    # Build the Algorithm (PPO).
+    ppo = config.build_algo()
+
+
+.. note::
+
+    See here to learn about the :ref:`methods you can use to configure your Algorithm `.
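+
+As a quick sketch of how the config object behaves: each of these methods mutates and returns the
+same :py:class:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig` object, so you can equally
+write the whole configuration as one chained expression and read settings back as plain attributes.
+The attribute names below are assumed to mirror the keyword arguments used above; verify them
+against your RLlib version.
+
+.. testcode::
+
+    from ray.rllib.algorithms.ppo import PPOConfig
+
+    # Same settings as above, written as a single chained expression.
+    chained_config = (
+        PPOConfig()
+        .environment("Pendulum-v1")
+        .env_runners(num_env_runners=3)
+        .training(
+            lr=0.0002,
+            train_batch_size_per_learner=2000,
+            num_epochs=10,
+        )
+    )
+
+    # Settings can be read back as attributes of the config object.
+    assert chained_config.num_env_runners == 3
+    assert chained_config.lr == 0.0002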
+ + +Run the algorithm ++++++++++++++++++ + +After you built your :ref:`PPO ` from its configuration, you can ``train`` it for a number of +iterations through calling the :py:meth:`~ray.rllib.algorithms.algorithm.Algorithm.train` method, +which returns a result dictionary that you can pretty-print for debugging purposes: + +.. testcode:: + + from pprint import pprint + + for _ in range(5): + pprint(ppo.train()) + + +Checkpoint the algorithm +++++++++++++++++++++++++ + +To save the current state of your :py:class:`~ray.rllib.algorithms.algorithm.Algorithm`, +create a ``checkpoint`` through calling its :py:meth:`~ray.rllib.algorithms.algorithm.Algorithm.save_to_path` method, +which returns the directory of the saved checkpoint. + +Instead of not passing any arguments to this call and letting the algorithm decide where to save +the checkpoint, you can also provide a checkpoint directory yourself: + +.. testcode:: + + checkpoint_path = ppo.save_to_path() + + # OR: + # ppo.save_to_path([a checkpoint location of your choice]) + + +Evaluate the algorithm +++++++++++++++++++++++ + +RLlib supports setting up a separate :py:class:`~ray.rllib.env.env_runner_group.EnvRunnerGroup` +for the sole purpose of evaluating your model from time to time on the RL environment. + +Use your config's :py:meth:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig.evaluation` method +to set up the details. By default, RLlib doesn't perform evaluation during training and only reports the +results of collecting training samples with its "regular" :py:class:`~ray.rllib.env.env_runner_group.EnvRunnerGroup`. + + +.. testcode:: + :hide: + + ppo.stop() + + +.. testcode:: + + config.evaluation( + # Run one evaluation round every iteration. + evaluation_interval=1, + + # Create 2 eval EnvRunners in the extra EnvRunnerGroup. + evaluation_num_env_runners=2, + + # Run evaluation for exactly 10 episodes. Note that because you have + # 2 EnvRunners, each one runs through 5 episodes. + evaluation_duration_unit="episodes", + evaluation_duration=10, + ) + + # Rebuild the PPO, but with the extra evaluation EnvRunnerGroup + ppo_with_evaluation = config.build() + + for _ in range(3): + pprint(ppo_with_evaluation.train()) + + +.. _rllib-with-ray-tune: + +RLlib with Ray Tune ++++++++++++++++++++ + +All RLlib :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` classes are compatible with +the :ref:`Ray Tune API `. + +This allows for easy utilization of your configured :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` in +:ref:`Ray Tune ` experiments. + +For example, the following code performs a simple hyper-parameter sweep of your :ref:`PPO ` +through creating three ``Trials``, one for each configured learning rate: + +.. testcode:: + + from ray import train, tune + from ray.rllib.algorithms.ppo import PPOConfig + + config = ( + PPOConfig() + .environment("Pendulum-v1") + # Specify a simple tune hyperparameter sweep. + .training( + lr=tune.grid_search([0.001, 0.0005, 0.0001]), + ) + ) + + # Create a Tuner instance to manage the trials. + tuner = tune.Tuner( + config.algo_class, + param_space=config, + # Specify a stopping criterion. Note that the criterion has to match one of the + # pretty printed result metrics from the results returned previously by + # ``.train()``. + run_config=train.RunConfig( + stop={"env_runners/episode_return_mean": -1000.0}, + ), + ) + # Run the Tuner and capture the results. 
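+    # Note: if you want each trial to write a final checkpoint to restore from later,
+    # you can pass, for example,
+    # `checkpoint_config=train.CheckpointConfig(checkpoint_at_end=True)` into the
+    # RunConfig above. The best-checkpoint retrieval further below assumes that each
+    # trial has saved at least one checkpoint.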
+    results = tuner.fit()
+
+Note that Tune creates a separate :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` instance
+for each :py:class:`~ray.tune.trial.Trial` as a :ref:`Ray actor `, assigns compute resources
+to each trial, and runs the trials in parallel, if possible, on your Ray cluster:
+
+.. code-block:: text
+
+    Trial status: 3 RUNNING
+    Current time: 2025-01-17 18:47:33. Total running time: 3min 0s
+    Logical resource usage: 9.0/12 CPUs, 0/0 GPUs
+    ╭──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
+    │ Trial name                    status        lr    iter    total time (s)    episode_return_mean    ..._sampled_lifetime   │
+    ├──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤
+    │ PPO_Pendulum-v1_b5c41_00000   RUNNING   0.01        29           86.2426               -998.449                  108000   │
+    │ PPO_Pendulum-v1_b5c41_00001   RUNNING   0.001       25           74.4335               -997.079                  100000   │
+    │ PPO_Pendulum-v1_b5c41_00002   RUNNING   0.0001      20           60.0421               -960.293                   80000   │
+    ╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
+
+``Tuner.fit()`` returns a ``ResultGrid`` object that allows for a detailed analysis of the
+training process and for retrieving the :ref:`checkpoints ` of the trained
+algorithms and their models:
+
+.. testcode::
+
+    # Get the best result based on a particular metric.
+    best_result = results.get_best_result(
+        metric="env_runners/episode_return_mean", mode="max"
+    )
+
+    # Get the best checkpoint corresponding to the best result
+    # from the preceding experiment.
+    best_checkpoint = best_result.checkpoint
+
+
+Deploy a trained model for production inference
++++++++++++++++++++++++++++++++++++++++++++++++
+
+After training, you might want to deploy your models into a new environment, for example
+to run inference in production. You can do so using the checkpoint directory created in the
+preceding example. To read more about checkpoints, model deployments, and restoring algorithm state,
+see this :ref:`page on checkpointing `.
+
+.. testcode::
+
+    from pathlib import Path
+    import gymnasium as gym
+    import numpy as np
+    import torch
+    from ray.rllib.core.rl_module import RLModule
+
+    # Create only the neural network (RLModule) from our checkpoint.
+    rl_module = RLModule.from_checkpoint(
+        Path(best_checkpoint.path) / "learner_group" / "learner" / "rl_module"
+    )["default_policy"]
+
+    # Create the RL environment to test against (same as was used for
+    # training earlier).
+    env = gym.make("Pendulum-v1")
+
+    episode_return = 0.0
+    done = False
+
+    # Reset the env to get the initial observation.
+    obs, info = env.reset()
+
+    while not done:
+        # Compute the next action from a batch (B=1) of observations.
+        obs_batch = torch.from_numpy(obs).unsqueeze(0)  # add batch B=1 dimension
+
+        # Extract the action distribution inputs from the output and dissolve the batch again.
+        action_logits = rl_module.forward_inference({"obs": obs_batch})[
+            "action_dist_inputs"
+        ][0]
+
+        # For Pendulum-v1's continuous (Box) action space, PPO's default RLModule
+        # outputs the parameters of a diagonal Gaussian: [mean, log_std].
+        # For greedy (deterministic) inference, act with the mean.
+        action = action_logits[:1].numpy()
+
+        # Send the action to the environment for the next step.
+        obs, reward, terminated, truncated, info = env.step(action)
+
+        # Perform env-loop bookkeeping.
+        episode_return += reward
+        done = terminated or truncated
+
+    print(f"Reached episode return of {episode_return}.")
+
+
+If you still have an :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` instance up and running
+in your script, you can also get the :py:class:`~ray.rllib.core.rl_module.rl_module.RLModule` through the
+:py:meth:`~ray.rllib.algorithms.algorithm.Algorithm.get_module` method:
+
+.. testcode::
+
+    rl_module = ppo_with_evaluation.get_module("default_policy")
+
+
+Customizing your RL environment
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In the preceding examples, your :ref:`RL environment ` was
+a `Farama gymnasium `__ pre-registered one,
+like ``Pendulum-v1`` or ``CartPole-v1``. However, if you would like to run your
+experiments against a custom one, see the dropdown below for a less-than-50-lines example.
+
+See here for an :ref:`in-depth guide on how to set up RL environments in RLlib ` and how to customize them.
+
+.. dropdown:: Quickstart: Custom RL environment
+    :animate: fade-in-slide-down
+
+    .. testcode::
+
+        import gymnasium as gym
+        import numpy as np
+        from ray.rllib.algorithms.ppo import PPOConfig
+
+        # Define your custom env class by subclassing gymnasium.Env:
+
+        class ParrotEnv(gym.Env):
+            """Environment in which the agent learns to repeat the seen observations.
+
+            Observations are float numbers indicating the to-be-repeated values,
+            e.g. -1.0, 5.1, or 3.2.
+            The action space is the same as the observation space.
+            Rewards are `r=-abs([observation] - [action])` for all steps.
+            """
+            def __init__(self, config=None):
+                # Since actions should repeat observations, their spaces must be the same.
+                self.observation_space = gym.spaces.Box(-1.0, 1.0, (1,), np.float32)
+                self.action_space = self.observation_space
+                self._cur_obs = None
+                self._episode_len = 0
+
+            def reset(self, *, seed=None, options=None):
+                """Resets the environment, starting a new episode."""
+                # Reset the episode len.
+                self._episode_len = 0
+                # Sample a random number from our observation space.
+                self._cur_obs = self.observation_space.sample()
+                # Return initial observation.
+                return self._cur_obs, {}
+
+            def step(self, action):
+                """Takes a single step in the episode given `action`."""
+                # Set `terminated` and `truncated` flags to True after 10 steps.
+                self._episode_len += 1
+                terminated = truncated = self._episode_len >= 10
+                # Compute the reward: `r = -abs([obs] - [action])`
+                reward = -sum(abs(self._cur_obs - action))
+                # Set a new observation (random sample).
+                self._cur_obs = self.observation_space.sample()
+                return self._cur_obs, reward, terminated, truncated, {}
+
+        # Point your config to your custom env class:
+        config = (
+            PPOConfig()
+            .environment(ParrotEnv)  # Pass `env_config={...}` here to further customize the env.
+        )
+
+        # Build a PPO algorithm and train it.
+        ppo_w_custom_env = config.build_algo()
+        ppo_w_custom_env.train()
+
+    .. testcode::
+        :hide:
+
+        # Clean up after the test run.
+        ppo_w_custom_env.stop()
+
+
+Customizing your models
+~~~~~~~~~~~~~~~~~~~~~~~
+
+In the preceding examples, RLlib provided a default neural network model for you because you didn't
+specify any model settings in your AlgorithmConfig. To reconfigure the type and size of RLlib's default
+models, for example the number of hidden layers and their activation functions, or to write your own
+custom models from scratch using PyTorch, see here for a detailed guide on how to do so.
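+
+As a minimal sketch of the first option, reconfiguring the default model, you can pass a
+:py:class:`~ray.rllib.core.rl_module.default_model_config.DefaultModelConfig` to your config's
+``rl_module()`` method. Treat the exact field names below, ``fcnet_hiddens`` and
+``fcnet_activation``, as assumptions to verify against your RLlib version:
+
+.. testcode::
+
+    from ray.rllib.algorithms.ppo import PPOConfig
+    from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig
+
+    config = (
+        PPOConfig()
+        .environment("Pendulum-v1")
+        # Swap the default fully connected network for a wider one with
+        # two hidden layers of 128 units each and ReLU activations.
+        .rl_module(
+            model_config=DefaultModelConfig(
+                fcnet_hiddens=[128, 128],
+                fcnet_activation="relu",
+            )
+        )
+    )
+
+    ppo_with_custom_model_config = config.build_algo()
+    ppo_with_custom_model_config.stop()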
+ + + + +Accessing Model State +~~~~~~~~~~~~~~~~~~~~~ + +Similar to accessing policy state, you may want to get a reference to the +underlying neural network model being trained. For example, you may want to +pre-train it separately, or otherwise update its weights outside of RLlib. +This can be done by accessing the ``model`` of the policy. + +Below you find three explicit examples showing how to access the model state of +an algorithm. + +.. dropdown:: **Example: Preprocessing observations for feeding into a model** + + + Then for the code: + + .. literalinclude:: doc_code/training.py + :language: python + :start-after: __preprocessing_observations_start__ + :end-before: __preprocessing_observations_end__ + +.. dropdown:: **Example: Querying a policy's action distribution** + + .. literalinclude:: doc_code/training.py + :language: python + :start-after: __query_action_dist_start__ + :end-before: __query_action_dist_end__ + +.. dropdown:: **Example: Getting Q values from a DQN model** + + .. literalinclude:: doc_code/training.py + :language: python + :start-after: __get_q_values_dqn_start__ + :end-before: __get_q_values_dqn_end__ + + This is especially useful when used with + `custom model classes `__. + + +.. Debugging RLlib Experiments + --------------------------- + Eager Mode + ~~~~~~~~~~ + Policies built with ``build_tf_policy`` (most of the reference algorithms are) + can be run in eager mode by setting the + ``"framework": "tf2"`` / ``"eager_tracing": true`` config options. + This will tell RLlib to execute the model forward pass, action distribution, + loss, and stats functions in eager mode. + Eager mode makes debugging much easier, since you can now use line-by-line + debugging with breakpoints or Python ``print()`` to inspect + intermediate tensor values. + However, eager can be slower than graph mode unless tracing is enabled. + Episode Traces + ~~~~~~~~~~~~~~ + You can use the `data output API `__ to save episode traces + for debugging. For example, the following command will run PPO while saving episode + traces to ``/tmp/debug``. + .. code-block:: bash + cd rllib/tuned_examples/ppo + python cartpole_ppo.py --output /tmp/debug + # episode traces will be saved in /tmp/debug, for example + output-2019-02-23_12-02-03_worker-2_0.json + output-2019-02-23_12-02-04_worker-1_0.json +Log Verbosity +~~~~~~~~~~~~~ +You can control the log level via the ``"log_level"`` flag. Valid values are "DEBUG", +"INFO", "WARN" (default), and "ERROR". This can be used to increase or decrease the +verbosity of internal logging. +For example: + .. code-block:: bash + cd rllib/tuned_examples/ppo + python atari_ppo.py --env ALE/Pong-v5 --log-level INFO + python atari_ppo.py --env ALE/Pong-v5 --log-level DEBUG +The default log level is ``WARN``. We strongly recommend using at least ``INFO`` +level logging for development. +Stack Traces +~~~~~~~~~~~~ +You can use the ``ray stack`` command to dump the stack traces of all the +Python workers on a single node. This can be useful for debugging unexpected +hangs or performance issues. +Next Steps +---------- +- To check how your application is doing, you can use the :ref:`Ray dashboard `. diff --git a/doc/source/rllib/index.rst b/doc/source/rllib/index.rst index 107ddbc02728..2bee34ef0931 100644 --- a/doc/source/rllib/index.rst +++ b/doc/source/rllib/index.rst @@ -102,7 +102,7 @@ Install RLlib and `PyTorch `__, as shown below: .. code-block:: bash - `pip install "gymnasium[atari,accept-rom-license,mujoco]"`. 
+ pip install "gymnasium[atari,accept-rom-license,mujoco]" This is all, you can now start coding against RLlib. Here is an example for running the :ref:`PPO Algorithm ` on the `Taxi domain `__. diff --git a/doc/source/rllib/rllib-training.rst b/doc/source/rllib/rllib-training.rst index c86ac015b137..22ad7993cebe 100644 --- a/doc/source/rllib/rllib-training.rst +++ b/doc/source/rllib/rllib-training.rst @@ -1,460 +1,4 @@ -.. include:: /_includes/rllib/we_are_hiring.rst +.. raw:: html -.. _rllib-getting-started: - -Getting Started -=============== - -.. include:: /_includes/rllib/new_api_stack.rst - -.. _rllib-in-60min: - -RLlib in 60 minutes -------------------- - -.. figure:: images/rllib-index-header.svg - -In this tutorial, you learn how to design, customize, and run an end-to-end RLlib learning experiment -from scratch. This includes picking and configuring an Algorithm, running a couple of training iterations, -saving the state of your Algorithm from time to time, running a separate evaluation loop, -and finally utilizing one of the checkpoints to deploy your trained model in an environment outside of RLlib -and compute actions through it. - -You also learn how to customize your RL environment and your neural network model. - -Installation -~~~~~~~~~~~~ - -First, install RLlib and `PyTorch `__, as shown below: - -.. code-block:: bash - - pip install "ray[rllib]" "gymnasium[atari,accept-rom-license,mujoco]" torch - - -.. _rllib-python-api: - -Python API -~~~~~~~~~~ - -RLlib's Python API provides all the flexibility required for applying the library to any -type of RL problem. - -You manage experiments in RLlib through an instance of the :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` -class. An :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` typically holds a neural -network for computing actions, called "policy", the :ref:`RL environment ` -you want to optimize against, a loss function, an optimizer, and some code describing the -algorithm's execution logic, like determining when to take which particular steps. - -In :ref:`multi-agent training `, -:py:class:`~ray.rllib.algorithms.algorithm.Algorithm` manages the querying and optimization of multiple policies at once. - -Through the algorithm's interface, you can train the policy, compute actions, or store your -algorithm's state through checkpointing. - - -Configure and build the algorithm -+++++++++++++++++++++++++++++++++ - -You first create an :py:class:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig` instance -and change some default settings through the config object's various methods. - -For example, we can set the RL environment we want to use by calling the config's -:py:meth:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig.environment` method. - -To scale our setup and define, how many EnvRunner actors you want to leverage, -you can call the :py:meth:`~ray.rllib.algorithms.algorithm_config.AlgorithmConfig.env_runners` method. - -Finally, you build the actual :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` instance -through calling the :py:meth:`~ray.rllib.algorithm.algorithm_config.AlgorithmConfig.build_algo` -method. - -.. testcode:: - - from ray.rllib.algorithms.ppo import PPOConfig - - # Configure the Algorithm (PPO). - config = ( - PPOConfig() - .environment("Pendulum-v1") - .env_runners(num_env_runners=3) - .training( - lr=0.0002, - train_batch_size_per_learner=2000, - num_epochs=10, - ) - ) - - # Build the Algorithm (PPO). - ppo = config.build_algo() - - -.. 
note:: - - See here to learn, which config methods you can use to configure your Algorithm, see here. - - -Run the algorithm -+++++++++++++++++ - -After you have built your :ref:`PPO ` from its configuration, you can ``train`` it for a number of -iterations through calling its :py:meth:`~ray.rllib.algorithms.algorithm.Algorithm.train` method. - -It returns a result dictionary that you can pretty-print for debugging purposes: - -.. testcode:: - - from pprint import pprint - - for _ in range(5): - pprint(ppo.train()) - - -Checkpoint the algorithm -++++++++++++++++++++++++ - -To save your Algorithm's current state, create a so-called ``checkpoint`` through -calling its `save_to_path` method. It returns the location of the saved checkpoint. - -Alternatively to not passing any arguments and letting the algorithm decide, where to save -the checkpoint, you can provide a checkpoint directory yourself: - -.. testcode:: - - checkpoint_path = ppo.save_to_path() - - # OR: - # ppo.save_to_path([a checkpoint location of your choice]) - - -Evaluate the algorithm -++++++++++++++++++++++ - - -Restore the model - - -Let's start with an example of the API's basic usage. - - # Train for 10 iterations. - for i in range(10): - result = ppo.train() - result.pop("config") - print(result) - - # Checkpoint every 5 iterations. - if i % 5 == 0: - checkpoint_dir = ppo.save_to_path() - print(f"Algorithm checkpoint saved in: {checkpoint_dir}") - - - -.. _rllib-with-ray-tune: - -RLlib with Ray Tune -~~~~~~~~~~~~~~~~~~~ - -All RLlib algorithms are compatible with the :ref:`Tune API `. -This enables them to be easily used in experiments with :ref:`Ray Tune `. -For example, the following code performs a simple hyper-parameter sweep of PPO. - - -.. literalinclude:: ./doc_code/getting_started.py - :dedent: 4 - :language: python - :start-after: rllib-tune-config-begin - :end-before: rllib-tune-config-end - -Tune will schedule the trials to run in parallel on your Ray cluster: - -:: - - == Status == - Using FIFO scheduling algorithm. - Resources requested: 4/4 CPUs, 0/0 GPUs - Result logdir: ~/ray_results/my_experiment - PENDING trials: - - PPO_CartPole-v1_2_lr=0.0001: PENDING - RUNNING trials: - - PPO_CartPole-v1_0_lr=0.01: RUNNING [pid=21940], 16 s, 4013 ts, 22 rew - - PPO_CartPole-v1_1_lr=0.001: RUNNING [pid=21942], 27 s, 8111 ts, 54.7 rew - -``Tuner.fit()`` returns an ``ResultGrid`` object that allows further analysis -of the training results and retrieving the checkpoint(s) of the trained agent. - -.. literalinclude:: ./doc_code/getting_started.py - :dedent: 0 - :language: python - :start-after: rllib-tuner-begin - :end-before: rllib-tuner-end - -.. note:: - - You can find your checkpoint's version by - looking into the ``rllib_checkpoint.json`` file inside your checkpoint directory. - -Loading and restoring a trained algorithm from a checkpoint is simple. -Let's assume you have a local checkpoint directory called ``checkpoint_path``. -To load newer RLlib checkpoints (version >= 2.1), use the following code: - - -.. code-block:: python - - from ray.rllib.algorithms.algorithm import Algorithm - - algo = Algorithm.from_checkpoint(checkpoint_path) - - -Customizing your RL environment -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -In the preceding examples, your :ref:`RL environment ` was a `Farama gymnasium `__ -pre-registered one, like ``CartPole-v1``. However, if you would like to run your experiments against a custom one, -see this tab below for a less-than-50-lines example. 
- -See here for an :ref:`in-depth guide on how to setup RL environments in RLlib ` and how to customize them. - -.. dropdown:: Quickstart: Custom RL environment - :animate: fade-in-slide-down - - .. testcode:: - - import gymnasium as gym - from ray.rllib.algorithms.ppo import PPOConfig - - # Define your custom env class by subclassing gymnasium.Env: - - class ParrotEnv(gym.Env): - """Environment in which the agent learns to repeat the seen observations. - - Observations are float numbers indicating the to-be-repeated values, - e.g. -1.0, 5.1, or 3.2. - The action space is the same as the observation space. - Rewards are `r=-abs([observation] - [action])`, for all steps. - """ - def __init__(self, config=None): - # Since actions should repeat observations, their spaces must be the same. - self.observation_space = gym.spaces.Box(-1.0, 1.0, (1,), np.float32) - self.action_space = self.observation_space - self._cur_obs = None - self._episode_len = 0 - - def reset(self, *, seed=None, options=None): - """Resets the environment, starting a new episode.""" - # Reset the episode len. - self._episode_len = 0 - # Sample a random number from our observation space. - self._cur_obs = self.observation_space.sample() - # Return initial observation. - return self._cur_obs, {} - - def step(self, action): - """Takes a single step in the episode given `action`.""" - # Set `terminated` and `truncated` flags to True after 10 steps. - self._episode_len += 1 - terminated = truncated = self._episode_len >= 10 - # Compute the reward: `r = -abs([obs] - [action])` - reward = -sum(abs(self._cur_obs - action)) - # Set a new observation (random sample). - self._cur_obs = self.observation_space.sample() - return self._cur_obs, reward, terminated, truncated, {} - - # Point your config to your custom env class: - config = ( - PPOConfig() - .environment(ParrotEnv) # add `env_config=[some Box space] to customize the env - ) - - # Build a PPO algorithm and train it. - ppo_w_custom_env = config.build_algo() - ppo_w_custom_env.train() - - .. testcode:: - :hide: - - # Test that our setup is working. - ppo_w_custom_env.stop() - - -Customizing your models -~~~~~~~~~~~~~~~~~~~~~~~ - -In the preceding examples, RLlib provided a default neural network model for you, because you didn't specify anything -in your AlgorithmConfig. If you would like to either reconfigure the type and size of RLlib's default models, for example define -the number of hidden layers and their activation functions, or even write your own custom models from scratch using PyTorch, see here -for a detailed guide on how to do so. - - -Deploying your models and computing actions -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The simplest way to programmatically compute actions from a trained :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` -is to get the :py:class:`~ray.rllib.core.rl_module.rl_module.RLModule` through :py:meth:`~ray.rllib.algorithms.algorithm.Algorithm.get_module`, -then call the module's :py:meth:`~ray.rllib.core.rl_module.rl_module.forward_inference` method. - -Here is an example of how to test a trained agent for one episode: - -.. 
testcode:: - - import gymnasium as gym - import numpy as np - import torch - from ray.rllib.core.rl_module import RLModule - - env = gym.make("CartPole-v1") - - # Get the RLModule from the up and running Algorithm instance: - rl_module = ppo.get_module() - - episode_return = 0 - terminated = truncated = False - - obs, info = env.reset() - - while not terminated and not truncated: - # Compute the next action from a batch (B=1) of observations. - obs_batch = torch.from_numpy(obs).unsqueeze(0) # add batch B=1 dimension - # Extract the logits from the output and dissolve batch again. - action_logits = rl_module.forward_inference({"obs": obs_batch})[ - "action_dist_inputs" - ][0] - # PPO's default RLModule produces action logits (from which - # you have to sample an action or use the max-likelihood one). - action = numpy.argmax(action_logits.numpy()) - # Send the action to the environment for the next step. - obs, reward, terminated, truncated, info = env.step(action) - episode_return += reward - - print(f"Reached episode return of {episode_return}.") - - -If you don't have your Algorithm instance up and running anymore and would like to create the trained RLModule -from a checkpoint, you can do the following instead. -Note that `best_checkpoint` is the highest performing Algorithm checkpoint you created -in the preceding experiment. To learn more about checkpoints and their structure, see this :ref:`checkpointing guide `. - -.. testcode:: - - from pathlib import Path - - # Create only the neural network (RLModule) from our checkpoint. - rl_module = RLModule.from_checkpoint( - Path(best_checkpoint.path) / "learner_group" / "learner" / "rl_module" - )["default_policy"] - - # Do the same computations with `rl_module` as in the preceding code snippet. - - -Accessing Policy State -~~~~~~~~~~~~~~~~~~~~~~ - -It is common to need to access a algorithm's internal state, for instance to set -or get model weights. - -In RLlib algorithm state is replicated across multiple *rollout workers* (Ray actors) -in the cluster. -However, you can easily get and update this state between calls to ``train()`` -via ``Algorithm.env_runner_group.foreach_worker()`` -or ``Algorithm.env_runner_group.foreach_worker_with_index()``. -These functions take a lambda function that is applied with the worker as an argument. -These functions return values for each worker as a list. - -You can also access just the "master" copy of the algorithm state through -``Algorithm.get_policy()`` or ``Algorithm.env_runner``, -but note that updates here may not be immediately reflected in -your rollout workers (if you have configured ``num_env_runners > 0``). -Here's a quick example of how to access state of a model: - -.. literalinclude:: ./doc_code/getting_started.py - :language: python - :start-after: rllib-get-state-begin - :end-before: rllib-get-state-end - -Accessing Model State -~~~~~~~~~~~~~~~~~~~~~ - -Similar to accessing policy state, you may want to get a reference to the -underlying neural network model being trained. For example, you may want to -pre-train it separately, or otherwise update its weights outside of RLlib. -This can be done by accessing the ``model`` of the policy. - -.. note:: - - To run these examples, you need to install a few extra dependencies, namely - `pip install "gym[atari]" "gym[accept-rom-license]" atari_py`. - -Below you find three explicit examples showing how to access the model state of -an algorithm. - -.. 
dropdown:: **Example: Preprocessing observations for feeding into a model** - - - Then for the code: - - .. literalinclude:: doc_code/training.py - :language: python - :start-after: __preprocessing_observations_start__ - :end-before: __preprocessing_observations_end__ - -.. dropdown:: **Example: Querying a policy's action distribution** - - .. literalinclude:: doc_code/training.py - :language: python - :start-after: __query_action_dist_start__ - :end-before: __query_action_dist_end__ - -.. dropdown:: **Example: Getting Q values from a DQN model** - - .. literalinclude:: doc_code/training.py - :language: python - :start-after: __get_q_values_dqn_start__ - :end-before: __get_q_values_dqn_end__ - - This is especially useful when used with - `custom model classes `__. - - -.. Debugging RLlib Experiments - --------------------------- - Eager Mode - ~~~~~~~~~~ - Policies built with ``build_tf_policy`` (most of the reference algorithms are) - can be run in eager mode by setting the - ``"framework": "tf2"`` / ``"eager_tracing": true`` config options. - This will tell RLlib to execute the model forward pass, action distribution, - loss, and stats functions in eager mode. - Eager mode makes debugging much easier, since you can now use line-by-line - debugging with breakpoints or Python ``print()`` to inspect - intermediate tensor values. - However, eager can be slower than graph mode unless tracing is enabled. - Episode Traces - ~~~~~~~~~~~~~~ - You can use the `data output API `__ to save episode traces - for debugging. For example, the following command will run PPO while saving episode - traces to ``/tmp/debug``. - .. code-block:: bash - cd rllib/tuned_examples/ppo - python cartpole_ppo.py --output /tmp/debug - # episode traces will be saved in /tmp/debug, for example - output-2019-02-23_12-02-03_worker-2_0.json - output-2019-02-23_12-02-04_worker-1_0.json -Log Verbosity -~~~~~~~~~~~~~ -You can control the log level via the ``"log_level"`` flag. Valid values are "DEBUG", -"INFO", "WARN" (default), and "ERROR". This can be used to increase or decrease the -verbosity of internal logging. -For example: - .. code-block:: bash - cd rllib/tuned_examples/ppo - python atari_ppo.py --env ALE/Pong-v5 --log-level INFO - python atari_ppo.py --env ALE/Pong-v5 --log-level DEBUG -The default log level is ``WARN``. We strongly recommend using at least ``INFO`` -level logging for development. -Stack Traces -~~~~~~~~~~~~ -You can use the ``ray stack`` command to dump the stack traces of all the -Python workers on a single node. This can be useful for debugging unexpected -hangs or performance issues. -Next Steps ----------- -- To check how your application is doing, you can use the :ref:`Ray dashboard `. + +

If you are not redirected automatically, follow this link.

From bcd594c44b4d85d1655b46bc16b2c5cb4a7de1ff Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sat, 18 Jan 2025 18:58:24 +0100 Subject: [PATCH 06/22] wip Signed-off-by: sven1977 --- doc/source/rllib/getting-started.rst | 121 +++++++++--------- .../rl_modules/classes/vpg_torch_rlm.py | 2 +- 2 files changed, 65 insertions(+), 58 deletions(-) diff --git a/doc/source/rllib/getting-started.rst b/doc/source/rllib/getting-started.rst index f280185cc4e6..d7f909f954e2 100644 --- a/doc/source/rllib/getting-started.rst +++ b/doc/source/rllib/getting-started.rst @@ -256,14 +256,22 @@ algorithms and their models: best_checkpoint = best_result.checkpoint -Deploy a trained model for production inference -+++++++++++++++++++++++++++++++++++++++++++++++ +Deploy a trained model for inference +++++++++++++++++++++++++++++++++++++ After training, you might want to deploy your models into a new environment, for example -to run inference in production. You can do so using the checkpoint directory created in the -preceding example. To read more about checkpoints, model deployments, and algorithm state restoration, +to run inference in production. For this purpose, you can use the checkpoint directory created +in the preceding example. To read more about checkpoints, model deployments, and restoring algorithm state, see this :ref:`page on checkpointing ` here. +Here is how you would create a new model instance from the checkpoint and run inference through +a single episode of your RL environment. Note in particular the use of the +:py:meth:`~ray.rllib.utils.checkpoints.Checkpointable.from_checkpoint` method to create +the model and the +:py:meth:`~ray.rllib.core.rl_module.rl_module.RLModule.forward_inference` +method to compute actions: + + .. testcode:: from pathlib import Path @@ -272,14 +280,13 @@ see this :ref:`page on checkpointing ` here. import torch from ray.rllib.core.rl_module import RLModule - # Create only the neural network (RLModule) from our checkpoint. + # Create only the neural network (RLModule) from our algorithm checkpoint. rl_module = RLModule.from_checkpoint( Path(best_checkpoint.path) / "learner_group" / "learner" / "rl_module" )["default_policy"] - # Create the RL environment to test against (same as was used for - # training earlier). - env = gym.make("Pendulum-v1") + # Create the RL environment to test against (same as was used for training earlier). + env = gym.make("Pendulum-v1", render_mode="human") episode_return = 0.0 done = False @@ -288,14 +295,14 @@ see this :ref:`page on checkpointing ` here. obs, info = env.reset() while not done: + # Render the env. + env.render() + # Compute the next action from a batch (B=1) of observations. obs_batch = torch.from_numpy(obs).unsqueeze(0) # add batch B=1 dimension - + model_outputs = rl_module.forward_inference({"obs": obs_batch}) # Extract the logits from the output and dissolve batch again. - action_logits = rl_module.forward_inference({"obs": obs_batch})[ - "action_dist_inputs" - ][0] - + action_logits = model_outputs["action_dist_inputs"][0] # PPO's default RLModule produces action logits (from which # you have to sample an action or use the max-likelihood one). action = numpy.argmax(action_logits.numpy()) @@ -310,8 +317,8 @@ see this :ref:`page on checkpointing ` here. 
print(f"Reached episode return of {episode_return}.") -If you still have an :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` instance up and running -in your script, you can also get the :py:class:`~ray.rllib.core.rl_module.rl_module.RLModule` through the +Alternatively, if you still have an :py:class:`~ray.rllib.algorithms.algorithm.Algorithm` instance up and running +in your script, you can get the :py:class:`~ray.rllib.core.rl_module.rl_module.RLModule` through the :py:meth:`~ray.rllib.algorithms.algorithm.Algorithm.get_module` method: .. testcode:: @@ -327,7 +334,8 @@ a `Farama gymnasium `__ pre-registered one, like ``Pendulum-v1`` or ``CartPole-v1``. However, if you would like to run your experiments against a custom one, see this tab below for a less-than-50-lines example. -See here for an :ref:`in-depth guide on how to setup RL environments in RLlib ` and how to customize them. +See here for an :ref:`in-depth guide on how to setup RL environments in RLlib ` +and how to customize them. .. dropdown:: Quickstart: Custom RL environment :animate: fade-in-slide-down @@ -394,51 +402,50 @@ See here for an :ref:`in-depth guide on how to setup RL environments in RLlib