ray-project · sven1977 · Dec 11, 2024 · Nov 4, 2024 · Nov 5, 2024 · Nov 5, 2024
@@ -186,6 +186,16 @@ GPU (for Training and Sampling)
    with performance improvements during evaluation.
 
 
+Hierarchical Training
++++++++++++++++++++++
+
+- `Hierarchical RL Training <https://github.com/ray-project/ray/blob/master/rllib/examples/hierarchical/hierarchical_training.py>`__:
+   Showcases a hierarchical RL setup inspired by automatic subgoal discovery and subpolicy specialization. A high-level policy selects subgoals and, for each one,
+   assigns one of three specialized low-level policies to achieve it within a time limit, which encourages specialization and efficient task-solving.
+   The agent has to navigate a complex grid-world environment. The example highlights the advantages of hierarchical
+   learning over flat approaches by demonstrating significantly improved learning performance in challenging, goal-oriented tasks.
+
+
 Inference (of Models/Policies)
 ++++++++++++++++++++++++++++++
 

@@ -2590,24 +2590,13 @@ py_test(
 
 # subdirectory: hierarchical/
 # ....................................
-#@OldAPIStack
 py_test(
-    name = "examples/hierarchical/hierarchical_training_tf",
+    name = "examples/hierarchical/hierarchical_training",
     main = "examples/hierarchical/hierarchical_training.py",
     tags = ["team:rllib", "exclusive", "examples"],
-    size = "medium",
-    srcs = ["examples/hierarchical/hierarchical_training.py"],
-    args = [ "--framework=tf", "--stop-reward=0.0"]
-)
-
-#@OldAPIStack
-py_test(
-    name = "examples/hierarchical/hierarchical_training_torch",
-    main = "examples/hierarchical/hierarchical_training.py",
-    tags = ["team:rllib", "exclusive", "examples"],
-    size = "medium",
+    size = "large",
     srcs = ["examples/hierarchical/hierarchical_training.py"],
-    args = ["--framework=torch", "--stop-reward=0.0"]
+    args = ["--enable-new-api-stack", "--as-test", "--stop-reward=4.0", "--map=large", "--time-limit=50"]
 )
 
 # subdirectory: inference/

diff --git a/rllib/env/multi_agent_env.py b/rllib/env/multi_agent_env.py
@@ -51,7 +51,7 @@ class MultiAgentEnv(gym.Env):
     # This attribute should not be changed during the lifetime of this env.
     possible_agents: List[AgentID] = []
 
-    # @OldAPIStack
+    # @OldAPIStack: Use `observation_spaces` and `action_spaces` instead.
     observation_space: Optional[gym.Space] = None
     action_space: Optional[gym.Space] = None
 

@@ -448,7 +448,7 @@ def add_env_step(
                     action_space=self.action_space.get(agent_id),
                 )
             else:
-                sa_episode = self.agent_episodes.get(agent_id)
+                sa_episode = self.agent_episodes[agent_id]
 
             # Collect value to be passed (at end of for-loop) into `add_env_step()`
             # call.
@@ -551,8 +551,8 @@ def add_env_step(
                     # duplicate the previous one (this is a technical "fix" to properly
                     # complete the single agent episode; this last observation is never
                     # used for learning anyway).
-                    _observation = sa_episode.get_observations(-1)
-                    _infos = sa_episode.get_infos(-1)
+                    _observation = sa_episode._last_added_observation
+                    _infos = sa_episode._last_added_infos
                 # Agent is still alive.
                 # [previous obs] [action] (hanging) ...
                 else:
@@ -595,8 +595,8 @@ def add_env_step(
                     # duplicate the previous one (this is a technical "fix" to properly
                     # complete the single agent episode; this last observation is never
                     # used for learning anyway).
-                    _observation = sa_episode.get_observations(-1)
-                    _infos = sa_episode.get_infos(-1)
+                    _observation = sa_episode._last_added_observation
+                    _infos = sa_episode._last_added_infos
                     # `_action` was already retrieved (via `get`) above. We don't need
                     # to pop it from the cache, as the cache gets wiped out below
                     # anyway b/c the agent is done.
@@ -1770,7 +1770,7 @@ def get_state(self) -> Dict[str, Any]:
             # TODO (simon): Check, if we can store the `InfiniteLookbackBuffer`
             "env_t_to_agent_t": self.env_t_to_agent_t,
             "_hanging_actions_end": self._hanging_actions_end,
-            "_hanging_extra_model_outputs_end": (self._hanging_extra_model_outputs_end),
+            "_hanging_extra_model_outputs_end": self._hanging_extra_model_outputs_end,
             "_hanging_rewards_end": self._hanging_rewards_end,
             "_hanging_actions_begin": self._hanging_actions_begin,
             "_hanging_extra_model_outputs_begin": (
@@ -2532,12 +2532,15 @@ def _get_single_agent_data_by_index(
                 # buffer, but a dict mapping keys to individual infinite lookback
                 # buffers.
                 if extra_model_outputs_key is None:
+                    assert hanging_val is None or isinstance(hanging_val, dict)
                     return {
                         key: sub_buffer.get(
                             indices=index_incl_lookback - sub_buffer.lookback,
                             neg_index_as_lookback=True,
                             fill=fill,
-                            _add_last_ts_value=hanging_val,
+                            _add_last_ts_value=(
+                                None if hanging_val is None else hanging_val[key]
+                            ),
                             **one_hot_discrete,
                         )
                         for key, sub_buffer in inf_lookback_buffer.items()

diff --git a/rllib/env/single_agent_episode.py b/rllib/env/single_agent_episode.py
@@ -163,6 +163,8 @@ class SingleAgentEpisode:
         "t",
         "t_started",
         "_action_space",
+        "_last_added_observation",
+        "_last_added_infos",
         "_last_step_time",
         "_observation_space",
         "_start_time",
@@ -346,6 +348,9 @@ def __init__(
         self._start_time = None
         self._last_step_time = None
 
+        self._last_added_observation = None
+        self._last_added_infos = None
+
         # Validate the episode data thus far.
         self.validate()
 
@@ -380,6 +385,9 @@ def add_env_reset(
         self.observations.append(observation)
         self.infos.append(infos)
 
+        self._last_added_observation = observation
+        self._last_added_infos = infos
+
         # Validate our data.
         self.validate()
 
@@ -434,6 +442,9 @@ def add_env_step(
         self.is_terminated = terminated
         self.is_truncated = truncated
 
+        self._last_added_observation = observation
+        self._last_added_infos = infos
+
         # Only check spaces if finalized AND every n timesteps.
         if self.is_finalized and self.t % 50:
             if self.observation_space is not None:

diff --git a/rllib/env/utils/infinite_lookback_buffer.py b/rllib/env/utils/infinite_lookback_buffer.py
@@ -533,9 +533,18 @@ def _get_int_index(
     ):
         data_to_use = self.data
         if _ignore_last_ts:
-            data_to_use = self.data[:-1]
+            if self.finalized:
+                data_to_use = tree.map_structure(lambda s: s[:-1], self.data)
+            else:
+                data_to_use = self.data[:-1]
         if _add_last_ts_value is not None:
-            data_to_use = np.append(data_to_use.copy(), _add_last_ts_value)
+            if self.finalized:
+                data_to_use = tree.map_structure(
+                    lambda s, last: np.append(s, last), data_to_use, _add_last_ts_value
+                )  # np.append(data_to_use.copy(), _add_last_ts_value)
+            else:
+                data_to_use = data_to_use.copy()
+                data_to_use.append(_add_last_ts_value)
 
         # If index >= 0 -> Ignore lookback buffer.
         # Otherwise, include lookback buffer.