From 04341e609f9e02019d26d1b024421b7689f1bc40 Mon Sep 17 00:00:00 2001 From: boris-il-forte Date: Mon, 9 Oct 2023 20:00:02 +0200 Subject: [PATCH 01/24] First implementation of the new dataset interface --- examples/car_on_hill_fqi.py | 5 +- examples/pendulum_a2c.py | 9 +- .../actor_critic/deep_actor_critic/a2c.py | 3 +- .../actor_critic/deep_actor_critic/ppo.py | 5 +- .../actor_critic/deep_actor_critic/trpo.py | 5 +- .../algorithms/value/batch_td/double_fqi.py | 2 - mushroom_rl/algorithms/value/batch_td/fqi.py | 3 +- mushroom_rl/core/__init__.py | 1 + mushroom_rl/core/_dataset_types/__init__.py | 1 + .../core/_dataset_types/numpy_dataset.py | 94 ++++++ mushroom_rl/core/core.py | 52 ++-- mushroom_rl/core/dataset.py | 282 ++++++++++++++++++ mushroom_rl/utils/dataset.py | 202 ------------- mushroom_rl/utils/spaces.py | 16 +- 14 files changed, 426 insertions(+), 254 deletions(-) create mode 100644 mushroom_rl/core/_dataset_types/__init__.py create mode 100644 mushroom_rl/core/_dataset_types/numpy_dataset.py create mode 100644 mushroom_rl/core/dataset.py diff --git a/examples/car_on_hill_fqi.py b/examples/car_on_hill_fqi.py index cef509223..534d716d3 100644 --- a/examples/car_on_hill_fqi.py +++ b/examples/car_on_hill_fqi.py @@ -6,7 +6,6 @@ from mushroom_rl.core import Core, Logger from mushroom_rl.environments import * from mushroom_rl.policy import EpsGreedy -from mushroom_rl.utils.dataset import compute_J from mushroom_rl.utils.parameters import Parameter """ @@ -65,7 +64,7 @@ def experiment(): # Render core.evaluate(n_episodes=3, render=True) - return np.mean(compute_J(dataset, mdp.info.gamma)) + return np.mean(dataset.discounted_return) if __name__ == '__main__': @@ -75,5 +74,5 @@ def experiment(): logger.strong_line() logger.info('Experiment Algorithm: ' + FQI.__name__) - Js = Parallel(n_jobs=-1)(delayed(experiment)() for _ in range(n_experiment)) + Js = Parallel(n_jobs=None)(delayed(experiment)() for _ in range(n_experiment)) logger.info((np.mean(Js))) diff --git a/examples/pendulum_a2c.py b/examples/pendulum_a2c.py index 58cf76fb3..e37582cef 100644 --- a/examples/pendulum_a2c.py +++ b/examples/pendulum_a2c.py @@ -11,7 +11,6 @@ from mushroom_rl.algorithms.actor_critic import A2C from mushroom_rl.policy import GaussianTorchPolicy -from mushroom_rl.utils.dataset import compute_J class Network(nn.Module): @@ -72,8 +71,8 @@ def experiment(alg, env_id, horizon, gamma, n_epochs, n_steps, n_steps_per_fit, dataset = core.evaluate(n_steps=n_step_test, render=False) - J = np.mean(compute_J(dataset, mdp.info.gamma)) - R = np.mean(compute_J(dataset)) + J = np.mean(dataset.discounted_return) + R = np.mean(dataset.undiscounted_return) E = agent.policy.entropy() logger.epoch_info(0, J=J, R=R, entropy=E) @@ -82,8 +81,8 @@ def experiment(alg, env_id, horizon, gamma, n_epochs, n_steps, n_steps_per_fit, core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit) dataset = core.evaluate(n_steps=n_step_test, render=False) - J = np.mean(compute_J(dataset, mdp.info.gamma)) - R = np.mean(compute_J(dataset)) + J = np.mean(dataset.discounted_return) + R = np.mean(dataset.undiscounted_return) E = agent.policy.entropy() logger.epoch_info(it+1, J=J, R=R, entropy=E) diff --git a/mushroom_rl/algorithms/actor_critic/deep_actor_critic/a2c.py b/mushroom_rl/algorithms/actor_critic/deep_actor_critic/a2c.py index a1c2dbc84..2cbf8f1de 100644 --- a/mushroom_rl/algorithms/actor_critic/deep_actor_critic/a2c.py +++ b/mushroom_rl/algorithms/actor_critic/deep_actor_critic/a2c.py @@ -4,7 +4,6 @@ from mushroom_rl.approximators 
import Regressor from mushroom_rl.approximators.parametric import TorchApproximator from mushroom_rl.utils.value_functions import compute_advantage_montecarlo -from mushroom_rl.utils.dataset import parse_dataset from mushroom_rl.utils.parameters import to_parameter from mushroom_rl.utils.torch import to_float_tensor @@ -59,7 +58,7 @@ def __init__(self, mdp_info, policy, actor_optimizer, critic_params, super().__init__(mdp_info, policy, actor_optimizer, policy.parameters()) def fit(self, dataset, **info): - state, action, reward, next_state, absorbing, _ = parse_dataset(dataset) + state, action, reward, next_state, absorbing, _ = dataset.parse() v, adv = compute_advantage_montecarlo(self._V, state, next_state, reward, absorbing, diff --git a/mushroom_rl/algorithms/actor_critic/deep_actor_critic/ppo.py b/mushroom_rl/algorithms/actor_critic/deep_actor_critic/ppo.py index bef5ddd7c..26e0bf13f 100644 --- a/mushroom_rl/algorithms/actor_critic/deep_actor_critic/ppo.py +++ b/mushroom_rl/algorithms/actor_critic/deep_actor_critic/ppo.py @@ -8,7 +8,6 @@ from mushroom_rl.approximators.parametric import TorchApproximator from mushroom_rl.utils.torch import to_float_tensor, update_optimizer_parameters from mushroom_rl.utils.minibatches import minibatch_generator -from mushroom_rl.utils.dataset import parse_dataset, compute_J from mushroom_rl.utils.value_functions import compute_gae from mushroom_rl.utils.parameters import to_parameter @@ -72,7 +71,7 @@ def __init__(self, mdp_info, policy, actor_optimizer, critic_params, super().__init__(mdp_info, policy, None) def fit(self, dataset, **info): - x, u, r, xn, absorbing, last = parse_dataset(dataset) + x, u, r, xn, absorbing, last = dataset.parse() x = x.astype(np.float32) u = u.astype(np.float32) r = r.astype(np.float32) @@ -124,7 +123,7 @@ def _log_info(self, dataset, x, v_target, old_pol_dist): new_pol_dist = self.policy.distribution(x) logging_kl = torch.mean(torch.distributions.kl.kl_divergence( new_pol_dist, old_pol_dist)) - avg_rwd = np.mean(compute_J(dataset)) + avg_rwd = np.mean(dataset.undiscounted_return) msg = "Iteration {}:\n\t\t\t\trewards {} vf_loss {}\n\t\t\t\tentropy {} kl {}".format( self._iter, avg_rwd, logging_verr, logging_ent, logging_kl) diff --git a/mushroom_rl/algorithms/actor_critic/deep_actor_critic/trpo.py b/mushroom_rl/algorithms/actor_critic/deep_actor_critic/trpo.py index df48a9564..e0ab6b7c2 100644 --- a/mushroom_rl/algorithms/actor_critic/deep_actor_critic/trpo.py +++ b/mushroom_rl/algorithms/actor_critic/deep_actor_critic/trpo.py @@ -9,7 +9,6 @@ from mushroom_rl.approximators import Regressor from mushroom_rl.approximators.parametric import TorchApproximator from mushroom_rl.utils.torch import get_gradient, zero_grad, to_float_tensor -from mushroom_rl.utils.dataset import parse_dataset, compute_J from mushroom_rl.utils.value_functions import compute_gae from mushroom_rl.utils.parameters import to_parameter @@ -83,7 +82,7 @@ def __init__(self, mdp_info, policy, critic_params, ent_coeff=0., max_kl=.001, l super().__init__(mdp_info, policy, None) def fit(self, dataset, **info): - state, action, reward, next_state, absorbing, last = parse_dataset(dataset) + state, action, reward, next_state, absorbing, last = dataset.parse() x = state.astype(np.float32) u = action.astype(np.float32) r = reward.astype(np.float32) @@ -214,7 +213,7 @@ def _log_info(self, dataset, x, v_target, old_pol_dist): logging_kl = torch.mean( torch.distributions.kl.kl_divergence(old_pol_dist, new_pol_dist) ) - avg_rwd = np.mean(compute_J(dataset)) + avg_rwd 
= np.mean(dataset.undiscounted_return) msg = "Iteration {}:\n\t\t\t\trewards {} vf_loss {}\n\t\t\t\tentropy {} kl {}".format( self._iter, avg_rwd, logging_verr, logging_ent, logging_kl) diff --git a/mushroom_rl/algorithms/value/batch_td/double_fqi.py b/mushroom_rl/algorithms/value/batch_td/double_fqi.py index f133b80be..f952af5a5 100644 --- a/mushroom_rl/algorithms/value/batch_td/double_fqi.py +++ b/mushroom_rl/algorithms/value/batch_td/double_fqi.py @@ -1,8 +1,6 @@ import numpy as np from tqdm import trange -from mushroom_rl.utils.dataset import parse_dataset - from .fqi import FQI diff --git a/mushroom_rl/algorithms/value/batch_td/fqi.py b/mushroom_rl/algorithms/value/batch_td/fqi.py index 97fa2316f..abb5cce2a 100644 --- a/mushroom_rl/algorithms/value/batch_td/fqi.py +++ b/mushroom_rl/algorithms/value/batch_td/fqi.py @@ -2,7 +2,6 @@ from tqdm import trange from mushroom_rl.algorithms.value.batch_td import BatchTD -from mushroom_rl.utils.dataset import parse_dataset from mushroom_rl.utils.parameters import to_parameter @@ -35,7 +34,7 @@ def __init__(self, mdp_info, policy, approximator, n_iterations, super().__init__(mdp_info, policy, approximator, approximator_params, fit_params) def fit(self, dataset, **info): - state, action, reward, next_state, absorbing, _ = parse_dataset(dataset) + action, reward, next_state, absorbing, _ = dataset.parse() for _ in trange(self._n_iterations(), dynamic_ncols=True, disable=self._quiet, leave=False): if self._target is None: self._target = reward diff --git a/mushroom_rl/core/__init__.py b/mushroom_rl/core/__init__.py index ebf9082db..6ac45aa64 100644 --- a/mushroom_rl/core/__init__.py +++ b/mushroom_rl/core/__init__.py @@ -1,4 +1,5 @@ from .core import Core +from .dataset import Dataset from .environment import Environment, MDPInfo from .agent import Agent from .serialization import Serializable diff --git a/mushroom_rl/core/_dataset_types/__init__.py b/mushroom_rl/core/_dataset_types/__init__.py new file mode 100644 index 000000000..177fe69de --- /dev/null +++ b/mushroom_rl/core/_dataset_types/__init__.py @@ -0,0 +1 @@ +from .numpy_dataset import NumpyDataset \ No newline at end of file diff --git a/mushroom_rl/core/_dataset_types/numpy_dataset.py b/mushroom_rl/core/_dataset_types/numpy_dataset.py new file mode 100644 index 000000000..c54246fc1 --- /dev/null +++ b/mushroom_rl/core/_dataset_types/numpy_dataset.py @@ -0,0 +1,94 @@ +import numpy as np + +from mushroom_rl.core.serialization import Serializable + + +class NumpyDataset(Serializable): + def __init__(self, state_type, state_shape, action_type, action_shape, reward_shape): + flags_shape = (action_shape[0],) + + self._state_type = state_type + self._action_type = action_type + + self._states = np.empty(state_shape, dtype=self._state_type) + self._actions = np.empty(action_shape, dtype=self._action_type) + self._rewards = np.empty(reward_shape, dtype=float) + self._next_states = np.empty(state_shape, dtype=self._state_type) + self._absorbing = np.empty(flags_shape, dtype=bool) + self._last = np.empty(flags_shape, dtype=bool) + self._len = 0 + + self._add_save_attr( + _states='numpy', + _actions='numpy', + _rewards='numpy', + _next_states='numpy', + _absorbing='numpy', + _last='numpy', + _len='primitive' + ) + + def __len__(self): + return self._len + + def append(self, state, action, reward, next_state, absorbing, last): + i = self._len + + self._states[i] = state + self._actions[i] = action + self._rewards[i] = reward + self._next_states[i] = next_state + self._absorbing[i] = absorbing + 
self._last[i] = last + + self._len += 1 + + def clear(self): + self._states = np.empty_like(self._states) + self._actions = np.empty_like(self._actions) + self._rewards = np.empty_like(self._rewards) + self._next_states = np.empty_like(self._next_states) + self._absorbing = np.empty_like(self._absorbing) + self._last = np.empty_like(self._last) + self._len = 0 + + def __getitem__(self, index): + return self._states[index], self._actions[index], self._rewards[index], self._next_states[index], \ + self._absorbing[index], self._last[index] + + def __add__(self, other): + result = self.copy() + + result._states = np.concatenate((self.state, other.state)) + result._actions = np.concatenate((self.action, other.action)) + result._rewards = np.concatenate((self.reward, other.reward)) + result._next_states = np.concatenate((self.next_state, other.next_state)) + result._absorbing = np.concatenate((self.absorbing, other.absorbing)) + result._last = np.concatenate((self.last, other.last)) + result._len = len(self) + len(other) + + return result + + @property + def state(self): + return self._states + + @property + def action(self): + return self._actions + + @property + def reward(self): + return self._rewards + + @property + def next_state(self): + return self._next_states + + @property + def absorbing(self): + return self._absorbing + + @property + def last(self): + return self._last diff --git a/mushroom_rl/core/core.py b/mushroom_rl/core/core.py index 22d6e20e3..759e4edf7 100644 --- a/mushroom_rl/core/core.py +++ b/mushroom_rl/core/core.py @@ -1,6 +1,6 @@ from tqdm import tqdm -from collections import defaultdict +from mushroom_rl.core.dataset import Dataset from mushroom_rl.utils.record import VideoRecorder @@ -72,12 +72,14 @@ def learn(self, n_steps=None, n_episodes=None, n_steps_per_fit=None, if n_steps_per_fit is not None: fit_condition = lambda: self._current_steps_counter >= self._n_steps_per_fit else: - fit_condition = lambda: self._current_episodes_counter >= self._n_episodes_per_fit + fit_condition = lambda: self._current_episodes_counter >= self._n_episodes_per_fit - self._run(n_steps, n_episodes, fit_condition, render, quiet, record, get_env_info=False) + dataset = Dataset(self.mdp.info, self._n_steps_per_fit, self._n_episodes_per_fit) + + self._run(dataset, n_steps, n_episodes, fit_condition, render, quiet, record) def evaluate(self, initial_states=None, n_steps=None, n_episodes=None, - render=False, quiet=False, record=False, get_env_info=False): + render=False, quiet=False, record=False): """ This function moves the agent in the environment using its policy. The agent is moved for a provided number of steps, episodes, or from a set of initial states for the whole @@ -90,8 +92,7 @@ def evaluate(self, initial_states=None, n_steps=None, n_episodes=None, render (bool, False): whether to render the environment or not; quiet (bool, False): whether to show the progress bar or not; record (bool, False): whether to record a video of the environment or not. If True, also the render flag - should be set to True; - get_env_info (bool, False): whether to return the environment info list or not. + should be set to True. 
Returns: The collected dataset and, optionally, an extra dataset of @@ -102,14 +103,17 @@ def evaluate(self, initial_states=None, n_steps=None, n_episodes=None, fit_condition = lambda: False - return self._run(n_steps, n_episodes, fit_condition, render, quiet, record, get_env_info, initial_states) + n_episodes_dataset = len(initial_states) if initial_states is not None else n_episodes + dataset = Dataset(self.mdp.info, n_steps, n_episodes_dataset) - def _run(self, n_steps, n_episodes, fit_condition, render, quiet, record, get_env_info, initial_states=None): + return self._run(dataset, n_steps, n_episodes, fit_condition, render, quiet, record, initial_states) + + def _run(self, dataset, n_steps, n_episodes, fit_condition, render, quiet, record, initial_states=None): assert n_episodes is not None and n_steps is None and initial_states is None\ or n_episodes is None and n_steps is not None and initial_states is None\ or n_episodes is None and n_steps is None and initial_states is not None - self._n_episodes = len( initial_states) if initial_states is not None else n_episodes + self._n_episodes = len(initial_states) if initial_states is not None else n_episodes if n_steps is not None: move_condition = lambda: self._total_steps_counter < n_steps @@ -122,24 +126,18 @@ def _run(self, n_steps, n_episodes, fit_condition, render, quiet, record, get_en steps_progress_bar = tqdm(disable=True) episodes_progress_bar = tqdm(total=self._n_episodes, dynamic_ncols=True, disable=quiet, leave=False) - dataset, dataset_info = self._run_impl(move_condition, fit_condition, steps_progress_bar, episodes_progress_bar, - render, record, initial_states) + self._run_impl(dataset, move_condition, fit_condition, steps_progress_bar, episodes_progress_bar, render, + record, initial_states) - if get_env_info: - return dataset, dataset_info - else: - return dataset + return dataset - def _run_impl(self, move_condition, fit_condition, steps_progress_bar, episodes_progress_bar, render, record, - initial_states): + def _run_impl(self, dataset, move_condition, fit_condition, steps_progress_bar, episodes_progress_bar, render, + record, initial_states): self._total_episodes_counter = 0 self._total_steps_counter = 0 self._current_episodes_counter = 0 self._current_steps_counter = 0 - dataset = list() - dataset_info = defaultdict(list) - last = True while move_condition(): if last: @@ -147,7 +145,7 @@ def _run_impl(self, move_condition, fit_condition, steps_progress_bar, episodes_ sample, step_info = self._step(render, record) - self.callback_step([sample]) + self.callback_step(sample) self._total_steps_counter += 1 self._current_steps_counter += 1 @@ -158,21 +156,17 @@ def _run_impl(self, move_condition, fit_condition, steps_progress_bar, episodes_ self._current_episodes_counter += 1 episodes_progress_bar.update(1) - dataset.append(sample) - - for key, value in step_info.items(): - dataset_info[key].append(value) + dataset.append(sample, step_info) if fit_condition(): - self.agent.fit(dataset, **dataset_info) + self.agent.fit(dataset) self._current_episodes_counter = 0 self._current_steps_counter = 0 for c in self.callbacks_fit: c(dataset) - dataset = list() - dataset_info = defaultdict(list) + dataset.clear() last = sample[-1] @@ -185,8 +179,6 @@ def _run_impl(self, move_condition, fit_condition, steps_progress_bar, episodes_ steps_progress_bar.close() episodes_progress_bar.close() - return dataset, dataset_info - def _step(self, render, record): """ Single step. 
diff --git a/mushroom_rl/core/dataset.py b/mushroom_rl/core/dataset.py new file mode 100644 index 000000000..c6a49eb57 --- /dev/null +++ b/mushroom_rl/core/dataset.py @@ -0,0 +1,282 @@ +import numpy as np + +from collections import defaultdict + +from mushroom_rl.core.serialization import Serializable + +from mushroom_rl.core._dataset_types import NumpyDataset + + +class Dataset(Serializable): + def __init__(self, mdp_info, n_steps=None, n_episodes=None): + assert (n_steps is not None and n_episodes is None) or (n_steps is None and n_episodes is not None) + + if n_steps is not None: + n_samples = n_steps + else: + horizon = mdp_info.horizon + assert np.isfinite(horizon) + + n_samples = horizon * n_episodes + + state_shape = (n_samples,) + mdp_info.observation_space.shape + action_shape = (n_samples,) + mdp_info.action_space.shape + reward_shape = (n_samples,) + + state_type = mdp_info.observation_space.data_type + action_type = mdp_info.action_space.data_type + + self._info = defaultdict(list) + self._data = NumpyDataset(state_type, state_shape, action_type, action_shape, reward_shape) + self._gamma = mdp_info.gamma + + self._add_save_attr( + _info='mushroom', + _data='pickle', + _gamma='primitive' + ) + + def append(self, step, info): + self._data.append(*step[:6]) + self._append_info(info) + + def get_info(self, field, index=None): + if index is None: + return self._info[field] + else: + return self._info[field][index] + + def clear(self): + self._info = defaultdict(list) + self._data.clear() + + def __getitem__(self, index): + if index < len(self._data): + return self._data[index] + else: + raise IndexError + + def __add__(self, other): + result = self.copy() + + new_info = defaultdict(list) + for key in self._info.keys(): + new_info[key] = self._info[key] + other.info[key] + + result._info = new_info + result._data = self._data + other._data + + return result + + def __len__(self): + return len(self._data) + + @property + def state(self): + return self._data.state + + @property + def action(self): + return self._data.action + + @property + def reward(self): + return self._data.reward + + @property + def next_state(self): + return self._data.next_state + + @property + def absorbing(self): + return self._data.absorbing + + @property + def last(self): + return self._data.last + + @property + def episodes_length(self): + """ + Compute the length of each episode in the dataset. + + Args: + dataset (list): the dataset to consider. + + Returns: + A list of length of each episode in the dataset. + + """ + lengths = list() + l = 0 + for sample in self: + l += 1 + if sample[-1] == 1: + lengths.append(l) + l = 0 + + return lengths + + @property + def undiscounted_return(self): + return self.compute_J() + + @property + def discounted_return(self): + return self.compute_J(self._gamma) + + def parse(self): + """ + Return the dataset as set of arrays. + + Returns: + A tuple containing the arrays that define the dataset, i.e. state, action, next state, absorbing and last + + """ + return self.state, self.action, self.reward, self.next_state, self.absorbing, self.last + + def select_first_episodes(self, n_episodes, parse=False): + """ + Return the first ``n_episodes`` episodes in the provided dataset. + + Args: + dataset (list): the dataset to consider; + n_episodes (int): the number of episodes to pick from the dataset; + parse (bool, False): whether to parse the dataset to return. + + Returns: + A subset of the dataset containing the first ``n_episodes`` episodes. 
+ + """ + raise NotImplementedError + + # assert n_episodes >= 0, 'Number of episodes must be greater than or equal' \ + # 'to zero.' + # if n_episodes == 0: + # return np.array([[]]) + # + # dataset = np.array(dataset, dtype=object) + # last_idxs = np.argwhere(dataset[:, -1] == 1).ravel() + # sub_dataset = dataset[:last_idxs[n_episodes - 1] + 1, :] + # + # return sub_dataset if not parse else parse_dataset(sub_dataset) + + def select_random_samples(self, n_samples, parse=False): + """ + Return the randomly picked desired number of samples in the provided + dataset. + + Args: + dataset (list): the dataset to consider; + n_samples (int): the number of samples to pick from the dataset; + parse (bool, False): whether to parse the dataset to return. + + Returns: + A subset of the dataset containing randomly picked ``n_samples`` + samples. + + """ + raise NotImplementedError + + # assert n_samples >= 0, 'Number of samples must be greater than or equal' \ + # 'to zero.' + # if n_samples == 0: + # return np.array([[]]) + # + # dataset = np.array(self, dtype=object) + # idxs = np.random.randint(dataset.shape[0], size=n_samples) + + #sub_dataset = dataset[idxs, ...] + + #if parse: + # return sub_dataset + # else: + # return sub_dataset if not parse else + + def get_init_states(self): + """ + Get the initial states of a dataset + + Args: + dataset (list): the dataset to consider. + + Returns: + An array of initial states of the considered dataset. + + """ + pick = True + x_0 = list() + for d in self: + if pick: + # if isinstance(d[0], LazyFrames): #FIXME LazyFrames + # x_0.append(np.array(d[0])) + # else: + x_0.append(d[0]) + pick = d[-1] + return np.array(x_0) + + def compute_J(self, gamma=1.): + """ + Compute the cumulative discounted reward of each episode in the dataset. + + Args: + dataset (list): the dataset to consider; + gamma (float, 1.): discount factor. + + Returns: + The cumulative discounted reward of each episode in the dataset. + + """ + js = list() + + j = 0. + episode_steps = 0 + for i in range(len(self)): + j += gamma ** episode_steps * self.reward[i] + episode_steps += 1 + if self.last[i] or i == len(self) - 1: + js.append(j) + j = 0. + episode_steps = 0 + + if len(js) == 0: + return [0.] + return js + + def compute_metrics(self, gamma=1.): + """ + Compute the metrics of each complete episode in the dataset. + + Args: + dataset (list): the dataset to consider; + gamma (float, 1.): the discount factor. + + Returns: + The minimum score reached in an episode, + the maximum score reached in an episode, + the mean score reached, + the median score reached, + the number of completed episodes. + + If no episode has been completed, it returns 0 for all values. + + """ + for i in reversed(range(len(self))): + if self.last[i]: + i += 1 + break + + dataset = self[:i] + + if len(dataset) > 0: + J = self.compute_J(gamma) + return np.min(J), np.max(J), np.mean(J), np.median(J), len(J) + else: + return 0, 0, 0, 0, 0 + + def _append_info(self, step_info): + for key, value in step_info.items(): + self._info[key].append(value) + + + diff --git a/mushroom_rl/utils/dataset.py b/mushroom_rl/utils/dataset.py index 5c8acfb6b..3bcf3321a 100644 --- a/mushroom_rl/utils/dataset.py +++ b/mushroom_rl/utils/dataset.py @@ -3,52 +3,6 @@ from mushroom_rl.utils.frames import LazyFrames -def parse_dataset(dataset, features=None): - """ - Split the dataset in its different components and return them. - - Args: - dataset (list): the dataset to parse; - features (object, None): features to apply to the states. 
- - Returns: - The np.ndarray of state, action, reward, next_state, absorbing flag and - last step flag. Features are applied to ``state`` and ``next_state``, - when provided. - - """ - assert len(dataset) > 0 - - shape = dataset[0][0].shape if features is None else (features.size,) - - state = np.ones((len(dataset),) + shape) - action = np.ones((len(dataset),) + dataset[0][1].shape) - reward = np.ones(len(dataset)) - next_state = np.ones((len(dataset),) + shape) - absorbing = np.ones(len(dataset)) - last = np.ones(len(dataset)) - - if features is not None: - for i in range(len(dataset)): - state[i, ...] = features(dataset[i][0]) - action[i, ...] = dataset[i][1] - reward[i] = dataset[i][2] - next_state[i, ...] = features(dataset[i][3]) - absorbing[i] = dataset[i][4] - last[i] = dataset[i][5] - else: - for i in range(len(dataset)): - state[i, ...] = dataset[i][0] - action[i, ...] = dataset[i][1] - reward[i] = dataset[i][2] - next_state[i, ...] = dataset[i][3] - absorbing[i] = dataset[i][4] - last[i] = dataset[i][5] - - return np.array(state), np.array(action), np.array(reward), np.array( - next_state), np.array(absorbing), np.array(last) - - def arrays_as_dataset(states, actions, rewards, next_states, absorbings, lasts): """ Creates a dataset of transitions from the provided arrays. @@ -77,159 +31,3 @@ def arrays_as_dataset(states, actions, rewards, next_states, absorbings, lasts): return dataset -def compute_episodes_length(dataset): - """ - Compute the length of each episode in the dataset. - - Args: - dataset (list): the dataset to consider. - - Returns: - A list of length of each episode in the dataset. - - """ - lengths = list() - l = 0 - for sample in dataset: - l += 1 - if sample[-1] == 1: - lengths.append(l) - l = 0 - - return lengths - - -def select_first_episodes(dataset, n_episodes, parse=False): - """ - Return the first ``n_episodes`` episodes in the provided dataset. - - Args: - dataset (list): the dataset to consider; - n_episodes (int): the number of episodes to pick from the dataset; - parse (bool, False): whether to parse the dataset to return. - - Returns: - A subset of the dataset containing the first ``n_episodes`` episodes. - - """ - assert n_episodes >= 0, 'Number of episodes must be greater than or equal' \ - 'to zero.' - if n_episodes == 0: - return np.array([[]]) - - dataset = np.array(dataset, dtype=object) - last_idxs = np.argwhere(dataset[:, -1] == 1).ravel() - sub_dataset = dataset[:last_idxs[n_episodes - 1] + 1, :] - - return sub_dataset if not parse else parse_dataset(sub_dataset) - - -def select_random_samples(dataset, n_samples, parse=False): - """ - Return the randomly picked desired number of samples in the provided - dataset. - - Args: - dataset (list): the dataset to consider; - n_samples (int): the number of samples to pick from the dataset; - parse (bool, False): whether to parse the dataset to return. - - Returns: - A subset of the dataset containing randomly picked ``n_samples`` - samples. - - """ - assert n_samples >= 0, 'Number of samples must be greater than or equal' \ - 'to zero.' - if n_samples == 0: - return np.array([[]]) - - dataset = np.array(dataset, dtype=object) - idxs = np.random.randint(dataset.shape[0], size=n_samples) - sub_dataset = dataset[idxs, ...] - - return sub_dataset if not parse else parse_dataset(sub_dataset) - - -def get_init_states(dataset): - """ - Get the initial states of a dataset - - Args: - dataset (list): the dataset to consider. - - Returns: - An array of initial states of the considered dataset. 
- - """ - pick = True - x_0 = list() - for d in dataset: - if pick: - if isinstance(d[0], LazyFrames): - x_0.append(np.array(d[0])) - else: - x_0.append(d[0]) - pick = d[-1] - return np.array(x_0) - - -def compute_J(dataset, gamma=1.): - """ - Compute the cumulative discounted reward of each episode in the dataset. - - Args: - dataset (list): the dataset to consider; - gamma (float, 1.): discount factor. - - Returns: - The cumulative discounted reward of each episode in the dataset. - - """ - js = list() - - j = 0. - episode_steps = 0 - for i in range(len(dataset)): - j += gamma ** episode_steps * dataset[i][2] - episode_steps += 1 - if dataset[i][-1] or i == len(dataset) - 1: - js.append(j) - j = 0. - episode_steps = 0 - - if len(js) == 0: - return [0.] - return js - - -def compute_metrics(dataset, gamma=1.): - """ - Compute the metrics of each complete episode in the dataset. - - Args: - dataset (list): the dataset to consider; - gamma (float, 1.): the discount factor. - - Returns: - The minimum score reached in an episode, - the maximum score reached in an episode, - the mean score reached, - the median score reached, - the number of completed episodes. - - If no episode has been completed, it returns 0 for all values. - - """ - for i in reversed(range(len(dataset))): - if dataset[i][-1]: - i += 1 - break - - dataset = dataset[:i] - - if len(dataset) > 0: - J = compute_J(dataset, gamma) - return np.min(J), np.max(J), np.mean(J), np.median(J), len(J) - else: - return 0, 0, 0, 0, 0 diff --git a/mushroom_rl/utils/spaces.py b/mushroom_rl/utils/spaces.py index c5e252cb7..990ba0fa8 100644 --- a/mushroom_rl/utils/spaces.py +++ b/mushroom_rl/utils/spaces.py @@ -9,7 +9,7 @@ class Box(Serializable): spaces. It is similar to the ``Box`` class in ``gym.spaces.box``. """ - def __init__(self, low, high, shape=None): + def __init__(self, low, high, shape=None, data_type=float): """ Constructor. @@ -26,6 +26,7 @@ def __init__(self, low, high, shape=None): of the i-th dimension; shape (np.ndarray, None): the dimension of the space. Must match the shape of ``low`` and ``high``, if they are np.ndarray. + data_type (class, float): the data type to be used. 
""" if shape is None: @@ -42,9 +43,12 @@ def __init__(self, low, high, shape=None): assert self._low.shape == self._high.shape + self._data_type = data_type + self._add_save_attr( _low='numpy', - _high='numpy' + _high='numpy', + _data_type='primitive' ) @property @@ -74,6 +78,10 @@ def shape(self): """ return self._shape + @property + def data_type(self): + return self._data_type + def _post_load(self): self._shape = self._low.shape @@ -117,3 +125,7 @@ def shape(self): """ return 1, + + @property + def data_type(self): + return int From 435d62f60029405c0467ca1c2182b5e542ee6e34 Mon Sep 17 00:00:00 2001 From: boris-il-forte Date: Mon, 9 Oct 2023 22:21:53 +0200 Subject: [PATCH 02/24] Fixing some issues in algorithms and test --- .../black_box_optimization/black_box_optimization.py | 3 +-- mushroom_rl/algorithms/value/batch_td/boosted_fqi.py | 4 +--- mushroom_rl/algorithms/value/batch_td/lspi.py | 8 +++++--- tests/algorithms/test_fqi.py | 3 +-- 4 files changed, 8 insertions(+), 10 deletions(-) diff --git a/mushroom_rl/algorithms/policy_search/black_box_optimization/black_box_optimization.py b/mushroom_rl/algorithms/policy_search/black_box_optimization/black_box_optimization.py index 7e2e288c1..895a2d8fa 100644 --- a/mushroom_rl/algorithms/policy_search/black_box_optimization/black_box_optimization.py +++ b/mushroom_rl/algorithms/policy_search/black_box_optimization/black_box_optimization.py @@ -1,7 +1,6 @@ import numpy as np from mushroom_rl.core import Agent -from mushroom_rl.utils.dataset import compute_J class BlackBoxOptimization(Agent): @@ -35,7 +34,7 @@ def episode_start(self): super().episode_start() def fit(self, dataset, **info): - Jep = compute_J(dataset, self.mdp_info.gamma) + Jep = dataset.compute_J(self.mdp_info.gamma) Jep = np.array(Jep) theta = np.array(self._theta_list) diff --git a/mushroom_rl/algorithms/value/batch_td/boosted_fqi.py b/mushroom_rl/algorithms/value/batch_td/boosted_fqi.py index ad12da032..de271fd93 100644 --- a/mushroom_rl/algorithms/value/batch_td/boosted_fqi.py +++ b/mushroom_rl/algorithms/value/batch_td/boosted_fqi.py @@ -1,8 +1,6 @@ import numpy as np from tqdm import trange -from mushroom_rl.utils.dataset import parse_dataset - from .fqi import FQI @@ -32,7 +30,7 @@ def __init__(self, mdp_info, policy, approximator, n_iterations, super().__init__(mdp_info, policy, approximator, n_iterations, approximator_params, fit_params, quiet) def fit(self, dataset, **info): - state, action, reward, next_state, absorbing, _ = parse_dataset(dataset) + state, action, reward, next_state, absorbing, _ = dataset.parse() for _ in trange(self._n_iterations(), dynamic_ncols=True, disable=self._quiet, leave=False): if self._target is None: self._target = reward diff --git a/mushroom_rl/algorithms/value/batch_td/lspi.py b/mushroom_rl/algorithms/value/batch_td/lspi.py index 7e28cb816..f618e1d47 100644 --- a/mushroom_rl/algorithms/value/batch_td/lspi.py +++ b/mushroom_rl/algorithms/value/batch_td/lspi.py @@ -3,7 +3,6 @@ from mushroom_rl.algorithms.value.batch_td import BatchTD from mushroom_rl.approximators.parametric import LinearApproximator from mushroom_rl.features import get_action_features -from mushroom_rl.utils.dataset import parse_dataset from mushroom_rl.utils.parameters import to_parameter @@ -30,8 +29,11 @@ def __init__(self, mdp_info, policy, approximator_params=None, approximator_params, fit_params, features) def fit(self, dataset, **info): - phi_state, action, reward, phi_next_state, absorbing, _ = parse_dataset( - dataset, self.phi) + state, action, reward, 
next_state, absorbing, _ = dataset.parse() + + phi_state = self.phi(state) + phi_next_state = self.phi(next_state) + phi_state_action = get_action_features(phi_state, action, self.mdp_info.action_space.n) diff --git a/tests/algorithms/test_fqi.py b/tests/algorithms/test_fqi.py index a92fc71a5..ca1911f3e 100644 --- a/tests/algorithms/test_fqi.py +++ b/tests/algorithms/test_fqi.py @@ -9,7 +9,6 @@ from mushroom_rl.core import Core from mushroom_rl.environments import * from mushroom_rl.policy import EpsGreedy -from mushroom_rl.utils.dataset import compute_J from mushroom_rl.utils.parameters import Parameter @@ -44,7 +43,7 @@ def learn(alg, alg_params): agent.policy.set_epsilon(test_epsilon) dataset = core.evaluate(n_episodes=2) - return agent, np.mean(compute_J(dataset, mdp.info.gamma)) + return agent, np.mean(dataset.compute_J(mdp.info.gamma)) def test_fqi(): From 20b22f90d81d379799c21110fbe2380bba935736 Mon Sep 17 00:00:00 2001 From: Boris_il_forte Date: Tue, 10 Oct 2023 00:14:33 +0200 Subject: [PATCH 03/24] Fixing more dataset tests --- .../algorithms/value/batch_td/double_fqi.py | 2 +- mushroom_rl/algorithms/value/batch_td/fqi.py | 2 +- mushroom_rl/algorithms/value/batch_td/lspi.py | 18 ++-- .../core/_dataset_types/numpy_dataset.py | 13 +++ mushroom_rl/core/dataset.py | 83 ++++++++++--------- tests/algorithms/test_fqi.py | 3 +- tests/core/test_core.py | 10 +-- tests/{utils => core}/test_dataset.py | 20 ++--- 8 files changed, 80 insertions(+), 71 deletions(-) rename tests/{utils => core}/test_dataset.py (78%) diff --git a/mushroom_rl/algorithms/value/batch_td/double_fqi.py b/mushroom_rl/algorithms/value/batch_td/double_fqi.py index f952af5a5..923b1184f 100644 --- a/mushroom_rl/algorithms/value/batch_td/double_fqi.py +++ b/mushroom_rl/algorithms/value/batch_td/double_fqi.py @@ -28,7 +28,7 @@ def fit(self, dataset, **info): half = len(dataset) // 2 for i in range(2): - s, a, r, ss, ab, _ = parse_dataset(dataset[i * half:(i + 1) * half]) + s, a, r, ss, ab, _ = dataset[i * half:(i + 1) * half].parse() state.append(s) action.append(a) reward.append(r) diff --git a/mushroom_rl/algorithms/value/batch_td/fqi.py b/mushroom_rl/algorithms/value/batch_td/fqi.py index abb5cce2a..6b6466a90 100644 --- a/mushroom_rl/algorithms/value/batch_td/fqi.py +++ b/mushroom_rl/algorithms/value/batch_td/fqi.py @@ -34,7 +34,7 @@ def __init__(self, mdp_info, policy, approximator, n_iterations, super().__init__(mdp_info, policy, approximator, approximator_params, fit_params) def fit(self, dataset, **info): - action, reward, next_state, absorbing, _ = dataset.parse() + state, action, reward, next_state, absorbing, _ = dataset.parse() for _ in trange(self._n_iterations(), dynamic_ncols=True, disable=self._quiet, leave=False): if self._target is None: self._target = reward diff --git a/mushroom_rl/algorithms/value/batch_td/lspi.py b/mushroom_rl/algorithms/value/batch_td/lspi.py index f618e1d47..9de6e3fe4 100644 --- a/mushroom_rl/algorithms/value/batch_td/lspi.py +++ b/mushroom_rl/algorithms/value/batch_td/lspi.py @@ -31,11 +31,10 @@ def __init__(self, mdp_info, policy, approximator_params=None, def fit(self, dataset, **info): state, action, reward, next_state, absorbing, _ = dataset.parse() - phi_state = self.phi(state) - phi_next_state = self.phi(next_state) + phi_state = np.array([self.phi(s) for s in state]) # TODO improve with vectorial inputs + phi_next_state = np.array([self.phi(ss) for ss in next_state]) - phi_state_action = get_action_features(phi_state, action, - self.mdp_info.action_space.n) + phi_state_action = 
get_action_features(phi_state, action, self.mdp_info.action_space.n) norm = np.inf while norm > self._epsilon(): @@ -44,14 +43,9 @@ def fit(self, dataset, **info): q *= 1 - absorbing.reshape(-1, 1) next_action = np.argmax(q, axis=1).reshape(-1, 1) - phi_next_state_next_action = get_action_features( - phi_next_state, - next_action, - self.mdp_info.action_space.n - ) - - tmp = phi_state_action - self.mdp_info.gamma *\ - phi_next_state_next_action + phi_next_state_next_action = get_action_features(phi_next_state, next_action, self.mdp_info.action_space.n) + + tmp = phi_state_action - self.mdp_info.gamma * phi_next_state_next_action A = phi_state_action.T.dot(tmp) b = (phi_state_action.T.dot(reward)).reshape(-1, 1) diff --git a/mushroom_rl/core/_dataset_types/numpy_dataset.py b/mushroom_rl/core/_dataset_types/numpy_dataset.py index c54246fc1..cb579409e 100644 --- a/mushroom_rl/core/_dataset_types/numpy_dataset.py +++ b/mushroom_rl/core/_dataset_types/numpy_dataset.py @@ -52,6 +52,19 @@ def clear(self): self._last = np.empty_like(self._last) self._len = 0 + def get_view(self, index): + view = self.copy() + + view._states = self._states[index, ...] + view._actions = self._actions[index, ...] + view._rewards = self._rewards[index, ...] + view._next_states = self._next_states[index, ...] + view._absorbing = self._absorbing[index, ...] + view._last = self._last[index, ...] + view._len = self._states.shape[0] + + return view + def __getitem__(self, index): return self._states[index], self._actions[index], self._rewards[index], self._next_states[index], \ self._absorbing[index], self._last[index] diff --git a/mushroom_rl/core/dataset.py b/mushroom_rl/core/dataset.py index c6a49eb57..2a4e438b4 100644 --- a/mushroom_rl/core/dataset.py +++ b/mushroom_rl/core/dataset.py @@ -50,8 +50,22 @@ def clear(self): self._info = defaultdict(list) self._data.clear() + def get_view(self, index): + dataset = self.copy() + + info_slice = defaultdict(list) + for key in self._info.keys(): + info_slice[key] = self._info[key][index] + + dataset._info = info_slice + dataset._data = self._data.get_view(index) + + return dataset + def __getitem__(self, index): - if index < len(self._data): + if isinstance(index, slice): + return self.get_view(self, index) + elif isinstance(index, int) and index < len(self._data): return self._data[index] else: raise IndexError @@ -95,6 +109,10 @@ def absorbing(self): def last(self): return self._data.last + @property + def info(self): + return self._info + @property def episodes_length(self): """ @@ -125,73 +143,61 @@ def undiscounted_return(self): def discounted_return(self): return self.compute_J(self._gamma) - def parse(self): + def parse(self, index=None): """ Return the dataset as set of arrays. + Args (index, [int, slice]): index or slicee of dataset to be selected + Returns: A tuple containing the arrays that define the dataset, i.e. state, action, next state, absorbing and last """ - return self.state, self.action, self.reward, self.next_state, self.absorbing, self.last + if index is None: + return self.state, self.action, self.reward, self.next_state, self.absorbing, self.last + else: + return self.state[index], self.action[index], self.reward[index], self.next_state[index], \ + self.absorbing[index], self.last[index] - def select_first_episodes(self, n_episodes, parse=False): + def select_first_episodes(self, n_episodes): """ Return the first ``n_episodes`` episodes in the provided dataset. 
Args: dataset (list): the dataset to consider; n_episodes (int): the number of episodes to pick from the dataset; - parse (bool, False): whether to parse the dataset to return. Returns: A subset of the dataset containing the first ``n_episodes`` episodes. """ - raise NotImplementedError - - # assert n_episodes >= 0, 'Number of episodes must be greater than or equal' \ - # 'to zero.' - # if n_episodes == 0: - # return np.array([[]]) - # - # dataset = np.array(dataset, dtype=object) - # last_idxs = np.argwhere(dataset[:, -1] == 1).ravel() - # sub_dataset = dataset[:last_idxs[n_episodes - 1] + 1, :] - # - # return sub_dataset if not parse else parse_dataset(sub_dataset) - - def select_random_samples(self, n_samples, parse=False): + assert n_episodes > 0, 'Number of episodes must be greater than zero.' + + last_idxs = np.argwhere(self.last is True).ravel() + return self[:last_idxs[n_episodes - 1] + 1] + + def select_random_samples(self, n_samples): """ Return the randomly picked desired number of samples in the provided dataset. Args: dataset (list): the dataset to consider; - n_samples (int): the number of samples to pick from the dataset; - parse (bool, False): whether to parse the dataset to return. + n_samples (int): the number of samples to pick from the dataset. Returns: A subset of the dataset containing randomly picked ``n_samples`` samples. """ - raise NotImplementedError + assert n_samples >= 0, 'Number of samples must be greater than or equal to zero.' - # assert n_samples >= 0, 'Number of samples must be greater than or equal' \ - # 'to zero.' - # if n_samples == 0: - # return np.array([[]]) - # - # dataset = np.array(self, dtype=object) - # idxs = np.random.randint(dataset.shape[0], size=n_samples) + if n_samples == 0: + return np.array([[]]) - #sub_dataset = dataset[idxs, ...] 
+ idxs = np.random.randint(len(self), size=n_samples) - #if parse: - # return sub_dataset - # else: - # return sub_dataset if not parse else + return self[idxs] def get_init_states(self): """ @@ -206,13 +212,10 @@ def get_init_states(self): """ pick = True x_0 = list() - for d in self: + for step in self: if pick: - # if isinstance(d[0], LazyFrames): #FIXME LazyFrames - # x_0.append(np.array(d[0])) - # else: - x_0.append(d[0]) - pick = d[-1] + x_0.append(step[0]) + pick = step[-1] return np.array(x_0) def compute_J(self, gamma=1.): diff --git a/tests/algorithms/test_fqi.py b/tests/algorithms/test_fqi.py index ca1911f3e..253e4b127 100644 --- a/tests/algorithms/test_fqi.py +++ b/tests/algorithms/test_fqi.py @@ -30,8 +30,7 @@ def learn(alg, alg_params): approximator = ExtraTreesRegressor # Agent - agent = alg(mdp.info, pi, approximator, - approximator_params=approximator_params, **alg_params) + agent = alg(mdp.info, pi, approximator, approximator_params=approximator_params, **alg_params) # Algorithm core = Core(agent, mdp) diff --git a/tests/core/test_core.py b/tests/core/test_core.py index da821fc05..20b44168d 100644 --- a/tests/core/test_core.py +++ b/tests/core/test_core.py @@ -35,13 +35,13 @@ def test_core(): core.learn(n_steps=100, n_steps_per_fit=1) - dataset, info = core.evaluate(n_steps=20, get_env_info=True) + dataset = core.evaluate(n_steps=20) - assert 'lives' in info - assert 'episode_frame_number' in info - assert 'frame_number' in info + assert 'lives' in dataset.info + assert 'episode_frame_number' in dataset.info + assert 'frame_number' in dataset.info - info_lives = np.array(info['lives']) + info_lives = np.array(dataset.info['lives']) print(info_lives) lives_gt = np.array([5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]) diff --git a/tests/utils/test_dataset.py b/tests/core/test_dataset.py similarity index 78% rename from tests/utils/test_dataset.py rename to tests/core/test_dataset.py index 37dce8581..c70740fea 100644 --- a/tests/utils/test_dataset.py +++ b/tests/core/test_dataset.py @@ -7,7 +7,7 @@ from mushroom_rl.utils.dataset import * -def test_dataset_utils(): +def test_dataset(): np.random.seed(88) mdp = GridWorld(3, 3, (2,2)) @@ -20,25 +20,25 @@ def test_dataset_utils(): dataset = core.evaluate(n_episodes=10) - J = compute_J(dataset, mdp.info.gamma) + J = dataset.compute_J(mdp.info.gamma) J_test = np.array([1.16106307e-03, 2.78128389e-01, 1.66771817e+00, 3.09031544e-01, 1.19725152e-01, 9.84770902e-01, 1.06111661e-02, 2.05891132e+00, 2.28767925e+00, 4.23911583e-01]) assert np.allclose(J, J_test) - L = compute_episodes_length(dataset) + L = dataset.episodes_length L_test = np.array([87, 35, 18, 34, 43, 23, 66, 16, 15, 31]) assert np.array_equal(L, L_test) - dataset_ep = select_first_episodes(dataset, 3) - J = compute_J(dataset_ep, mdp.info.gamma) + dataset_ep = dataset.select_first_episodes(3) + J = dataset_ep.compute_J(mdp.info.gamma) assert np.allclose(J, J_test[:3]) - L = compute_episodes_length(dataset_ep) + L = dataset_ep.episodes_length assert np.allclose(L, L_test[:3]) - samples = select_random_samples(dataset, 2) - s, a, r, ss, ab, last = parse_dataset(samples) + samples = dataset.select_random_samples(2) + s, a, r, ss, ab, last = samples.parse() s_test = np.array([[6.], [1.]]) a_test = np.array([[0.], [1.]]) r_test = np.zeros(2) @@ -52,12 +52,12 @@ def test_dataset_utils(): assert np.array_equal(ab, ab_test) assert np.array_equal(last, last_test) - s0 = get_init_states(dataset) + s0 = dataset.get_init_states() s0_test = np.zeros((10, 1)) assert 
np.array_equal(s0, s0_test) index = np.sum(L_test[:2]) + L_test[2]//2 - min_J, max_J, mean_J, median_J, n_episodes = compute_metrics(dataset[:index], mdp.info.gamma) + min_J, max_J, mean_J, median_J, n_episodes = dataset[:index].compute_metrics(mdp.info.gamma) assert min_J == 0.0011610630703530948 assert max_J == 0.2781283894436937 assert mean_J == 0.1396447262570234 From 90faba850b20d1011bee47defdf05a8af8326c6a Mon Sep 17 00:00:00 2001 From: boris-il-forte Date: Tue, 10 Oct 2023 10:52:56 +0200 Subject: [PATCH 04/24] Fixed errors in Dataset class - fixed most of the bugs - batch td are still crashing for some reason, need fix --- .../core/_dataset_types/numpy_dataset.py | 2 +- mushroom_rl/core/dataset.py | 35 ++++++++++++++++--- mushroom_rl/utils/dataset.py | 33 ----------------- tests/core/test_dataset.py | 6 ++-- 4 files changed, 36 insertions(+), 40 deletions(-) delete mode 100644 mushroom_rl/utils/dataset.py diff --git a/mushroom_rl/core/_dataset_types/numpy_dataset.py b/mushroom_rl/core/_dataset_types/numpy_dataset.py index cb579409e..fa99ae58c 100644 --- a/mushroom_rl/core/_dataset_types/numpy_dataset.py +++ b/mushroom_rl/core/_dataset_types/numpy_dataset.py @@ -61,7 +61,7 @@ def get_view(self, index): view._next_states = self._next_states[index, ...] view._absorbing = self._absorbing[index, ...] view._last = self._last[index, ...] - view._len = self._states.shape[0] + view._len = view._states.shape[0] return view diff --git a/mushroom_rl/core/dataset.py b/mushroom_rl/core/dataset.py index 2a4e438b4..d1a38ab28 100644 --- a/mushroom_rl/core/dataset.py +++ b/mushroom_rl/core/dataset.py @@ -36,6 +36,33 @@ def __init__(self, mdp_info, n_steps=None, n_episodes=None): _gamma='primitive' ) + @classmethod + def from_numpy(cls, states, actions, rewards, next_states, absorbings, lasts, gamma=0.99): + """ + Creates a dataset of transitions from the provided arrays. + + Args: + states (np.ndarray): array of states; + actions (np.ndarray): array of actions; + rewards (np.ndarray): array of rewards; + next_states (np.ndarray): array of next_states; + absorbings (np.ndarray): array of absorbing flags; + lasts (np.ndarray): array of last flags. + + Returns: + The list of transitions. + + """ + assert (len(states) == len(actions) == len(rewards) + == len(next_states) == len(absorbings) == len(lasts)) + + dataset = cls.__new__(cls) + dataset._gamma = gamma + dataset._info = defaultdict(list) + dataset._data = NumpyDataset.from_numpy(states, actions, rewards, next_states, absorbings, lasts, gamma) + + return dataset + def append(self, step, info): self._data.append(*step[:6]) self._append_info(info) @@ -63,8 +90,8 @@ def get_view(self, index): return dataset def __getitem__(self, index): - if isinstance(index, slice): - return self.get_view(self, index) + if isinstance(index, (slice, np.ndarray)): + return self.get_view(index) elif isinstance(index, int) and index < len(self._data): return self._data[index] else: @@ -173,7 +200,7 @@ def select_first_episodes(self, n_episodes): """ assert n_episodes > 0, 'Number of episodes must be greater than zero.' 
- last_idxs = np.argwhere(self.last is True).ravel() + last_idxs = np.argwhere(self.last==True).ravel() return self[:last_idxs[n_episodes - 1] + 1] def select_random_samples(self, n_samples): @@ -272,7 +299,7 @@ def compute_metrics(self, gamma=1.): dataset = self[:i] if len(dataset) > 0: - J = self.compute_J(gamma) + J = dataset.compute_J(gamma) return np.min(J), np.max(J), np.mean(J), np.median(J), len(J) else: return 0, 0, 0, 0, 0 diff --git a/mushroom_rl/utils/dataset.py b/mushroom_rl/utils/dataset.py deleted file mode 100644 index 3bcf3321a..000000000 --- a/mushroom_rl/utils/dataset.py +++ /dev/null @@ -1,33 +0,0 @@ -import numpy as np - -from mushroom_rl.utils.frames import LazyFrames - - -def arrays_as_dataset(states, actions, rewards, next_states, absorbings, lasts): - """ - Creates a dataset of transitions from the provided arrays. - - Args: - states (np.ndarray): array of states; - actions (np.ndarray): array of actions; - rewards (np.ndarray): array of rewards; - next_states (np.ndarray): array of next_states; - absorbings (np.ndarray): array of absorbing flags; - lasts (np.ndarray): array of last flags. - - Returns: - The list of transitions. - - """ - assert (len(states) == len(actions) == len(rewards) - == len(next_states) == len(absorbings) == len(lasts)) - - dataset = list() - for s, a, r, ss, ab, last in zip(states, actions, rewards, next_states, - absorbings.astype(bool), lasts.astype(bool) - ): - dataset.append((s, a, r.item(0), ss, ab.item(0), last.item(0))) - - return dataset - - diff --git a/tests/core/test_dataset.py b/tests/core/test_dataset.py index c70740fea..0fc641f70 100644 --- a/tests/core/test_dataset.py +++ b/tests/core/test_dataset.py @@ -1,11 +1,11 @@ +import numpy as np + from mushroom_rl.core import Core from mushroom_rl.algorithms.value import SARSA from mushroom_rl.environments import GridWorld from mushroom_rl.utils.parameters import Parameter from mushroom_rl.policy import EpsGreedy -from mushroom_rl.utils.dataset import * - def test_dataset(): np.random.seed(88) @@ -65,3 +65,5 @@ def test_dataset(): assert n_episodes == 2 +if __name__ == '__main__': + test_dataset() From 9835dec93340cddcb12cd7d90c5dd4307ebbb436 Mon Sep 17 00:00:00 2001 From: boris-il-forte Date: Tue, 10 Oct 2023 13:01:54 +0200 Subject: [PATCH 05/24] Fixed bug in parse dataset - now parse dataset works - all tests passing - more test required to check all functionality works --- .../core/_dataset_types/numpy_dataset.py | 42 ++++++++++++++++--- mushroom_rl/core/dataset.py | 8 +++- tests/core/test_dataset.py | 4 -- 3 files changed, 43 insertions(+), 11 deletions(-) diff --git a/mushroom_rl/core/_dataset_types/numpy_dataset.py b/mushroom_rl/core/_dataset_types/numpy_dataset.py index fa99ae58c..9de245c96 100644 --- a/mushroom_rl/core/_dataset_types/numpy_dataset.py +++ b/mushroom_rl/core/_dataset_types/numpy_dataset.py @@ -19,6 +19,35 @@ def __init__(self, state_type, state_shape, action_type, action_shape, reward_sh self._len = 0 self._add_save_attr( + _state_type='primitive', + _action_type='primitive', + _states='numpy', + _actions='numpy', + _rewards='numpy', + _next_states='numpy', + _absorbing='numpy', + _last='numpy', + _len='primitive' + ) + + @classmethod + def from_numpy(cls, states, actions, rewards, next_states, absorbings, lasts): + dataset = cls.__new__() + + dataset._state_type = states.dtype + dataset._action_type = actions.dtype + + dataset._states = states + dataset._actions = actions + dataset._rewards = rewards + dataset._next_states = next_states + dataset._absorbing = 
absorbings + dataset._last = lasts + dataset._len = len(lasts) + + dataset._add_save_attr( + _state_type='primitive', + _action_type='primitive', _states='numpy', _actions='numpy', _rewards='numpy', @@ -50,6 +79,7 @@ def clear(self): self._next_states = np.empty_like(self._next_states) self._absorbing = np.empty_like(self._absorbing) self._last = np.empty_like(self._last) + self._len = 0 def get_view(self, index): @@ -84,24 +114,24 @@ def __add__(self, other): @property def state(self): - return self._states + return self._states[:len(self)] @property def action(self): - return self._actions + return self._actions[:len(self)] @property def reward(self): - return self._rewards + return self._rewards[:len(self)] @property def next_state(self): - return self._next_states + return self._next_states[:len(self)] @property def absorbing(self): - return self._absorbing + return self._absorbing[:len(self)] @property def last(self): - return self._last + return self._last[:len(self)] diff --git a/mushroom_rl/core/dataset.py b/mushroom_rl/core/dataset.py index d1a38ab28..5359d581f 100644 --- a/mushroom_rl/core/dataset.py +++ b/mushroom_rl/core/dataset.py @@ -56,11 +56,17 @@ def from_numpy(cls, states, actions, rewards, next_states, absorbings, lasts, ga assert (len(states) == len(actions) == len(rewards) == len(next_states) == len(absorbings) == len(lasts)) - dataset = cls.__new__(cls) + dataset = cls.__new__() dataset._gamma = gamma dataset._info = defaultdict(list) dataset._data = NumpyDataset.from_numpy(states, actions, rewards, next_states, absorbings, lasts, gamma) + dataset._add_save_attr( + _info='mushroom', + _data='pickle', + _gamma='primitive' + ) + return dataset def append(self, step, info): diff --git a/tests/core/test_dataset.py b/tests/core/test_dataset.py index 0fc641f70..bb3f1d9bb 100644 --- a/tests/core/test_dataset.py +++ b/tests/core/test_dataset.py @@ -63,7 +63,3 @@ def test_dataset(): assert mean_J == 0.1396447262570234 assert median_J == 0.1396447262570234 assert n_episodes == 2 - - -if __name__ == '__main__': - test_dataset() From 20e9c3c52112c7e8d3654bb1ab6b0c99c7ca0f86 Mon Sep 17 00:00:00 2001 From: boris-il-forte Date: Tue, 10 Oct 2023 13:08:23 +0200 Subject: [PATCH 06/24] Using vectorial features --- mushroom_rl/algorithms/value/batch_td/lspi.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mushroom_rl/algorithms/value/batch_td/lspi.py b/mushroom_rl/algorithms/value/batch_td/lspi.py index 9de6e3fe4..6c741fa56 100644 --- a/mushroom_rl/algorithms/value/batch_td/lspi.py +++ b/mushroom_rl/algorithms/value/batch_td/lspi.py @@ -31,8 +31,8 @@ def __init__(self, mdp_info, policy, approximator_params=None, def fit(self, dataset, **info): state, action, reward, next_state, absorbing, _ = dataset.parse() - phi_state = np.array([self.phi(s) for s in state]) # TODO improve with vectorial inputs - phi_next_state = np.array([self.phi(ss) for ss in next_state]) + phi_state = self.phi(state) + phi_next_state = self.phi(next_state) phi_state_action = get_action_features(phi_state, action, self.mdp_info.action_space.n) From 8171f72fb589a1eabc16375399927eb409909c62 Mon Sep 17 00:00:00 2001 From: boris-il-forte Date: Tue, 10 Oct 2023 16:05:10 +0200 Subject: [PATCH 07/24] Fixed more dataset issues - all test are passing now - implemented list dataset, mostly working. 
- proper dataset conversion code is still needed --- mushroom_rl/core/__init__.py | 2 +- mushroom_rl/core/_dataset_types/__init__.py | 3 +- .../core/_dataset_types/list_dataset.py | 77 +++++++++++++++++++ mushroom_rl/core/dataset.py | 22 +++--- 4 files changed, 90 insertions(+), 14 deletions(-) create mode 100644 mushroom_rl/core/_dataset_types/list_dataset.py diff --git a/mushroom_rl/core/__init__.py b/mushroom_rl/core/__init__.py index 6ac45aa64..79e679afc 100644 --- a/mushroom_rl/core/__init__.py +++ b/mushroom_rl/core/__init__.py @@ -7,4 +7,4 @@ import mushroom_rl.environments -__all__ = ['Core', 'Environment', 'MDPInfo', 'Agent', 'Serializable', 'Logger'] +__all__ = ['Core', 'Dataset', 'Environment', 'MDPInfo', 'Agent', 'Serializable', 'Logger'] diff --git a/mushroom_rl/core/_dataset_types/__init__.py b/mushroom_rl/core/_dataset_types/__init__.py index 177fe69de..c3a754a10 100644 --- a/mushroom_rl/core/_dataset_types/__init__.py +++ b/mushroom_rl/core/_dataset_types/__init__.py @@ -1 +1,2 @@ -from .numpy_dataset import NumpyDataset \ No newline at end of file +from .numpy_dataset import NumpyDataset +from .list_dataset import ListDataset \ No newline at end of file diff --git a/mushroom_rl/core/_dataset_types/list_dataset.py b/mushroom_rl/core/_dataset_types/list_dataset.py new file mode 100644 index 000000000..8b5a266a1 --- /dev/null +++ b/mushroom_rl/core/_dataset_types/list_dataset.py @@ -0,0 +1,77 @@ +from copy import deepcopy + +import numpy as np + +from mushroom_rl.core.serialization import Serializable + + +class ListDataset(Serializable): + def __init__(self): + self._dataset = list() + + self._add_save_attr( + _dataset='pickle' + ) + + @classmethod + def from_numpy(cls, states, actions, rewards, next_states, absorbings, lasts): + dataset = cls() + + for s, a, r, ss, ab, last in zip(states, actions, rewards, next_states, + absorbings.astype(bool), lasts.astype(bool) + ): + dataset.append((s, a, r.item(0), ss, ab.item(0), last.item(0))) + + def __len__(self): + return len(self._dataset) + + def append(self, *step): + assert len(step) == 6 + step_copy = deepcopy(step) + self._dataset.append(step_copy) + + def clear(self): + self._dataset = list() + + def get_view(self, index): + view = self.copy() + + if isinstance(index, (int, slice)): + view._dataset = self._dataset[index] + else: + view._dataset = [self._dataset[i] for i in index] + + return view + + def __getitem__(self, index): + return self._dataset[index] + + def __add__(self, other): + result = self.copy() + result._dataset = self._dataset + other._dataset + + return result + + @property + def state(self): + return [step[0] for step in self._dataset] + + @property + def action(self): + return [step[1] for step in self._dataset] + + @property + def reward(self): + return [step[2] for step in self._dataset] + + @property + def next_state(self): + return [step[3] for step in self._dataset] + + @property + def absorbing(self): + return [step[4] for step in self._dataset] + + @property + def last(self): + return [step[5] for step in self._dataset] diff --git a/mushroom_rl/core/dataset.py b/mushroom_rl/core/dataset.py index 5359d581f..855a862be 100644 --- a/mushroom_rl/core/dataset.py +++ b/mushroom_rl/core/dataset.py @@ -4,7 +4,7 @@ from mushroom_rl.core.serialization import Serializable -from mushroom_rl.core._dataset_types import NumpyDataset +from mushroom_rl.core._dataset_types import * class Dataset(Serializable): @@ -28,6 +28,7 @@ def __init__(self, mdp_info, n_steps=None, n_episodes=None): self._info = 
defaultdict(list) self._data = NumpyDataset(state_type, state_shape, action_type, action_shape, reward_shape) + self._gamma = mdp_info.gamma self._add_save_attr( @@ -37,7 +38,7 @@ def __init__(self, mdp_info, n_steps=None, n_episodes=None): ) @classmethod - def from_numpy(cls, states, actions, rewards, next_states, absorbings, lasts, gamma=0.99): + def from_numpy(cls, states, actions, rewards, next_states, absorbings, lasts, info=None, gamma=0.99): """ Creates a dataset of transitions from the provided arrays. @@ -58,7 +59,10 @@ def from_numpy(cls, states, actions, rewards, next_states, absorbings, lasts, ga dataset = cls.__new__() dataset._gamma = gamma - dataset._info = defaultdict(list) + if info is None: + dataset._info = defaultdict(list) + else: + dataset._info = info.copy() dataset._data = NumpyDataset.from_numpy(states, actions, rewards, next_states, absorbings, lasts, gamma) dataset._add_save_attr( @@ -176,21 +180,15 @@ def undiscounted_return(self): def discounted_return(self): return self.compute_J(self._gamma) - def parse(self, index=None): + def parse(self): """ Return the dataset as set of arrays. - Args (index, [int, slice]): index or slicee of dataset to be selected - Returns: A tuple containing the arrays that define the dataset, i.e. state, action, next state, absorbing and last """ - if index is None: - return self.state, self.action, self.reward, self.next_state, self.absorbing, self.last - else: - return self.state[index], self.action[index], self.reward[index], self.next_state[index], \ - self.absorbing[index], self.last[index] + return self.state, self.action, self.reward, self.next_state, self.absorbing, self.last def select_first_episodes(self, n_episodes): """ @@ -206,7 +204,7 @@ def select_first_episodes(self, n_episodes): """ assert n_episodes > 0, 'Number of episodes must be greater than zero.' - last_idxs = np.argwhere(self.last==True).ravel() + last_idxs = np.argwhere(self.last).ravel() return self[:last_idxs[n_episodes - 1] + 1] def select_random_samples(self, n_samples): From 4ed423dd431a0c8517a6b86695ff1cf676aad3dd Mon Sep 17 00:00:00 2001 From: robfiras Date: Wed, 11 Oct 2023 15:28:17 +0200 Subject: [PATCH 08/24] Bugfix Mujoco camera. - initial mode couldn't be set for "static" and "top_static" - also "static" and "top_static" have the "lookat" parameter. 
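
A minimal usage sketch of the fixed behaviour (the keyword names below come from
this patch; the viewer class name and exact construction are assumptions and may
differ in your setup):

    import numpy as np

    # Camera parameters mirroring get_default_camera_params(); with this fix,
    # "static" and "top_static" also honour a "lookat" point, applied whenever
    # the camera mode is switched, and the initial mode can be any of the three.
    camera_params = dict(
        static=dict(distance=10.0, elevation=-30.0, azimuth=90.0,
                    lookat=np.array([1.0, 0.0, 0.5])),
        follow=dict(distance=3.5, elevation=0.0, azimuth=90.0),
        top_static=dict(distance=5.0, elevation=-90.0, azimuth=90.0,
                        lookat=np.array([1.0, 0.0, 0.0])),
    )

    # Hypothetical construction, assuming the signature shown in the diff below:
    # viewer = MujocoViewer(model, dt, camera_params=camera_params,
    #                       default_camera_mode="static")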
--- mushroom_rl/utils/mujoco/viewer.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mushroom_rl/utils/mujoco/viewer.py b/mushroom_rl/utils/mujoco/viewer.py index 1db075460..6c7ed3390 100644 --- a/mushroom_rl/utils/mujoco/viewer.py +++ b/mushroom_rl/utils/mujoco/viewer.py @@ -85,8 +85,8 @@ def __init__(self, model, dt, width=1920, height=1080, start_paused=False, self._camera_params = self._assert_camera_params(camera_params) self._all_camera_modes = ("static", "follow", "top_static") self._camera_mode_iter = cycle(self._all_camera_modes) - self._camera_mode = next(self._camera_mode_iter) - self._camera_mode_target = self._camera_mode + self._camera_mode = None + self._camera_mode_target = next(self._camera_mode_iter) assert default_camera_mode in self._all_camera_modes while self._camera_mode_target != default_camera_mode: self._camera_mode_target = next(self._camera_mode_iter) @@ -477,6 +477,8 @@ def _set_camera_properties(self, mode): self._camera.distance = cam_params["distance"] self._camera.elevation = cam_params["elevation"] self._camera.azimuth = cam_params["azimuth"] + if "lookat" in cam_params: + self._camera.lookat = np.array(cam_params["lookat"]) self._camera_mode = mode def _assert_camera_params(self, camera_params): @@ -526,6 +528,6 @@ def get_default_camera_params(): """ - return dict(static=dict(distance=15.0, elevation=-45.0, azimuth=90.0), + return dict(static=dict(distance=15.0, elevation=-45.0, azimuth=90.0, lookat=np.array([0.0, 0.0, 0.0])), follow=dict(distance=3.5, elevation=0.0, azimuth=90.0), - top_static=dict(distance=5.0, elevation=-90.0, azimuth=90.0)) + top_static=dict(distance=5.0, elevation=-90.0, azimuth=90.0, lookat=np.array([0.0, 0.0, 0.0]))) From 00fb703a2ad65beacdae08e78ca39a29e91c3c8e Mon Sep 17 00:00:00 2001 From: boris-il-forte Date: Thu, 12 Oct 2023 15:14:23 +0200 Subject: [PATCH 09/24] Work on dataset - renamed _dataset_types in _dataset_impl - added torch dataset backend - added backend flag in mdp_info - added automatic type conversion --- mushroom_rl/core/_dataset_impl/__init__.py | 4 + .../list_dataset.py | 4 +- .../numpy_dataset.py | 12 +- .../core/_dataset_impl/torch_dataset.py | 139 ++++++++++++++++++ .../core/_dataset_impl/type_conversions.py | 63 ++++++++ mushroom_rl/core/_dataset_types/__init__.py | 2 - mushroom_rl/core/dataset.py | 36 ++++- mushroom_rl/core/environment.py | 9 +- 8 files changed, 253 insertions(+), 16 deletions(-) create mode 100644 mushroom_rl/core/_dataset_impl/__init__.py rename mushroom_rl/core/{_dataset_types => _dataset_impl}/list_dataset.py (92%) rename mushroom_rl/core/{_dataset_types => _dataset_impl}/numpy_dataset.py (92%) create mode 100644 mushroom_rl/core/_dataset_impl/torch_dataset.py create mode 100644 mushroom_rl/core/_dataset_impl/type_conversions.py delete mode 100644 mushroom_rl/core/_dataset_types/__init__.py diff --git a/mushroom_rl/core/_dataset_impl/__init__.py b/mushroom_rl/core/_dataset_impl/__init__.py new file mode 100644 index 000000000..385d2bad9 --- /dev/null +++ b/mushroom_rl/core/_dataset_impl/__init__.py @@ -0,0 +1,4 @@ +from .numpy_dataset import NumpyDataset +from .torch_dataset import TorchDataset +from .list_dataset import ListDataset +from .type_conversions import DataConversion, NumpyConversion, TorchConversion, ListConversion \ No newline at end of file diff --git a/mushroom_rl/core/_dataset_types/list_dataset.py b/mushroom_rl/core/_dataset_impl/list_dataset.py similarity index 92% rename from mushroom_rl/core/_dataset_types/list_dataset.py rename to 
mushroom_rl/core/_dataset_impl/list_dataset.py index 8b5a266a1..1fc45bad3 100644 --- a/mushroom_rl/core/_dataset_types/list_dataset.py +++ b/mushroom_rl/core/_dataset_impl/list_dataset.py @@ -14,13 +14,13 @@ def __init__(self): ) @classmethod - def from_numpy(cls, states, actions, rewards, next_states, absorbings, lasts): + def from_array(cls, states, actions, rewards, next_states, absorbings, lasts): dataset = cls() for s, a, r, ss, ab, last in zip(states, actions, rewards, next_states, absorbings.astype(bool), lasts.astype(bool) ): - dataset.append((s, a, r.item(0), ss, ab.item(0), last.item(0))) + dataset.append((s, a, r.item(), ss, ab.item(), last.item())) def __len__(self): return len(self._dataset) diff --git a/mushroom_rl/core/_dataset_types/numpy_dataset.py b/mushroom_rl/core/_dataset_impl/numpy_dataset.py similarity index 92% rename from mushroom_rl/core/_dataset_types/numpy_dataset.py rename to mushroom_rl/core/_dataset_impl/numpy_dataset.py index 9de245c96..613e3fd71 100644 --- a/mushroom_rl/core/_dataset_types/numpy_dataset.py +++ b/mushroom_rl/core/_dataset_impl/numpy_dataset.py @@ -31,7 +31,15 @@ def __init__(self, state_type, state_shape, action_type, action_shape, reward_sh ) @classmethod - def from_numpy(cls, states, actions, rewards, next_states, absorbings, lasts): + def from_array(cls, states, actions, rewards, next_states, absorbings, lasts): + if not isinstance(states, np.ndarray): + states = states.numpy() + actions = states.numpy() + rewards = states.numpy() + next_states = states.numpy() + absorbings = absorbings.numpy() + lasts = lasts.numpy() + dataset = cls.__new__() dataset._state_type = states.dtype @@ -57,6 +65,8 @@ def from_numpy(cls, states, actions, rewards, next_states, absorbings, lasts): _len='primitive' ) + return dataset + def __len__(self): return self._len diff --git a/mushroom_rl/core/_dataset_impl/torch_dataset.py b/mushroom_rl/core/_dataset_impl/torch_dataset.py new file mode 100644 index 000000000..3ab4a5557 --- /dev/null +++ b/mushroom_rl/core/_dataset_impl/torch_dataset.py @@ -0,0 +1,139 @@ +import torch + +from mushroom_rl.core.serialization import Serializable + + +class TorchDataset(Serializable): + def __init__(self, state_type, state_shape, action_type, action_shape, reward_shape): + flags_len = action_shape[0] + + self._state_type = state_type + self._action_type = action_type + + self._states = torch.empty(*state_shape, dtype=self._state_type) + self._actions = torch.empty(*action_shape, dtype=self._action_type) + self._rewards = torch.empty(*reward_shape, dtype=torch.float) + self._next_states = torch.empty(*state_shape, dtype=self._state_type) + self._absorbing = torch.empty(flags_len, dtype=torch.bool) + self._last = torch.empty(flags_len, dtype=torch.bool) + self._len = 0 + + self._add_save_attr( + _state_type='primitive', + _action_type='primitive', + _states='torch', + _actions='torch', + _rewards='torch', + _next_states='torch', + _absorbing='torch', + _last='torch', + _len='primitive' + ) + + @classmethod + def from_array(cls, states, actions, rewards, next_states, absorbings, lasts): + dataset = cls.__new__() + + dataset._state_type = states.dtype + dataset._action_type = actions.dtype + + dataset._states = torch.as_tensor(states) + dataset._actions = torch.as_tensor(actions) + dataset._rewards = torch.as_tensor(rewards) + dataset._next_states = torch.as_tensor(next_states) + dataset._absorbing = torch.as_tensor(absorbings, dtype=torch.bool) + dataset._last = torch.as_tensor(lasts, dtype=torch.bool) + dataset._len = len(lasts) 
+ + dataset._add_save_attr( + _state_type='primitive', + _action_type='primitive', + _states='torch', + _actions='torch', + _rewards='torch', + _next_states='torch', + _absorbing='torch', + _last='torch', + _len='primitive' + ) + + return dataset + + def __len__(self): + return self._len + + def append(self, state, action, reward, next_state, absorbing, last): + i = self._len + + self._states[i] = state + self._actions[i] = action + self._rewards[i] = reward + self._next_states[i] = next_state + self._absorbing[i] = absorbing + self._last[i] = last + + self._len += 1 + + def clear(self): + self._states = torch.empty_like(self._states) + self._actions = torch.empty_like(self._actions) + self._rewards = torch.empty_like(self._rewards) + self._next_states = torch.empty_like(self._next_states) + self._absorbing = torch.empty_like(self._absorbing) + self._last = torch.empty_like(self._last) + + self._len = 0 + + def get_view(self, index): + view = self.copy() + + view._states = self._states[index, ...] + view._actions = self._actions[index, ...] + view._rewards = self._rewards[index, ...] + view._next_states = self._next_states[index, ...] + view._absorbing = self._absorbing[index, ...] + view._last = self._last[index, ...] + view._len = view._states.shape[0] + + return view + + def __getitem__(self, index): + return self._states[index], self._actions[index], self._rewards[index], self._next_states[index], \ + self._absorbing[index], self._last[index] + + def __add__(self, other): + result = self.copy() + + result._states = torch.concatenate((self.state, other.state)) + result._actions = torch.concatenate((self.action, other.action)) + result._rewards = torch.concatenate((self.reward, other.reward)) + result._next_states = torch.concatenate((self.next_state, other.next_state)) + result._absorbing = torch.concatenate((self.absorbing, other.absorbing)) + result._last = torch.concatenate((self.last, other.last)) + result._len = len(self) + len(other) + + return result + + @property + def state(self): + return self._states[:len(self)] + + @property + def action(self): + return self._actions[:len(self)] + + @property + def reward(self): + return self._rewards[:len(self)] + + @property + def next_state(self): + return self._next_states[:len(self)] + + @property + def absorbing(self): + return self._absorbing[:len(self)] + + @property + def last(self): + return self._last[:len(self)] diff --git a/mushroom_rl/core/_dataset_impl/type_conversions.py b/mushroom_rl/core/_dataset_impl/type_conversions.py new file mode 100644 index 000000000..911f50a1e --- /dev/null +++ b/mushroom_rl/core/_dataset_impl/type_conversions.py @@ -0,0 +1,63 @@ +import numpy +import torch + + +class DataConversion(object): + @classmethod + def convert(cls, *arrays, to='numpy'): + if to == 'numpy': + return cls.arrays_to_numpy(*arrays) + elif to == 'torch': + return cls.arrays_to_torch(*arrays) + else: + return NotImplementedError + + @classmethod + def arrays_to_numpy(cls, *arrays): + return (cls.to_numpy(array) for array in arrays) + + @classmethod + def arrays_to_torch(cls, *arrays): + return (cls.to_torch(array) for array in arrays) + + @staticmethod + def to_numpy(array): + return NotImplementedError + + @staticmethod + def to_torch(array): + raise NotImplementedError + + +class NumpyConversion(DataConversion): + @staticmethod + def to_numpy(array): + return array + + @staticmethod + def to_torch(array): + return torch.from_numpy(array) + + +class TorchConversion(DataConversion): + @staticmethod + def to_numpy(array): + return 
array.detach().cpu().numpy() + + @staticmethod + def to_torch(array): + return array + + +class ListConversion(DataConversion): + @staticmethod + def to_numpy(array): + return numpy.array(array) + + @staticmethod + def to_torch(array): + return torch.as_tensor(array) + + + + diff --git a/mushroom_rl/core/_dataset_types/__init__.py b/mushroom_rl/core/_dataset_types/__init__.py deleted file mode 100644 index c3a754a10..000000000 --- a/mushroom_rl/core/_dataset_types/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .numpy_dataset import NumpyDataset -from .list_dataset import ListDataset \ No newline at end of file diff --git a/mushroom_rl/core/dataset.py b/mushroom_rl/core/dataset.py index 855a862be..23c8f7c05 100644 --- a/mushroom_rl/core/dataset.py +++ b/mushroom_rl/core/dataset.py @@ -4,7 +4,7 @@ from mushroom_rl.core.serialization import Serializable -from mushroom_rl.core._dataset_types import * +from ._dataset_impl import * class Dataset(Serializable): @@ -27,18 +27,29 @@ def __init__(self, mdp_info, n_steps=None, n_episodes=None): action_type = mdp_info.action_space.data_type self._info = defaultdict(list) - self._data = NumpyDataset(state_type, state_shape, action_type, action_shape, reward_shape) + + if mdp_info.backend == 'numpy': + self._data = NumpyDataset(state_type, state_shape, action_type, action_shape, reward_shape) + self._converter = NumpyConversion + elif mdp_info.backend == 'torch': + self._data = TorchDataset(state_type, state_shape, action_type, action_shape, reward_shape) + self._converter = TorchConversion + else: + self._data = ListDataset() + self._converter = ListConversion self._gamma = mdp_info.gamma self._add_save_attr( - _info='mushroom', - _data='pickle', + _info='pickle', + _data='mushroom', + _converter='primitive', _gamma='primitive' ) @classmethod - def from_numpy(cls, states, actions, rewards, next_states, absorbings, lasts, info=None, gamma=0.99): + def from_array(cls, states, actions, rewards, next_states, absorbings, lasts, info=None, gamma=0.99, + backend='numpy'): """ Creates a dataset of transitions from the provided arrays. @@ -63,7 +74,13 @@ def from_numpy(cls, states, actions, rewards, next_states, absorbings, lasts, in dataset._info = defaultdict(list) else: dataset._info = info.copy() - dataset._data = NumpyDataset.from_numpy(states, actions, rewards, next_states, absorbings, lasts, gamma) + + if backend == 'numpy': + dataset._data = NumpyDataset.from_array(states, actions, rewards, next_states, absorbings, lasts) + elif backend == 'torch': + dataset._data = TorchDataset.from_array(states, actions, rewards, next_states, absorbings, lasts) + else: + dataset._data = ListDataset.from_array(states, actions, rewards, next_states, absorbings, lasts) dataset._add_save_attr( _info='mushroom', @@ -180,15 +197,18 @@ def undiscounted_return(self): def discounted_return(self): return self.compute_J(self._gamma) - def parse(self): + def parse(self, to='numpy'): """ Return the dataset as set of arrays. + to (str, numpy): the backend to be used for the returned arrays. + Returns: A tuple containing the arrays that define the dataset, i.e. 
state, action, next state, absorbing and last """ - return self.state, self.action, self.reward, self.next_state, self.absorbing, self.last + return self._converter.convert(self.state, self.action, self.reward, + self.next_state, self.absorbing, self.last, to=to) def select_first_episodes(self, n_episodes): """ diff --git a/mushroom_rl/core/environment.py b/mushroom_rl/core/environment.py index 7a2fa485b..e666585c9 100644 --- a/mushroom_rl/core/environment.py +++ b/mushroom_rl/core/environment.py @@ -9,7 +9,7 @@ class MDPInfo(Serializable): This class is used to store the information of the environment. """ - def __init__(self, observation_space, action_space, gamma, horizon, dt=1e-1): + def __init__(self, observation_space, action_space, gamma, horizon, dt=1e-1, backend='numpy'): """ Constructor. @@ -18,7 +18,8 @@ def __init__(self, observation_space, action_space, gamma, horizon, dt=1e-1): action_space ([Box, Discrete]): the action space; gamma (float): the discount factor; horizon (int): the horizon; - dt (float, 1e-1): the control timestep of the environment. + dt (float, 1e-1): the control timestep of the environment; + backend (str, 'numpy'): the type of data library used to generate state and actions. """ self.observation_space = observation_space @@ -26,13 +27,15 @@ def __init__(self, observation_space, action_space, gamma, horizon, dt=1e-1): self.gamma = gamma self.horizon = horizon self.dt = dt + self.backend = backend self._add_save_attr( observation_space='mushroom', action_space='mushroom', gamma='primitive', horizon='primitive', - dt='primitive' + dt='primitive', + backend='primitive' ) @property From de005cb2e9a038a503c4818ab216fb018ce1d9e2 Mon Sep 17 00:00:00 2001 From: boris-il-forte Date: Thu, 12 Oct 2023 15:48:27 +0200 Subject: [PATCH 10/24] Modernize on policy actor critic - using new dataset interface to get everything directly as torch - most numpy code and functions has been removed --- .../actor_critic/deep_actor_critic/a2c.py | 14 +---- .../actor_critic/deep_actor_critic/ppo.py | 27 ++++---- .../actor_critic/deep_actor_critic/trpo.py | 62 +++++++++---------- mushroom_rl/utils/value_functions.py | 48 +++++++------- 4 files changed, 68 insertions(+), 83 deletions(-) diff --git a/mushroom_rl/algorithms/actor_critic/deep_actor_critic/a2c.py b/mushroom_rl/algorithms/actor_critic/deep_actor_critic/a2c.py index 2cbf8f1de..fc6ae6aec 100644 --- a/mushroom_rl/algorithms/actor_critic/deep_actor_critic/a2c.py +++ b/mushroom_rl/algorithms/actor_critic/deep_actor_critic/a2c.py @@ -5,7 +5,6 @@ from mushroom_rl.approximators.parametric import TorchApproximator from mushroom_rl.utils.value_functions import compute_advantage_montecarlo from mushroom_rl.utils.parameters import to_parameter -from mushroom_rl.utils.torch import to_float_tensor from copy import deepcopy @@ -58,7 +57,7 @@ def __init__(self, mdp_info, policy, actor_optimizer, critic_params, super().__init__(mdp_info, policy, actor_optimizer, policy.parameters()) def fit(self, dataset, **info): - state, action, reward, next_state, absorbing, _ = dataset.parse() + state, action, reward, next_state, absorbing, _ = dataset.parse(to='torch') v, adv = compute_advantage_montecarlo(self._V, state, next_state, reward, absorbing, @@ -69,15 +68,8 @@ def fit(self, dataset, **info): self._optimize_actor_parameters(loss) def _loss(self, state, action, adv): - use_cuda = self.policy.use_cuda - - s = to_float_tensor(state, use_cuda) - a = to_float_tensor(action, use_cuda) - - adv_t = to_float_tensor(adv, use_cuda) - - gradient_loss = 
-torch.mean(self.policy.log_prob_t(s, a)*adv_t) - entropy_loss = -self.policy.entropy_t(s) + gradient_loss = -torch.mean(self.policy.log_prob_t(state, action)*adv) + entropy_loss = -self.policy.entropy_t(state) return gradient_loss + self._entropy_coeff() * entropy_loss diff --git a/mushroom_rl/algorithms/actor_critic/deep_actor_critic/ppo.py b/mushroom_rl/algorithms/actor_critic/deep_actor_critic/ppo.py index 26e0bf13f..13923348d 100644 --- a/mushroom_rl/algorithms/actor_critic/deep_actor_critic/ppo.py +++ b/mushroom_rl/algorithms/actor_critic/deep_actor_critic/ppo.py @@ -71,27 +71,24 @@ def __init__(self, mdp_info, policy, actor_optimizer, critic_params, super().__init__(mdp_info, policy, None) def fit(self, dataset, **info): - x, u, r, xn, absorbing, last = dataset.parse() - x = x.astype(np.float32) - u = u.astype(np.float32) - r = r.astype(np.float32) - xn = xn.astype(np.float32) + state, action, reward, next_state, absorbing, last = dataset.parse(to='torch') - obs = to_float_tensor(x, self.policy.use_cuda) - act = to_float_tensor(u, self.policy.use_cuda) - v_target, np_adv = compute_gae(self._V, x, xn, r, absorbing, last, self.mdp_info.gamma, self._lambda()) - np_adv = (np_adv - np.mean(np_adv)) / (np.std(np_adv) + 1e-8) - adv = to_float_tensor(np_adv, self.policy.use_cuda) + v_target, adv = compute_gae(self._V, state, next_state, reward, absorbing, last, + self.mdp_info.gamma, self._lambda()) + adv = (adv - torch.mean(adv)) / (torch.std(adv) + 1e-8) - old_pol_dist = self.policy.distribution_t(obs) - old_log_p = old_pol_dist.log_prob(act)[:, None].detach() + adv = adv.detach() + v_target = v_target.detach() - self._V.fit(x, v_target, **self._critic_fit_params) + old_pol_dist = self.policy.distribution_t(state) + old_log_p = old_pol_dist.log_prob(action)[:, None].detach() - self._update_policy(obs, act, adv, old_log_p) + self._V.fit(state, v_target, **self._critic_fit_params) + + self._update_policy(state, action, adv, old_log_p) # Print fit information - self._log_info(dataset, x, v_target, old_pol_dist) + self._log_info(dataset, state, v_target, old_pol_dist) self._iter += 1 def _update_policy(self, obs, act, adv, old_log_p): diff --git a/mushroom_rl/algorithms/actor_critic/deep_actor_critic/trpo.py b/mushroom_rl/algorithms/actor_critic/deep_actor_critic/trpo.py index e0ab6b7c2..c7b017a1d 100644 --- a/mushroom_rl/algorithms/actor_critic/deep_actor_critic/trpo.py +++ b/mushroom_rl/algorithms/actor_critic/deep_actor_critic/trpo.py @@ -82,26 +82,22 @@ def __init__(self, mdp_info, policy, critic_params, ent_coeff=0., max_kl=.001, l super().__init__(mdp_info, policy, None) def fit(self, dataset, **info): - state, action, reward, next_state, absorbing, last = dataset.parse() - x = state.astype(np.float32) - u = action.astype(np.float32) - r = reward.astype(np.float32) - xn = next_state.astype(np.float32) - - obs = to_float_tensor(x, self.policy.use_cuda) - act = to_float_tensor(u, self.policy.use_cuda) - v_target, np_adv = compute_gae(self._V, x, xn, r, absorbing, last, - self.mdp_info.gamma, self._lambda()) - np_adv = (np_adv - np.mean(np_adv)) / (np.std(np_adv) + 1e-8) - adv = to_float_tensor(np_adv, self.policy.use_cuda) + state, action, reward, next_state, absorbing, last = dataset.parse(to='torch') + + v_target, adv = compute_gae(self._V, state, next_state, reward, absorbing, last, + self.mdp_info.gamma, self._lambda()) + adv = (adv - torch.mean(adv)) / (torch.std(adv) + 1e-8) + + adv = adv.detach() + v_target = v_target.detach() # Policy update self._old_policy = 
deepcopy(self.policy) - old_pol_dist = self._old_policy.distribution_t(obs) - old_log_prob = self._old_policy.log_prob_t(obs, act).detach() + old_pol_dist = self._old_policy.distribution_t(state) + old_log_prob = self._old_policy.log_prob_t(state, action).detach() zero_grad(self.policy.parameters()) - loss = self._compute_loss(obs, act, adv, old_log_prob) + loss = self._compute_loss(state, action, adv, old_log_prob) prev_loss = loss.item() @@ -110,26 +106,26 @@ def fit(self, dataset, **info): g = get_gradient(self.policy.parameters()) # Compute direction through conjugate gradient - stepdir = self._conjugate_gradient(g, obs, old_pol_dist) + stepdir = self._conjugate_gradient(g, state, old_pol_dist) # Line search - self._line_search(obs, act, adv, old_log_prob, old_pol_dist, prev_loss, stepdir) + self._line_search(state, action, adv, old_log_prob, old_pol_dist, prev_loss, stepdir) # VF update - self._V.fit(x, v_target, **self._critic_fit_params) + self._V.fit(state, v_target, **self._critic_fit_params) # Print fit information - self._log_info(dataset, x, v_target, old_pol_dist) + self._log_info(dataset, state, v_target, old_pol_dist) self._iter += 1 - def _fisher_vector_product(self, p, obs, old_pol_dist): - p_tensor = torch.from_numpy(p) - if self.policy.use_cuda: - p_tensor = p_tensor.cuda() + # def _fisher_vector_product(self, p, obs, old_pol_dist): + # p_tensor = torch.from_numpy(p) + # if self.policy.use_cuda: + # p_tensor = p_tensor.cuda() + # + # return self._fisher_vector_product_t(p_tensor, obs, old_pol_dist) - return self._fisher_vector_product_t(p_tensor, obs, old_pol_dist) - - def _fisher_vector_product_t(self, p, obs, old_pol_dist): + def _fisher_vector_product(self, p, obs, old_pol_dist): kl = self._compute_kl(obs, old_pol_dist) grads = torch.autograd.grad(kl, self.policy.parameters(), create_graph=True) flat_grad_kl = torch.cat([grad.view(-1) for grad in grads]) @@ -141,13 +137,13 @@ def _fisher_vector_product_t(self, p, obs, old_pol_dist): return flat_grad_grad_kl + p * self._cg_damping() def _conjugate_gradient(self, b, obs, old_pol_dist): - p = b.detach().cpu().numpy() - r = b.detach().cpu().numpy() - x = np.zeros_like(p) + p = b.detach() + r = b.detach() + x = torch.zeros_like(p) r2 = r.dot(r) for i in range(self._n_epochs_cg()): - z = self._fisher_vector_product(p, obs, old_pol_dist).detach().cpu().numpy() + z = self._fisher_vector_product(p, obs, old_pol_dist).detach() v = r2 / p.dot(z) x += v * p r -= v * z @@ -162,10 +158,10 @@ def _conjugate_gradient(self, b, obs, old_pol_dist): def _line_search(self, obs, act, adv, old_log_prob, old_pol_dist, prev_loss, stepdir): # Compute optimal step size - direction = self._fisher_vector_product(stepdir, obs, old_pol_dist).detach().cpu().numpy() + direction = self._fisher_vector_product(stepdir, obs, old_pol_dist).detach() shs = .5 * stepdir.dot(direction) - lm = np.sqrt(shs / self._max_kl()) - full_step = stepdir / lm + lm = torch.sqrt(shs / self._max_kl()) + full_step = (stepdir / lm).detach().cpu().numpy() stepsize = 1. 
# Save old policy parameters diff --git a/mushroom_rl/utils/value_functions.py b/mushroom_rl/utils/value_functions.py index 0b5e8e7bb..e93ebb539 100644 --- a/mushroom_rl/utils/value_functions.py +++ b/mushroom_rl/utils/value_functions.py @@ -1,4 +1,4 @@ -import numpy as np +import torch def compute_advantage_montecarlo(V, s, ss, r, absorbing, gamma): @@ -9,13 +9,13 @@ def compute_advantage_montecarlo(V, s, ss, r, absorbing, gamma): Args: V (Regressor): the current value function regressor; - s (numpy.ndarray): the set of states in which we want + s (torch.tensor): the set of states in which we want to evaluate the advantage; - ss (numpy.ndarray): the set of next states in which we want + ss (torch.tensor): the set of next states in which we want to evaluate the advantage; - r (numpy.ndarray): the reward obtained in each transition + r (torch.tensor): the reward obtained in each transition from state s to state ss; - absorbing (numpy.ndarray): an array of boolean flags indicating + absorbing (torch.tensor): an array of boolean flags indicating if the reached state is absorbing; gamma (float): the discount factor of the considered problem. Returns: @@ -23,17 +23,17 @@ def compute_advantage_montecarlo(V, s, ss, r, absorbing, gamma): and the advantage function. """ r = r.squeeze() - q = np.zeros(len(r)) - v = V(s).squeeze() + q = torch.zeros(len(r)) + v = V(s, output_tensor=True).squeeze() q_next = V(ss[-1]).squeeze().item() for rev_k in range(len(r)): k = len(r) - rev_k - 1 - q_next = r[k] + gamma * q_next * (1. - absorbing[k]) + q_next = r[k] + gamma * q_next * (1 - absorbing[k].int()) q[k] = q_next adv = q - v - return q[:, np.newaxis], adv[:, np.newaxis] + return q[:, None], adv[:, None] def compute_advantage(V, s, ss, r, absorbing, gamma): @@ -43,25 +43,25 @@ def compute_advantage(V, s, ss, r, absorbing, gamma): Args: V (Regressor): the current value function regressor; - s (numpy.ndarray): the set of states in which we want + s (torch.tensor): the set of states in which we want to evaluate the advantage; - ss (numpy.ndarray): the set of next states in which we want + ss (torch.tensor): the set of next states in which we want to evaluate the advantage; - r (numpy.ndarray): the reward obtained in each transition + r (torch.tensor): the reward obtained in each transition from state s to state ss; - absorbing (numpy.ndarray): an array of boolean flags indicating + absorbing (torch.tensor): an array of boolean flags indicating if the reached state is absorbing; gamma (float): the discount factor of the considered problem. Returns: The new estimate for the value function of the next state and the advantage function. 
""" - v = V(s).squeeze() - v_next = V(ss).squeeze() * (1 - absorbing) + v = V(s, output_tensor=True).squeeze() + v_next = V(ss).squeeze() * (1 - absorbing.int()) q = r + gamma * v_next adv = q - v - return q[:, np.newaxis], adv[:, np.newaxis] + return q[:, None], adv[:, None] def compute_gae(V, s, ss, r, absorbing, last, gamma, lam): @@ -75,15 +75,15 @@ def compute_gae(V, s, ss, r, absorbing, last, gamma, lam): Args: V (Regressor): the current value function regressor; - s (numpy.ndarray): the set of states in which we want + s (torch.tensor): the set of states in which we want to evaluate the advantage; - ss (numpy.ndarray): the set of next states in which we want + ss (torch.tensor): the set of next states in which we want to evaluate the advantage; - r (numpy.ndarray): the reward obtained in each transition + r (torch.tensor): the reward obtained in each transition from state s to state ss; - absorbing (numpy.ndarray): an array of boolean flags indicating + absorbing (torch.tensor): an array of boolean flags indicating if the reached state is absorbing; - last (numpy.ndarray): an array of boolean flags indicating + last (torch.tensor): an array of boolean flags indicating if the reached state is the last of the trajectory; gamma (float): the discount factor of the considered problem; lam (float): the value for the lamba coefficient used by GEA @@ -92,9 +92,9 @@ def compute_gae(V, s, ss, r, absorbing, last, gamma, lam): The new estimate for the value function of the next state and the estimated generalized advantage. """ - v = V(s) - v_next = V(ss) - gen_adv = np.empty_like(v) + v = V(s, output_tensor=True) + v_next = V(ss, output_tensor=True) + gen_adv = torch.empty_like(v) for rev_k in range(len(v)): k = len(v) - rev_k - 1 if last[k] or rev_k == 0: From 5f253fc2a3df94f6977e6a81bdd25de8b234a54f Mon Sep 17 00:00:00 2001 From: boris-il-forte Date: Thu, 12 Oct 2023 19:11:33 +0200 Subject: [PATCH 11/24] Improved dataset testing and bug fixes - fixed some bugs in the generation of the dataset from arrays - improved testing and saving --- .../core/_dataset_impl/list_dataset.py | 4 +- .../core/_dataset_impl/numpy_dataset.py | 2 +- .../core/_dataset_impl/torch_dataset.py | 2 +- mushroom_rl/core/dataset.py | 10 +- tests/core/test_dataset.py | 92 +++++++++++++++---- 5 files changed, 87 insertions(+), 23 deletions(-) diff --git a/mushroom_rl/core/_dataset_impl/list_dataset.py b/mushroom_rl/core/_dataset_impl/list_dataset.py index 1fc45bad3..60a3b7bd1 100644 --- a/mushroom_rl/core/_dataset_impl/list_dataset.py +++ b/mushroom_rl/core/_dataset_impl/list_dataset.py @@ -20,7 +20,9 @@ def from_array(cls, states, actions, rewards, next_states, absorbings, lasts): for s, a, r, ss, ab, last in zip(states, actions, rewards, next_states, absorbings.astype(bool), lasts.astype(bool) ): - dataset.append((s, a, r.item(), ss, ab.item(), last.item())) + dataset.append(s, a, r.item(), ss, ab.item(), last.item()) + + return dataset def __len__(self): return len(self._dataset) diff --git a/mushroom_rl/core/_dataset_impl/numpy_dataset.py b/mushroom_rl/core/_dataset_impl/numpy_dataset.py index 613e3fd71..513a68b0f 100644 --- a/mushroom_rl/core/_dataset_impl/numpy_dataset.py +++ b/mushroom_rl/core/_dataset_impl/numpy_dataset.py @@ -40,7 +40,7 @@ def from_array(cls, states, actions, rewards, next_states, absorbings, lasts): absorbings = absorbings.numpy() lasts = lasts.numpy() - dataset = cls.__new__() + dataset = cls.__new__(cls) dataset._state_type = states.dtype dataset._action_type = actions.dtype diff --git 
a/mushroom_rl/core/_dataset_impl/torch_dataset.py b/mushroom_rl/core/_dataset_impl/torch_dataset.py index 3ab4a5557..d304ae9c4 100644 --- a/mushroom_rl/core/_dataset_impl/torch_dataset.py +++ b/mushroom_rl/core/_dataset_impl/torch_dataset.py @@ -32,7 +32,7 @@ def __init__(self, state_type, state_shape, action_type, action_shape, reward_sh @classmethod def from_array(cls, states, actions, rewards, next_states, absorbings, lasts): - dataset = cls.__new__() + dataset = cls.__new__(cls) dataset._state_type = states.dtype dataset._action_type = actions.dtype diff --git a/mushroom_rl/core/dataset.py b/mushroom_rl/core/dataset.py index 23c8f7c05..64ab22efe 100644 --- a/mushroom_rl/core/dataset.py +++ b/mushroom_rl/core/dataset.py @@ -68,7 +68,7 @@ def from_array(cls, states, actions, rewards, next_states, absorbings, lasts, in assert (len(states) == len(actions) == len(rewards) == len(next_states) == len(absorbings) == len(lasts)) - dataset = cls.__new__() + dataset = cls.__new__(cls) dataset._gamma = gamma if info is None: dataset._info = defaultdict(list) @@ -77,14 +77,18 @@ def from_array(cls, states, actions, rewards, next_states, absorbings, lasts, in if backend == 'numpy': dataset._data = NumpyDataset.from_array(states, actions, rewards, next_states, absorbings, lasts) + dataset._converter = NumpyConversion elif backend == 'torch': dataset._data = TorchDataset.from_array(states, actions, rewards, next_states, absorbings, lasts) + dataset._converter = TorchConversion else: dataset._data = ListDataset.from_array(states, actions, rewards, next_states, absorbings, lasts) + dataset._converter = ListConversion dataset._add_save_attr( - _info='mushroom', - _data='pickle', + _info='pickle', + _data='mushroom', + _converter='primitive', _gamma='primitive' ) diff --git a/tests/core/test_dataset.py b/tests/core/test_dataset.py index bb3f1d9bb..474205537 100644 --- a/tests/core/test_dataset.py +++ b/tests/core/test_dataset.py @@ -1,16 +1,14 @@ import numpy as np +import torch -from mushroom_rl.core import Core +from mushroom_rl.core import Core, Dataset from mushroom_rl.algorithms.value import SARSA from mushroom_rl.environments import GridWorld from mushroom_rl.utils.parameters import Parameter from mushroom_rl.policy import EpsGreedy -def test_dataset(): - np.random.seed(88) - - mdp = GridWorld(3, 3, (2,2)) +def generate_dataset(mdp, n_episodes): epsilon = Parameter(value=0.) alpha = Parameter(value=0.) 
pi = EpsGreedy(epsilon=epsilon) @@ -18,16 +16,22 @@ def test_dataset(): agent = SARSA(mdp.info, pi, alpha) core = Core(agent, mdp) - dataset = core.evaluate(n_episodes=10) + return core.evaluate(n_episodes=n_episodes) + + +def test_dataset(): + np.random.seed(42) + mdp = GridWorld(3, 3, (2, 2)) + dataset = generate_dataset(mdp, 10) J = dataset.compute_J(mdp.info.gamma) - J_test = np.array([1.16106307e-03, 2.78128389e-01, 1.66771817e+00, 3.09031544e-01, - 1.19725152e-01, 9.84770902e-01, 1.06111661e-02, 2.05891132e+00, - 2.28767925e+00, 4.23911583e-01]) + J_test = np.array([4.304672100000001, 2.287679245496101, 3.138105960900001, 0.13302794647291147, + 7.290000000000001, 1.8530201888518416, 1.3508517176729928, 0.011790184577738602, + 1.3508517176729928, 7.290000000000001]) assert np.allclose(J, J_test) L = dataset.episodes_length - L_test = np.array([87, 35, 18, 34, 43, 23, 66, 16, 15, 31]) + L_test = np.array([9, 15, 12, 42, 4, 17, 20, 65, 20, 4]) assert np.array_equal(L, L_test) dataset_ep = dataset.select_first_episodes(3) @@ -39,10 +43,10 @@ def test_dataset(): samples = dataset.select_random_samples(2) s, a, r, ss, ab, last = samples.parse() - s_test = np.array([[6.], [1.]]) - a_test = np.array([[0.], [1.]]) + s_test = np.array([[5.], [6.]]) + a_test = np.array([[3.], [0.]]) r_test = np.zeros(2) - ss_test = np.array([[3], [4]]) + ss_test = np.array([[5], [3]]) ab_test = np.zeros(2) last_test = np.zeros(2) assert np.array_equal(s, s_test) @@ -58,8 +62,62 @@ def test_dataset(): index = np.sum(L_test[:2]) + L_test[2]//2 min_J, max_J, mean_J, median_J, n_episodes = dataset[:index].compute_metrics(mdp.info.gamma) - assert min_J == 0.0011610630703530948 - assert max_J == 0.2781283894436937 - assert mean_J == 0.1396447262570234 - assert median_J == 0.1396447262570234 + assert min_J == 2.287679245496101 + assert max_J == 4.304672100000001 + assert mean_J == 3.296175672748051 + assert median_J == 3.296175672748051 assert n_episodes == 2 + + +def test_dataset_creation(): + np.random.seed(42) + + mdp = GridWorld(3, 3, (2, 2)) + dataset = generate_dataset(mdp, 5) + + parsed = tuple(dataset.parse()) + parsed_torch = (torch.from_numpy(array) for array in parsed) + + print(len(parsed)) + + new_numpy_dataset = Dataset.from_array(*parsed, gamma=mdp.info.gamma) + new_list_dataset = Dataset.from_array(*parsed, gamma=mdp.info.gamma, backend='list') + new_torch_dataset = Dataset.from_array(*parsed, gamma=mdp.info.gamma, backend='torch') + + assert vars(dataset).keys() == vars(new_numpy_dataset).keys() + assert vars(dataset).keys() == vars(new_list_dataset).keys() + assert vars(dataset).keys() == vars(new_torch_dataset).keys() + + for array_1, array_2 in zip(parsed, new_numpy_dataset.parse()): + assert np.array_equal(array_1, array_2) + + for array_1, array_2 in zip(parsed, new_list_dataset.parse()): + assert np.array_equal(array_1, array_2) + + for array_1, array_2 in zip(parsed_torch, new_torch_dataset.parse(to='torch')): + assert torch.equal(array_1, array_2) + + +def test_dataset_loading(tmpdir): + np.random.seed(42) + + mdp = GridWorld(3, 3, (2, 2)) + dataset = generate_dataset(mdp, 20) + + path = tmpdir / 'dataset_test.msh' + dataset.save(path) + + new_dataset = dataset.load(path) + + assert vars(dataset).keys() == vars(new_dataset).keys() + + assert np.array_equal(dataset.state, new_dataset.state) and \ + np.array_equal(dataset.action, new_dataset.action) and \ + np.array_equal(dataset.reward, new_dataset.reward) and \ + np.array_equal(dataset.next_state, new_dataset.next_state) and \ + 
np.array_equal(dataset.absorbing, new_dataset.absorbing) and \ + np.array_equal(dataset.last, new_dataset.last) + + assert dataset._gamma == new_dataset._gamma + + From 6ca1549b1c0f45ba7fdffe547afc2b0f92a701fa Mon Sep 17 00:00:00 2001 From: boris-il-forte Date: Fri, 13 Oct 2023 17:42:06 +0200 Subject: [PATCH 12/24] Work on core - renamed folder containing core implementation details - removed copy calls on preprocessors --- mushroom_rl/core/{_dataset_impl => _impl}/__init__.py | 0 mushroom_rl/core/{_dataset_impl => _impl}/list_dataset.py | 0 mushroom_rl/core/{_dataset_impl => _impl}/numpy_dataset.py | 0 mushroom_rl/core/{_dataset_impl => _impl}/torch_dataset.py | 0 mushroom_rl/core/{_dataset_impl => _impl}/type_conversions.py | 0 mushroom_rl/core/core.py | 4 ++-- mushroom_rl/core/dataset.py | 2 +- 7 files changed, 3 insertions(+), 3 deletions(-) rename mushroom_rl/core/{_dataset_impl => _impl}/__init__.py (100%) rename mushroom_rl/core/{_dataset_impl => _impl}/list_dataset.py (100%) rename mushroom_rl/core/{_dataset_impl => _impl}/numpy_dataset.py (100%) rename mushroom_rl/core/{_dataset_impl => _impl}/torch_dataset.py (100%) rename mushroom_rl/core/{_dataset_impl => _impl}/type_conversions.py (100%) diff --git a/mushroom_rl/core/_dataset_impl/__init__.py b/mushroom_rl/core/_impl/__init__.py similarity index 100% rename from mushroom_rl/core/_dataset_impl/__init__.py rename to mushroom_rl/core/_impl/__init__.py diff --git a/mushroom_rl/core/_dataset_impl/list_dataset.py b/mushroom_rl/core/_impl/list_dataset.py similarity index 100% rename from mushroom_rl/core/_dataset_impl/list_dataset.py rename to mushroom_rl/core/_impl/list_dataset.py diff --git a/mushroom_rl/core/_dataset_impl/numpy_dataset.py b/mushroom_rl/core/_impl/numpy_dataset.py similarity index 100% rename from mushroom_rl/core/_dataset_impl/numpy_dataset.py rename to mushroom_rl/core/_impl/numpy_dataset.py diff --git a/mushroom_rl/core/_dataset_impl/torch_dataset.py b/mushroom_rl/core/_impl/torch_dataset.py similarity index 100% rename from mushroom_rl/core/_dataset_impl/torch_dataset.py rename to mushroom_rl/core/_impl/torch_dataset.py diff --git a/mushroom_rl/core/_dataset_impl/type_conversions.py b/mushroom_rl/core/_impl/type_conversions.py similarity index 100% rename from mushroom_rl/core/_dataset_impl/type_conversions.py rename to mushroom_rl/core/_impl/type_conversions.py diff --git a/mushroom_rl/core/core.py b/mushroom_rl/core/core.py index 759e4edf7..b29baa892 100644 --- a/mushroom_rl/core/core.py +++ b/mushroom_rl/core/core.py @@ -206,7 +206,7 @@ def _step(self, render, record): self._episode_steps < self.mdp.info.horizon and not absorbing) state = self._state - next_state = self._preprocess(next_state.copy()) + next_state = self._preprocess(next_state) self._state = next_state return (state, action, reward, next_state, absorbing, last), step_info @@ -223,7 +223,7 @@ def reset(self, initial_states=None): self.agent.episode_start() - self._state = self._preprocess(self.mdp.reset(initial_state).copy()) + self._state = self._preprocess(self.mdp.reset(initial_state)) self.agent.next_action = None self._episode_steps = 0 diff --git a/mushroom_rl/core/dataset.py b/mushroom_rl/core/dataset.py index 64ab22efe..b73abee21 100644 --- a/mushroom_rl/core/dataset.py +++ b/mushroom_rl/core/dataset.py @@ -4,7 +4,7 @@ from mushroom_rl.core.serialization import Serializable -from ._dataset_impl import * +from ._impl import * class Dataset(Serializable): From bde69baeb555b2cecf1f58d677e8b337f60d59ec Mon Sep 17 00:00:00 2001 From: 
boris-il-forte Date: Sat, 14 Oct 2023 15:31:43 +0200 Subject: [PATCH 13/24] Added type conversion - now the agent outputs arrays in the same type as the environment --- .../algorithms/value/dqn/abstract_dqn.py | 11 ++------ mushroom_rl/core/_impl/type_conversions.py | 25 +++++++++++++++++ mushroom_rl/core/agent.py | 27 ++++++++++++++++--- mushroom_rl/core/dataset.py | 5 ++-- 4 files changed, 52 insertions(+), 16 deletions(-) diff --git a/mushroom_rl/algorithms/value/dqn/abstract_dqn.py b/mushroom_rl/algorithms/value/dqn/abstract_dqn.py index c38158cb0..70a9bec10 100644 --- a/mushroom_rl/algorithms/value/dqn/abstract_dqn.py +++ b/mushroom_rl/algorithms/value/dqn/abstract_dqn.py @@ -121,16 +121,9 @@ def _fit_prioritized(self, dataset): self.approximator.fit(state, action, q, weights=is_weight, **self._fit_params) - def draw_action(self, state): - action = super().draw_action(np.array(state)) - - return action - - def _initialize_regressors(self, approximator, apprx_params_train, - apprx_params_target): + def _initialize_regressors(self, approximator, apprx_params_train, apprx_params_target): self.approximator = Regressor(approximator, **apprx_params_train) - self.target_approximator = Regressor(approximator, - **apprx_params_target) + self.target_approximator = Regressor(approximator, **apprx_params_target) self._update_target() def _update_target(self): diff --git a/mushroom_rl/core/_impl/type_conversions.py b/mushroom_rl/core/_impl/type_conversions.py index 911f50a1e..2a7410c6e 100644 --- a/mushroom_rl/core/_impl/type_conversions.py +++ b/mushroom_rl/core/_impl/type_conversions.py @@ -3,6 +3,15 @@ class DataConversion(object): + @staticmethod + def get_converter(backend): + if backend == 'numpy': + return NumpyConversion + elif backend == 'torch': + return TorchConversion + else: + return ListConversion + @classmethod def convert(cls, *arrays, to='numpy'): if to == 'numpy': @@ -28,6 +37,10 @@ def to_numpy(array): def to_torch(array): raise NotImplementedError + @staticmethod + def to_backend_array(cls, array): + raise NotImplementedError + class NumpyConversion(DataConversion): @staticmethod @@ -38,6 +51,10 @@ def to_numpy(array): def to_torch(array): return torch.from_numpy(array) + @staticmethod + def to_backend_array(cls, array): + return cls.to_numpy(array) + class TorchConversion(DataConversion): @staticmethod @@ -48,6 +65,10 @@ def to_numpy(array): def to_torch(array): return array + @staticmethod + def to_backend_array(cls, array): + return cls.to_torch(array) + class ListConversion(DataConversion): @staticmethod @@ -58,6 +79,10 @@ def to_numpy(array): def to_torch(array): return torch.as_tensor(array) + @staticmethod + def to_backend_array(cls, array): + return cls.to_numpy(array) + diff --git a/mushroom_rl/core/agent.py b/mushroom_rl/core/agent.py index c8e18f2a6..c0c20209a 100644 --- a/mushroom_rl/core/agent.py +++ b/mushroom_rl/core/agent.py @@ -1,5 +1,7 @@ from mushroom_rl.core.serialization import Serializable +from ._impl import * + class Agent(Serializable): """ @@ -8,31 +10,39 @@ class Agent(Serializable): """ - def __init__(self, mdp_info, policy, features=None): + def __init__(self, mdp_info, policy, features=None, backend='numpy'): """ Constructor. Args: mdp_info (MDPInfo): information about the MDP; policy (Policy): the policy followed by the agent; - features (object, None): features to extract from the state. + features (object, None): features to extract from the state; + backend (str, 'numpy'): array backend to be used by the algorithm. 
""" self.mdp_info = mdp_info self.policy = policy + self.backend = backend self.phi = features self.next_action = None + self._agent_converter = DataConversion.get_converter(backend) + self._env_converter = DataConversion.get_converter(self.mdp_info.backend) + self._preprocessors = list() self._logger = None self._add_save_attr( mdp_info='pickle', policy='mushroom', + backend='primitive', phi='pickle', next_action='numpy', + _agent_converter = 'primitive', + _env_converter='primitive', _preprocessors='mushroom', _logger='none' ) @@ -64,12 +74,14 @@ def draw_action(self, state): state = self.phi(state) if self.next_action is None: - return self.policy.draw_action(state) + action = self.policy.draw_action(state) else: action = self.next_action self.next_action = None - return action + return action + + #return self._convert_to_env_backend(action) def episode_start(self): """ @@ -116,3 +128,10 @@ def preprocessors(self): """ return self._preprocessors + + def _convert_to_env_backend(self, array): + return self._env_converter.to_backend_array(self._agent_converter, array) + + def _convert_to_agent_backend(self, array): + return self._agent_converter.to_backend_array(self._env_converter, array) + diff --git a/mushroom_rl/core/dataset.py b/mushroom_rl/core/dataset.py index b73abee21..6ba4e7b62 100644 --- a/mushroom_rl/core/dataset.py +++ b/mushroom_rl/core/dataset.py @@ -30,13 +30,12 @@ def __init__(self, mdp_info, n_steps=None, n_episodes=None): if mdp_info.backend == 'numpy': self._data = NumpyDataset(state_type, state_shape, action_type, action_shape, reward_shape) - self._converter = NumpyConversion elif mdp_info.backend == 'torch': self._data = TorchDataset(state_type, state_shape, action_type, action_shape, reward_shape) - self._converter = TorchConversion else: self._data = ListDataset() - self._converter = ListConversion + + self._converter = DataConversion.get_converter(mdp_info.backend) self._gamma = mdp_info.gamma From ac80f55bee70f0928ed25398f47356b93ec7ab6b Mon Sep 17 00:00:00 2001 From: boris-il-forte Date: Sat, 14 Oct 2023 16:27:37 +0200 Subject: [PATCH 14/24] Fixes in TD and envs - added item method in the dataset class - Fixed side effects in puddleworld env - Fixed interface of td to use new dataset class - Cleanup of various environments to use wider file length convention --- .../value/td/sarsa_lambda_continuous.py | 12 +++----- mushroom_rl/algorithms/value/td/td.py | 24 +--------------- .../value/td/true_online_sarsa_lambda.py | 12 +++----- .../value/td/weighted_q_learning.py | 10 ++----- mushroom_rl/core/agent.py | 2 +- mushroom_rl/core/dataset.py | 4 +++ mushroom_rl/environments/atari.py | 6 ++-- mushroom_rl/environments/car_on_hill.py | 6 ++-- mushroom_rl/environments/cart_pole.py | 11 ++++---- mushroom_rl/environments/grid_world.py | 18 ++++-------- mushroom_rl/environments/inverted_pendulum.py | 3 +- mushroom_rl/environments/lqr.py | 13 +++------ mushroom_rl/environments/minigrid_env.py | 4 +-- mushroom_rl/environments/puddle_world.py | 28 ++++++++----------- mushroom_rl/environments/segway.py | 13 ++++----- mushroom_rl/environments/ship_steering.py | 5 +--- 16 files changed, 57 insertions(+), 114 deletions(-) diff --git a/mushroom_rl/algorithms/value/td/sarsa_lambda_continuous.py b/mushroom_rl/algorithms/value/td/sarsa_lambda_continuous.py index 1c434bce1..761c6f191 100644 --- a/mushroom_rl/algorithms/value/td/sarsa_lambda_continuous.py +++ b/mushroom_rl/algorithms/value/td/sarsa_lambda_continuous.py @@ -10,8 +10,7 @@ class SARSALambdaContinuous(TD): Continuous version 
of SARSA(lambda) algorithm. """ - def __init__(self, mdp_info, policy, approximator, learning_rate, - lambda_coeff, features, approximator_params=None): + def __init__(self, mdp_info, policy, approximator, learning_rate, lambda_coeff, features, approximator_params=None): """ Constructor. @@ -19,8 +18,7 @@ def __init__(self, mdp_info, policy, approximator, learning_rate, lambda_coeff ([float, Parameter]): eligibility trace coefficient. """ - approximator_params = dict() if approximator_params is None else \ - approximator_params + approximator_params = dict() if approximator_params is None else approximator_params Q = Regressor(approximator, **approximator_params) self.e = np.zeros(Q.weights_size) @@ -39,13 +37,11 @@ def _update(self, state, action, reward, next_state, absorbing): alpha = self._alpha(state, action) - self.e = self.mdp_info.gamma * self._lambda() * self.e + self.Q.diff( - phi_state, action) + self.e = self.mdp_info.gamma * self._lambda() * self.e + self.Q.diff(phi_state, action) self.next_action = self.draw_action(next_state) phi_next_state = self.phi(next_state) - q_next = self.Q.predict(phi_next_state, - self.next_action) if not absorbing else 0. + q_next = self.Q.predict(phi_next_state, self.next_action) if not absorbing else 0. delta = reward + self.mdp_info.gamma * q_next - q_current diff --git a/mushroom_rl/algorithms/value/td/td.py b/mushroom_rl/algorithms/value/td/td.py index 29dd92b48..928599f6c 100644 --- a/mushroom_rl/algorithms/value/td/td.py +++ b/mushroom_rl/algorithms/value/td/td.py @@ -31,31 +31,9 @@ def __init__(self, mdp_info, policy, approximator, learning_rate, def fit(self, dataset, **info): assert len(dataset) == 1 - state, action, reward, next_state, absorbing = self._parse(dataset) + state, action, reward, next_state, absorbing, _ = dataset.item() self._update(state, action, reward, next_state, absorbing) - @staticmethod - def _parse(dataset): - """ - Utility to parse the dataset that is supposed to contain only a sample. - - Args: - dataset (list): the current episode step. - - Returns: - A tuple containing state, action, reward, next state, absorbing and - last flag. - - """ - sample = dataset[0] - state = sample[0] - action = sample[1] - reward = sample[2] - next_state = sample[3] - absorbing = sample[4] - - return state, action, reward, next_state, absorbing - def _update(self, state, action, reward, next_state, absorbing): """ Update the Q-table. diff --git a/mushroom_rl/algorithms/value/td/true_online_sarsa_lambda.py b/mushroom_rl/algorithms/value/td/true_online_sarsa_lambda.py index 8ea545573..509d24d66 100644 --- a/mushroom_rl/algorithms/value/td/true_online_sarsa_lambda.py +++ b/mushroom_rl/algorithms/value/td/true_online_sarsa_lambda.py @@ -22,8 +22,7 @@ def __init__(self, mdp_info, policy, learning_rate, lambda_coeff, lambda_coeff ([float, Parameter]): eligibility trace coefficient. 
""" - approximator_params = dict() if approximator_params is None else \ - approximator_params + approximator_params = dict() if approximator_params is None else approximator_params Q = Regressor(LinearApproximator, **approximator_params) self.e = np.zeros(Q.weights_size) @@ -40,8 +39,7 @@ def __init__(self, mdp_info, policy, learning_rate, lambda_coeff, def _update(self, state, action, reward, next_state, absorbing): phi_state = self.phi(state) - phi_state_action = get_action_features(phi_state, action, - self.mdp_info.action_space.n) + phi_state_action = get_action_features(phi_state, action, self.mdp_info.action_space.n) q_current = self.Q.predict(phi_state, action) if self._q_old is None: @@ -55,14 +53,12 @@ def _update(self, state, action, reward, next_state, absorbing): self.next_action = self.draw_action(next_state) phi_next_state = self.phi(next_state) - q_next = self.Q.predict(phi_next_state, - self.next_action) if not absorbing else 0. + q_next = self.Q.predict(phi_next_state, self.next_action) if not absorbing else 0. delta = reward + self.mdp_info.gamma * q_next - self._q_old theta = self.Q.get_weights() - theta += delta * self.e + alpha * ( - self._q_old - q_current) * phi_state_action + theta += delta * self.e + alpha * (self._q_old - q_current) * phi_state_action self.Q.set_weights(theta) self._q_old = q_next diff --git a/mushroom_rl/algorithms/value/td/weighted_q_learning.py b/mushroom_rl/algorithms/value/td/weighted_q_learning.py index b6cc15463..bae01dbc7 100644 --- a/mushroom_rl/algorithms/value/td/weighted_q_learning.py +++ b/mushroom_rl/algorithms/value/td/weighted_q_learning.py @@ -56,14 +56,11 @@ def _update(self, state, action, reward, next_state, absorbing): alpha = self._alpha(state, action) self.Q[state, action] = q_current + alpha * (target - q_current) - self._Q2[state, action] = q2_current + alpha * ( - target ** 2 - q2_current - ) + self._Q2[state, action] = q2_current + alpha * (target ** 2 - q2_current) self._n_updates[state, action] += 1 - self._w2[state, action] = (1 - alpha) ** 2 * self._w2[ - state, action] + alpha ** 2 + self._w2[state, action] = (1 - alpha) ** 2 * self._w2[state, action] + alpha ** 2 self._w1[state, action] = (1 - alpha) * self._w1[state, action] + alpha if self._n_updates[state, action] > 1: @@ -90,8 +87,7 @@ def _next_q(self, next_state): sigmas[a] = self._sigma[next_state, np.array([a])] if self._sampling: - samples = np.random.normal(np.repeat([means], self._precision, 0), - np.repeat([sigmas], self._precision, 0)) + samples = np.random.normal(np.repeat([means], self._precision, 0), np.repeat([sigmas], self._precision, 0)) max_idx = np.argmax(samples, axis=1) max_idx, max_count = np.unique(max_idx, return_counts=True) count = np.zeros(means.size) diff --git a/mushroom_rl/core/agent.py b/mushroom_rl/core/agent.py index c0c20209a..02f1252b8 100644 --- a/mushroom_rl/core/agent.py +++ b/mushroom_rl/core/agent.py @@ -52,7 +52,7 @@ def fit(self, dataset, **info): Fit step. Args: - dataset (list): the dataset. + dataset (Dataset): the dataset. 
""" raise NotImplementedError('Agent is an abstract class') diff --git a/mushroom_rl/core/dataset.py b/mushroom_rl/core/dataset.py index 6ba4e7b62..8c62ce819 100644 --- a/mushroom_rl/core/dataset.py +++ b/mushroom_rl/core/dataset.py @@ -119,6 +119,10 @@ def get_view(self, index): return dataset + def item(self): + assert len(self) == 1 + return self[0] + def __getitem__(self, index): if isinstance(index, (slice, np.ndarray)): return self.get_view(index) diff --git a/mushroom_rl/environments/atari.py b/mushroom_rl/environments/atari.py index c11321ea9..c78d9f2bb 100644 --- a/mushroom_rl/environments/atari.py +++ b/mushroom_rl/environments/atari.py @@ -129,13 +129,11 @@ def step(self, action): if self._episode_ends_at_life: absorbing = True self._lives = info['lives'] - self._force_fire = self.env.unwrapped.get_action_meanings()[ - 1] == 'FIRE' + self._force_fire = self.env.unwrapped.get_action_meanings()[1] == 'FIRE' self._state.append(preprocess_frame(obs, self._img_size)) - return LazyFrames(list(self._state), - self._history_length), reward, absorbing, info + return LazyFrames(list(self._state), self._history_length), reward, absorbing, info def render(self, record=False): self.env.render(mode='human') diff --git a/mushroom_rl/environments/car_on_hill.py b/mushroom_rl/environments/car_on_hill.py index 32eacdc02..e1547ee43 100644 --- a/mushroom_rl/environments/car_on_hill.py +++ b/mushroom_rl/environments/car_on_hill.py @@ -55,12 +55,10 @@ def step(self, action): self._state = new_state[-1, :-1] - if self._state[0] < -self.max_pos or \ - np.abs(self._state[1]) > self.max_velocity: + if self._state[0] < -self.max_pos or np.abs(self._state[1]) > self.max_velocity: reward = -1. absorbing = True - elif self._state[0] > self.max_pos and \ - np.abs(self._state[1]) <= self.max_velocity: + elif self._state[0] > self.max_pos and np.abs(self._state[1]) <= self.max_velocity: reward = 1. 
absorbing = True else: diff --git a/mushroom_rl/environments/cart_pole.py b/mushroom_rl/environments/cart_pole.py index fe15afe37..ed1b99b05 100644 --- a/mushroom_rl/environments/cart_pole.py +++ b/mushroom_rl/environments/cart_pole.py @@ -103,8 +103,7 @@ def render(self, record=False): direction = -np.sign(self._last_u) * np.array([1, 0]) value = np.abs(self._last_u) - self._viewer.force_arrow(start, direction, value, - self._max_u, self._l / 5) + self._viewer.force_arrow(start, direction, value, self._max_u, self._l / 5) frame = self._viewer.get_frame() if record else None @@ -120,9 +119,9 @@ def _dynamics(self, state, t, u): omega = state[1] d_theta = omega - d_omega = (self._g * np.sin(theta) - self._alpha * self._m * self._l * .5 * - d_theta ** 2 * np.sin(2 * theta) * .5 - self._alpha * np.cos( - theta) * u) / (2 / 3 * self._l - self._alpha * self._m * - self._l * .5 * np.cos(theta) ** 2) + d_omega = (self._g * np.sin(theta) + - self._alpha * self._m * self._l * .5 * d_theta ** 2 * np.sin(2 * theta) * .5 + - self._alpha * np.cos(theta) * u) / (2 / 3 * self._l - + self._alpha * self._m * self._l * .5 * np.cos(theta) ** 2) return d_theta, d_omega diff --git a/mushroom_rl/environments/grid_world.py b/mushroom_rl/environments/grid_world.py index 360aead01..b45ff5a62 100644 --- a/mushroom_rl/environments/grid_world.py +++ b/mushroom_rl/environments/grid_world.py @@ -32,8 +32,7 @@ def __init__(self, mdp_info, height, width, start, goal): self._goal = goal # Visualization - self._viewer = Viewer(self._width, self._height, 500, - self._height * 500 // self._width) + self._viewer = Viewer(self._width, self._height, 500, self._height * 500 // self._width) super().__init__(mdp_info) @@ -56,23 +55,18 @@ def step(self, action): def render(self, record=False): for row in range(1, self._height): for col in range(1, self._width): - self._viewer.line(np.array([col, 0]), - np.array([col, self._height])) - self._viewer.line(np.array([0, row]), - np.array([self._width, row])) + self._viewer.line(np.array([col, 0]), np.array([col, self._height])) + self._viewer.line(np.array([0, row]), np.array([self._width, row])) - goal_center = np.array([.5 + self._goal[1], - self._height - (.5 + self._goal[0])]) + goal_center = np.array([.5 + self._goal[1], self._height - (.5 + self._goal[0])]) self._viewer.square(goal_center, 0, 1, (0, 255, 0)) start_grid = self.convert_to_grid(self._start, self._width) - start_center = np.array([.5 + start_grid[1], - self._height - (.5 + start_grid[0])]) + start_center = np.array([.5 + start_grid[1], self._height - (.5 + start_grid[0])]) self._viewer.square(start_center, 0, 1, (255, 0, 0)) state_grid = self.convert_to_grid(self._state, self._width) - state_center = np.array([.5 + state_grid[1], - self._height - (.5 + state_grid[0])]) + state_center = np.array([.5 + state_grid[1], self._height - (.5 + state_grid[0])]) self._viewer.circle(state_center, .4, (0, 0, 255)) frame = self._viewer.get_frame() if record else None diff --git a/mushroom_rl/environments/inverted_pendulum.py b/mushroom_rl/environments/inverted_pendulum.py index 3fc7ed1b9..97cc34bf7 100644 --- a/mushroom_rl/environments/inverted_pendulum.py +++ b/mushroom_rl/environments/inverted_pendulum.py @@ -65,8 +65,7 @@ def reset(self, state=None): else: self._state = state self._state[0] = normalize_angle(self._state[0]) - self._state[1] = self._bound(self._state[1], -self._max_omega, - self._max_omega) + self._state[1] = self._bound(self._state[1], -self._max_omega, self._max_omega) self._last_u = 0.0 return self._state 
diff --git a/mushroom_rl/environments/lqr.py b/mushroom_rl/environments/lqr.py index 8e317d693..528d9a5b0 100644 --- a/mushroom_rl/environments/lqr.py +++ b/mushroom_rl/environments/lqr.py @@ -114,16 +114,12 @@ def generate(dimensions=None, s_dim=None, a_dim=None, max_pos=np.inf, max_action def reset(self, state=None): if state is None: if self.random_init: - self._state = self._bound( - np.random.uniform(-3, 3, size=self.A.shape[0]), - self.info.observation_space.low, - self.info.observation_space.high - ) + rand_state = np.random.uniform(-3, 3, size=self.A.shape[0]) + self._state = self._bound(rand_state, self.info.observation_space.low, self.info.observation_space.high) elif self._initial_state is not None: self._state = self._initial_state else: - init_value = .9 * self._max_pos if np.isfinite( - self._max_pos) else 10 + init_value = .9 * self._max_pos if np.isfinite(self._max_pos) else 10 self._state = init_value * np.ones(self.A.shape[0]) else: self._state = state @@ -132,8 +128,7 @@ def reset(self, state=None): def step(self, action): x = self._state - u = self._bound(action, self.info.action_space.low, - self.info.action_space.high) + u = self._bound(action, self.info.action_space.low, self.info.action_space.high) reward = -(x.dot(self.Q).dot(x) + u.dot(self.R).dot(u)) self._state = self.A.dot(x) + self.B.dot(u) diff --git a/mushroom_rl/environments/minigrid_env.py b/mushroom_rl/environments/minigrid_env.py index acbb9084e..dcf549704 100644 --- a/mushroom_rl/environments/minigrid_env.py +++ b/mushroom_rl/environments/minigrid_env.py @@ -86,9 +86,9 @@ def reset(self, state=None): def step(self, action): obs, reward, absorbing, info = self.env.step(action) - reward *= 1. # Int to float + reward = float(reward) if reward > 0: - reward = 1. # MiniGrid discounts rewards based on timesteps, but we need raw rewards + reward = 1. # MiniGrid discounts rewards based on timesteps, but we need raw rewards self._state.append(preprocess_frame(obs, self._img_size)) diff --git a/mushroom_rl/environments/puddle_world.py b/mushroom_rl/environments/puddle_world.py index 140623572..ce07dbb72 100644 --- a/mushroom_rl/environments/puddle_world.py +++ b/mushroom_rl/environments/puddle_world.py @@ -72,20 +72,20 @@ def reset(self, state=None): def step(self, action): idx = action[0] - self._state += self._actions[idx] + np.random.uniform( - low=-self._noise_step, high=self._noise_step, size=(2,)) - self._state = np.clip(self._state, 0., 1.) + noise = np.random.uniform(low=-self._noise_step, high=self._noise_step, size=(2,)) + next_state = self._state + self._actions[idx] + noise + next_state = np.clip(next_state, 0., 1.) - absorbing = np.linalg.norm((self._state - self._goal), - ord=1) < self._goal_threshold + absorbing = np.linalg.norm((next_state - self._goal), ord=1) < self._goal_threshold if not absorbing: - reward = np.random.randn() * self._noise_reward + self._get_reward( - self._state) + reward = np.random.randn() * self._noise_reward + self._get_reward(next_state) else: reward = self._reward_goal - return self._state, reward, absorbing, {} + self._state = next_state + + return next_state, reward, absorbing, {} def render(self, record=False): if self._pixels is None: @@ -95,16 +95,14 @@ def render(self, record=False): for j in range(img_size): x = i / img_size y = j / img_size - pixels[i, img_size - 1 - j] = self._get_reward( - np.array([x, y])) + pixels[i, img_size - 1 - j] = self._get_reward(np.array([x, y])) pixels -= pixels.min() pixels *= 255. 
/ pixels.max() self._pixels = np.floor(255 - pixels) self._viewer.background_image(self._pixels) - self._viewer.circle(self._state, 0.01, - color=(0, 255, 0)) + self._viewer.circle(self._state, 0.01, color=(0, 255, 0)) goal_area = [ [-self._goal_threshold, 0], @@ -112,8 +110,7 @@ def render(self, record=False): [self._goal_threshold, 0], [0, -self._goal_threshold] ] - self._viewer.polygon(self._goal, 0, goal_area, - color=(255, 0, 0), width=1) + self._viewer.polygon(self._goal, 0, goal_area, color=(255, 0, 0), width=1) frame = self._viewer.get_frame() if record else None @@ -128,7 +125,6 @@ def stop(self): def _get_reward(self, state): reward = -1. for cen, wid in zip(self._puddle_center, self._puddle_width): - reward -= 2. * norm.pdf(state[0], cen[0], wid[0]) * norm.pdf( - state[1], cen[1], wid[1]) + reward -= 2. * norm.pdf(state[0], cen[0], wid[0]) * norm.pdf(state[1], cen[1], wid[1]) return reward diff --git a/mushroom_rl/environments/segway.py b/mushroom_rl/environments/segway.py index f63958b4b..53be85bf1 100644 --- a/mushroom_rl/environments/segway.py +++ b/mushroom_rl/environments/segway.py @@ -101,12 +101,10 @@ def _dynamics(self, state, t, u): omegaP = d_alpha - dOmegaP = -(h2 * self._l * self._Mp * self._r * np.sin( - alpha) * omegaP**2 - self._g * h1 * self._l * self._Mp * np.sin( - alpha) + (h2 + h1) * u) / (h1 * h3 - h2**2) - dOmegaR = (h3 * self._l * self._Mp * self._r * np.sin( - alpha) * omegaP**2 - self._g * h2 * self._l * self._Mp * np.sin( - alpha) + (h3 + h2) * u) / (h1 * h3 - h2**2) + dOmegaP = -(h2 * self._l * self._Mp * self._r * np.sin( alpha) * omegaP**2 + - self._g * h1 * self._l * self._Mp * np.sin(alpha) + (h2 + h1) * u) / (h1 * h3 - h2**2) + dOmegaR = (h3 * self._l * self._Mp * self._r * np.sin(alpha) * omegaP**2 + - self._g * h2 * self._l * self._Mp * np.sin(alpha) + (h3 + h2) * u) / (h1 * h3 - h2**2) dx = list() dx.append(omegaP) @@ -124,8 +122,7 @@ def render(self, record=False): self._last_x += dx if self._last_x > 2.5 * self._l or self._last_x < -2.5 * self._l: - self._last_x = (2.5 * self._l + self._last_x) % ( - 5 * self._l) - 2.5 * self._l + self._last_x = (2.5 * self._l + self._last_x) % (5 * self._l) - 2.5 * self._l start[0] += self._last_x end[0] += -2 * self._l * np.sin(self._state[0]) + self._last_x diff --git a/mushroom_rl/environments/ship_steering.py b/mushroom_rl/environments/ship_steering.py index e2cc7c073..ccf09a414 100644 --- a/mushroom_rl/environments/ship_steering.py +++ b/mushroom_rl/environments/ship_steering.py @@ -83,10 +83,7 @@ def step(self, action): new_state[2] = normalize_angle(state[2] + state[3] * self.info.dt) new_state[3] = state[3] + (r - state[3]) * self.info.dt / self._T - if new_state[0] > self.field_size \ - or new_state[1] > self.field_size \ - or new_state[0] < 0 or new_state[1] < 0: - + if new_state[0] > self.field_size or new_state[1] > self.field_size or new_state[0] < 0 or new_state[1] < 0: new_state[0] = self._bound(new_state[0], 0, self.field_size) new_state[1] = self._bound(new_state[1], 0, self.field_size) From 8890fab6bd5a7c2fdbf966dab564c53c910ace03 Mon Sep 17 00:00:00 2001 From: boris-il-forte Date: Sat, 14 Oct 2023 18:54:53 +0200 Subject: [PATCH 15/24] Added handling of episode_info - now we have a dictionary also for reset - we can enable the support of context variables using this methodology --- .../classic_actor_critic/stochastic_ac.py | 4 +- .../black_box_optimization.py | 4 +- mushroom_rl/algorithms/value/td/q_lambda.py | 4 +- .../algorithms/value/td/sarsa_lambda.py | 4 +- 
.../value/td/sarsa_lambda_continuous.py | 4 +- .../value/td/true_online_sarsa_lambda.py | 4 +- mushroom_rl/approximators/parametric/cmac.py | 21 ++++------ .../approximators/parametric/linear.py | 18 +++----- .../parametric/torch_approximator.py | 5 +-- mushroom_rl/core/agent.py | 9 ++-- mushroom_rl/core/core.py | 10 ++--- mushroom_rl/core/dataset.py | 41 ++++++++++++++----- mushroom_rl/core/environment.py | 2 +- mushroom_rl/environments/atari.py | 2 +- mushroom_rl/environments/car_on_hill.py | 2 +- mushroom_rl/environments/cart_pole.py | 2 +- mushroom_rl/environments/dm_control_env.py | 2 +- mushroom_rl/environments/finite_mdp.py | 2 +- mushroom_rl/environments/grid_world.py | 2 +- mushroom_rl/environments/gym_env.py | 4 +- mushroom_rl/environments/habitat_env.py | 2 +- mushroom_rl/environments/igibson_env.py | 2 +- mushroom_rl/environments/inverted_pendulum.py | 2 +- mushroom_rl/environments/lqr.py | 2 +- mushroom_rl/environments/minigrid_env.py | 2 +- mushroom_rl/environments/mujoco.py | 2 +- mushroom_rl/environments/puddle_world.py | 2 +- mushroom_rl/environments/pybullet.py | 2 +- mushroom_rl/environments/segway.py | 2 +- mushroom_rl/environments/ship_steering.py | 2 +- .../mujoco_envs/test_ball_in_a_cup.py | 4 +- 31 files changed, 89 insertions(+), 81 deletions(-) diff --git a/mushroom_rl/algorithms/actor_critic/classic_actor_critic/stochastic_ac.py b/mushroom_rl/algorithms/actor_critic/classic_actor_critic/stochastic_ac.py index d74b1d41d..42ab1ec1f 100644 --- a/mushroom_rl/algorithms/actor_critic/classic_actor_critic/stochastic_ac.py +++ b/mushroom_rl/algorithms/actor_critic/classic_actor_critic/stochastic_ac.py @@ -58,11 +58,11 @@ def __init__(self, mdp_info, policy, alpha_theta, alpha_v, lambda_par=.9, _e_theta='numpy' ) - def episode_start(self): + def episode_start(self, episode_info): self._e_v = np.zeros(self._V.weights_size) self._e_theta = np.zeros(self.policy.weights_size) - super().episode_start() + super().episode_start(episode_info) def fit(self, dataset, **info): for step in dataset: diff --git a/mushroom_rl/algorithms/policy_search/black_box_optimization/black_box_optimization.py b/mushroom_rl/algorithms/policy_search/black_box_optimization/black_box_optimization.py index 895a2d8fa..707b00507 100644 --- a/mushroom_rl/algorithms/policy_search/black_box_optimization/black_box_optimization.py +++ b/mushroom_rl/algorithms/policy_search/black_box_optimization/black_box_optimization.py @@ -26,12 +26,12 @@ def __init__(self, mdp_info, distribution, policy, features=None): super().__init__(mdp_info, policy, features) - def episode_start(self): + def episode_start(self, episode_info): theta = self.distribution.sample() self._theta_list.append(theta) self.policy.set_weights(theta) - super().episode_start() + super().episode_start(episode_info) def fit(self, dataset, **info): Jep = dataset.compute_J(self.mdp_info.gamma) diff --git a/mushroom_rl/algorithms/value/td/q_lambda.py b/mushroom_rl/algorithms/value/td/q_lambda.py index 2181fb557..723f388a4 100644 --- a/mushroom_rl/algorithms/value/td/q_lambda.py +++ b/mushroom_rl/algorithms/value/td/q_lambda.py @@ -44,7 +44,7 @@ def _update(self, state, action, reward, next_state, absorbing): self.Q.table += self._alpha(state, action) * delta * self.e.table self.e.table *= self.mdp_info.gamma * self._lambda() - def episode_start(self): + def episode_start(self, episode_info): self.e.reset() - super().episode_start() + super().episode_start(episode_info) diff --git a/mushroom_rl/algorithms/value/td/sarsa_lambda.py 
b/mushroom_rl/algorithms/value/td/sarsa_lambda.py index 0f5a4999c..0e04027d9 100644 --- a/mushroom_rl/algorithms/value/td/sarsa_lambda.py +++ b/mushroom_rl/algorithms/value/td/sarsa_lambda.py @@ -42,7 +42,7 @@ def _update(self, state, action, reward, next_state, absorbing): self.Q.table += self._alpha(state, action) * delta * self.e.table self.e.table *= self.mdp_info.gamma * self._lambda() - def episode_start(self): + def episode_start(self, episode_info): self.e.reset() - super().episode_start() + super().episode_start(episode_info) diff --git a/mushroom_rl/algorithms/value/td/sarsa_lambda_continuous.py b/mushroom_rl/algorithms/value/td/sarsa_lambda_continuous.py index 761c6f191..c5795d0e7 100644 --- a/mushroom_rl/algorithms/value/td/sarsa_lambda_continuous.py +++ b/mushroom_rl/algorithms/value/td/sarsa_lambda_continuous.py @@ -49,7 +49,7 @@ def _update(self, state, action, reward, next_state, absorbing): theta += alpha * delta * self.e self.Q.set_weights(theta) - def episode_start(self): + def episode_start(self, episode_info): self.e = np.zeros(self.Q.weights_size) - super().episode_start() + super().episode_start(episode_info) diff --git a/mushroom_rl/algorithms/value/td/true_online_sarsa_lambda.py b/mushroom_rl/algorithms/value/td/true_online_sarsa_lambda.py index 509d24d66..cf354e0f1 100644 --- a/mushroom_rl/algorithms/value/td/true_online_sarsa_lambda.py +++ b/mushroom_rl/algorithms/value/td/true_online_sarsa_lambda.py @@ -63,8 +63,8 @@ def _update(self, state, action, reward, next_state, absorbing): self._q_old = q_next - def episode_start(self): + def episode_start(self, episode_info): self._q_old = None self.e = np.zeros(self.Q.weights_size) - super().episode_start() + super().episode_start(episode_info) diff --git a/mushroom_rl/approximators/parametric/cmac.py b/mushroom_rl/approximators/parametric/cmac.py index f0af55ee9..1d4fea2d9 100644 --- a/mushroom_rl/approximators/parametric/cmac.py +++ b/mushroom_rl/approximators/parametric/cmac.py @@ -16,12 +16,9 @@ def __init__(self, tilings, weights=None, output_shape=(1,), **kwargs): Args: tilings (list): list of tilings to discretize the input space. - weights (np.ndarray): array of weights to initialize the weights - of the approximator; - input_shape (np.ndarray, None): the shape of the input of the - model; - output_shape (np.ndarray, (1,)): the shape of the output of the - model; + weights (np.ndarray): array of weights to initialize the weights of the approximator; + input_shape (np.ndarray, None): the shape of the input of the model; + output_shape (np.ndarray, (1,)): the shape of the output of the model; **kwargs: other params of the approximator. """ @@ -40,8 +37,7 @@ def fit(self, x, y, alpha=1.0, **kwargs): x (np.ndarray): input; y (np.ndarray): target; alpha (float): learning rate; - **kwargs: other parameters used by the fit method of the - regressor. + **kwargs: other parameters used by the fit method of the regressor. """ y_hat = self.predict(x) @@ -64,8 +60,7 @@ def predict(self, x, **predict_params): Args: x (np.ndarray): input; - **predict_params: other parameters used by the predict method - the regressor. + **predict_params: other parameters used by the predict method the regressor. Returns: The predictions of the model. @@ -84,16 +79,14 @@ def predict(self, x, **predict_params): def diff(self, state, action=None): """ - Compute the derivative of the output w.r.t. ``state``, and ``action`` - if provided. + Compute the derivative of the output w.r.t. ``state``, and ``action`` if provided. 
         Args:
             state (np.ndarray): the state;
             action (np.ndarray, None): the action.
 
         Returns:
-            The derivative of the output w.r.t. ``state``, and ``action``
-            if provided.
+            The derivative of the output w.r.t. ``state``, and ``action`` if provided.
 
         """
diff --git a/mushroom_rl/approximators/parametric/linear.py b/mushroom_rl/approximators/parametric/linear.py
index 942a8c5bc..004c7988e 100644
--- a/mushroom_rl/approximators/parametric/linear.py
+++ b/mushroom_rl/approximators/parametric/linear.py
@@ -14,12 +14,9 @@ def __init__(self, weights=None, input_shape=None, output_shape=(1,),
         Constructor.
 
         Args:
-            weights (np.ndarray): array of weights to initialize the weights
-                of the approximator;
-            input_shape (np.ndarray, None): the shape of the input of the
-                model;
-            output_shape (np.ndarray, (1,)): the shape of the output of the
-                model;
+            weights (np.ndarray): array of weights to initialize the weights of the approximator;
+            input_shape (np.ndarray, None): the shape of the input of the model;
+            output_shape (np.ndarray, (1,)): the shape of the output of the model;
             **kwargs: other params of the approximator.
 
         """
@@ -45,8 +42,7 @@ def fit(self, x, y, **fit_params):
         Args:
             x (np.ndarray): input;
             y (np.ndarray): target;
-            **fit_params: other parameters used by the fit method of the
-                regressor.
+            **fit_params: other parameters used by the fit method of the regressor.
 
         """
         self._w = np.atleast_2d(np.linalg.pinv(x).dot(y).T)
@@ -57,8 +53,7 @@ def predict(self, x, **predict_params):
 
         Args:
             x (np.ndarray): input;
-            **predict_params: other parameters used by the predict method
-                the regressor.
+            **predict_params: other parameters used by the predict method of the regressor.
 
         Returns:
             The predictions of the model.
@@ -101,8 +96,7 @@ def set_weights(self, w):
 
     def diff(self, state, action=None):
         """
-        Compute the derivative of the output w.r.t. ``state``, and ``action``
-        if provided.
+        Compute the derivative of the output w.r.t. ``state``, and ``action`` if provided.
 
         Args:
             state (np.ndarray): the state;
diff --git a/mushroom_rl/approximators/parametric/torch_approximator.py b/mushroom_rl/approximators/parametric/torch_approximator.py
index 51a896451..b3a5e4c31 100644
--- a/mushroom_rl/approximators/parametric/torch_approximator.py
+++ b/mushroom_rl/approximators/parametric/torch_approximator.py
@@ -10,9 +10,8 @@ class TorchApproximator(Serializable):
     """
     Class to interface a pytorch model to the mushroom Regressor interface.
-    This class implements all is needed to use a generic pytorch model and train
-    it using a specified optimizer and objective function.
-    This class supports also minibatches.
+    This class implements all that is needed to use a generic pytorch model and train it using a specified optimizer
+    and objective function. This class also supports minibatches.
     """
 
     def __init__(self, input_shape, output_shape, network, optimizer=None,
diff --git a/mushroom_rl/core/agent.py b/mushroom_rl/core/agent.py
index 02f1252b8..f323b5845 100644
--- a/mushroom_rl/core/agent.py
+++ b/mushroom_rl/core/agent.py
@@ -79,14 +79,15 @@ def draw_action(self, state):
             action = self.next_action
             self.next_action = None
 
-        return action
+        return self._convert_to_env_backend(action)
 
-        #return self._convert_to_env_backend(action)
-
-    def episode_start(self):
+    def episode_start(self, episode_info):
         """
         Called by the agent when a new episode starts.
 
+        Args:
+            episode_info (dict): a dictionary containing the information at reset, such as context.
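For illustration, a minimal sketch (not part of this patch) of how an agent can use the new argument; the 'context' key below is hypothetical and is only present if the environment puts it in the dictionary returned by reset():

    from mushroom_rl.core import Agent

    class ContextAwareAgent(Agent):
        # Toy agent that stores the reset information provided by the environment.
        def __init__(self, mdp_info, policy):
            super().__init__(mdp_info, policy)
            self._context = None

        def fit(self, dataset, **info):
            pass  # no learning in this sketch

        def episode_start(self, episode_info):
            # Read an optional context variable before delegating to the base
            # class, which resets the policy.
            self._context = episode_info.get('context')
            super().episode_start(episode_info)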
+ """ self.policy.reset() diff --git a/mushroom_rl/core/core.py b/mushroom_rl/core/core.py index b29baa892..1b1a11b1b 100644 --- a/mushroom_rl/core/core.py +++ b/mushroom_rl/core/core.py @@ -78,8 +78,7 @@ def learn(self, n_steps=None, n_episodes=None, n_steps_per_fit=None, self._run(dataset, n_steps, n_episodes, fit_condition, render, quiet, record) - def evaluate(self, initial_states=None, n_steps=None, n_episodes=None, - render=False, quiet=False, record=False): + def evaluate(self, initial_states=None, n_steps=None, n_episodes=None, render=False, quiet=False, record=False): """ This function moves the agent in the environment using its policy. The agent is moved for a provided number of steps, episodes, or from a set of initial states for the whole @@ -221,9 +220,10 @@ def reset(self, initial_states=None): else: initial_state = initial_states[self._total_episodes_counter] - self.agent.episode_start() - - self._state = self._preprocess(self.mdp.reset(initial_state)) + state, episode_info = self.mdp.reset(initial_state) + self.agent.episode_start(episode_info) + + self._state = self._preprocess(state) self.agent.next_action = None self._episode_steps = 0 diff --git a/mushroom_rl/core/dataset.py b/mushroom_rl/core/dataset.py index 8c62ce819..390783612 100644 --- a/mushroom_rl/core/dataset.py +++ b/mushroom_rl/core/dataset.py @@ -27,6 +27,7 @@ def __init__(self, mdp_info, n_steps=None, n_episodes=None): action_type = mdp_info.action_space.data_type self._info = defaultdict(list) + self._episode_info = defaultdict(list) if mdp_info.backend == 'numpy': self._data = NumpyDataset(state_type, state_shape, action_type, action_shape, reward_shape) @@ -41,14 +42,15 @@ def __init__(self, mdp_info, n_steps=None, n_episodes=None): self._add_save_attr( _info='pickle', + _episode_info='pickle', _data='mushroom', _converter='primitive', _gamma='primitive' ) @classmethod - def from_array(cls, states, actions, rewards, next_states, absorbings, lasts, info=None, gamma=0.99, - backend='numpy'): + def from_array(cls, states, actions, rewards, next_states, absorbings, lasts, info=None, episode_info=None, + gamma=0.99, backend='numpy'): """ Creates a dataset of transitions from the provided arrays. 
@@ -74,6 +76,11 @@ def from_array(cls, states, actions, rewards, next_states, absorbings, lasts, in else: dataset._info = info.copy() + if episode_info is None: + dataset._episode_info = defaultdict(list) + else: + dataset._episode_info = episode_info.copy() + if backend == 'numpy': dataset._data = NumpyDataset.from_array(states, actions, rewards, next_states, absorbings, lasts) dataset._converter = NumpyConversion @@ -86,6 +93,7 @@ def from_array(cls, states, actions, rewards, next_states, absorbings, lasts, in dataset._add_save_attr( _info='pickle', + _episode_info='pickle', _data='mushroom', _converter='primitive', _gamma='primitive' @@ -95,7 +103,10 @@ def from_array(cls, states, actions, rewards, next_states, absorbings, lasts, in def append(self, step, info): self._data.append(*step[:6]) - self._append_info(info) + self._append_info(self._info, info) + + def append_episode_info(self, info): + self._append_info(self._episode_info, info) def get_info(self, field, index=None): if index is None: @@ -115,6 +126,7 @@ def get_view(self, index): info_slice[key] = self._info[key][index] dataset._info = info_slice + dataset._episode_info = defaultdict(list) dataset._data = self._data.get_view(index) return dataset @@ -134,11 +146,11 @@ def __getitem__(self, index): def __add__(self, other): result = self.copy() - new_info = defaultdict(list) - for key in self._info.keys(): - new_info[key] = self._info[key] + other.info[key] + new_info = self._merge_info(result.info, other.info) + new_episode_info = self._merge_info(result.episode_info, other.episode_info) result._info = new_info + result._episode_info = new_episode_info result._data = self._data + other._data return result @@ -174,6 +186,10 @@ def last(self): def info(self): return self._info + @property + def episode_info(self): + return self._episode_info + @property def episodes_length(self): """ @@ -335,9 +351,14 @@ def compute_metrics(self, gamma=1.): else: return 0, 0, 0, 0, 0 - def _append_info(self, step_info): + @staticmethod + def _append_info(info, step_info): for key, value in step_info.items(): - self._info[key].append(value) - - + info[key].append(value) + @staticmethod + def _merge_info(info, other_info): + new_info = defaultdict(list) + for key in info.keys(): + new_info[key] = info[key] + other_info[key] + return new_info diff --git a/mushroom_rl/core/environment.py b/mushroom_rl/core/environment.py index e666585c9..1f83ce585 100644 --- a/mushroom_rl/core/environment.py +++ b/mushroom_rl/core/environment.py @@ -148,7 +148,7 @@ def reset(self, state=None): state (np.ndarray, None): the state to set to the current state. Returns: - The current state. + The current state and a dictionary containing the info for the episode. 
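As a companion sketch (again not part of the patch; the toy environment below is invented and the exact MDPInfo constructor arguments may differ slightly between versions), the updated reset contract returns the initial state together with an info dictionary, which the Core forwards to Agent.episode_start:

    import numpy as np

    from mushroom_rl.core import Environment, MDPInfo
    from mushroom_rl.utils.spaces import Box

    class ToyEnv(Environment):
        def __init__(self):
            mdp_info = MDPInfo(Box(-1., 1., (1,)), Box(-1., 1., (1,)), 0.99, 100)
            super().__init__(mdp_info)
            self._state = np.zeros(1)

        def reset(self, state=None):
            self._state = np.zeros(1) if state is None else state
            # The second element is the episode info dictionary; it can expose
            # context variables, or stay empty as in the built-in environments.
            return self._state, {'context': 'easy'}

        def step(self, action):
            self._state = np.clip(self._state + action, -1., 1.)
            return self._state, -np.abs(self._state[0]), False, {}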
""" raise NotImplementedError diff --git a/mushroom_rl/environments/atari.py b/mushroom_rl/environments/atari.py index c78d9f2bb..15e778eb7 100644 --- a/mushroom_rl/environments/atari.py +++ b/mushroom_rl/environments/atari.py @@ -109,7 +109,7 @@ def reset(self, state=None): self._current_no_op = np.random.randint(self._max_no_op_actions + 1) - return LazyFrames(list(self._state), self._history_length) + return LazyFrames(list(self._state), self._history_length), {} def step(self, action): action = action[0] diff --git a/mushroom_rl/environments/car_on_hill.py b/mushroom_rl/environments/car_on_hill.py index e1547ee43..f03368426 100644 --- a/mushroom_rl/environments/car_on_hill.py +++ b/mushroom_rl/environments/car_on_hill.py @@ -46,7 +46,7 @@ def reset(self, state=None): else: self._state = state - return self._state + return self._state, {} def step(self, action): action = self._discrete_actions[action[0]] diff --git a/mushroom_rl/environments/cart_pole.py b/mushroom_rl/environments/cart_pole.py index ed1b99b05..0dd934516 100644 --- a/mushroom_rl/environments/cart_pole.py +++ b/mushroom_rl/environments/cart_pole.py @@ -63,7 +63,7 @@ def reset(self, state=None): self._state[0] = normalize_angle(self._state[0]) self._last_u = 0 - return self._state + return self._state, {} def step(self, action): if action == 0: diff --git a/mushroom_rl/environments/dm_control_env.py b/mushroom_rl/environments/dm_control_env.py index 030d4977d..c5844f68d 100644 --- a/mushroom_rl/environments/dm_control_env.py +++ b/mushroom_rl/environments/dm_control_env.py @@ -79,7 +79,7 @@ def reset(self, state=None): else: raise NotImplementedError - return self._state + return self._state, {} def step(self, action): step = self.env.step(action) diff --git a/mushroom_rl/environments/finite_mdp.py b/mushroom_rl/environments/finite_mdp.py index b49995c00..3296e98dc 100644 --- a/mushroom_rl/environments/finite_mdp.py +++ b/mushroom_rl/environments/finite_mdp.py @@ -49,7 +49,7 @@ def reset(self, state=None): else: self._state = state - return self._state + return self._state, {} def step(self, action): p = self.p[self._state[0], action[0], :] diff --git a/mushroom_rl/environments/grid_world.py b/mushroom_rl/environments/grid_world.py index b45ff5a62..923440c0c 100644 --- a/mushroom_rl/environments/grid_world.py +++ b/mushroom_rl/environments/grid_world.py @@ -42,7 +42,7 @@ def reset(self, state=None): self._state = state - return self._state + return self._state, {} def step(self, action): state = self.convert_to_grid(self._state, self._width) diff --git a/mushroom_rl/environments/gym_env.py b/mushroom_rl/environments/gym_env.py index 66f3e12bf..aa4015ad1 100644 --- a/mushroom_rl/environments/gym_env.py +++ b/mushroom_rl/environments/gym_env.py @@ -83,12 +83,12 @@ def __init__(self, name, horizon=None, gamma=0.99, wrappers=None, wrappers_args= def reset(self, state=None): if state is None: - return np.atleast_1d(self.env.reset()) + return np.atleast_1d(self.env.reset()), {} else: self.env.reset() self.env.state = state - return np.atleast_1d(state) + return np.atleast_1d(state), {} def step(self, action): action = self._convert_action(action) diff --git a/mushroom_rl/environments/habitat_env.py b/mushroom_rl/environments/habitat_env.py index 685550517..f52c0cb3d 100644 --- a/mushroom_rl/environments/habitat_env.py +++ b/mushroom_rl/environments/habitat_env.py @@ -251,7 +251,7 @@ def __init__(self, wrapper, config_file, base_config_file=None, horizon=None, ga def reset(self, state=None): assert state is None, 'Cannot set 
Habitat state' obs = self._convert_observation(np.atleast_1d(self.env.reset())) - return obs + return obs, {} def step(self, action): action = self._convert_action(action) diff --git a/mushroom_rl/environments/igibson_env.py b/mushroom_rl/environments/igibson_env.py index a9d7b80e6..3293919fc 100644 --- a/mushroom_rl/environments/igibson_env.py +++ b/mushroom_rl/environments/igibson_env.py @@ -113,7 +113,7 @@ def __init__(self, config_file, horizon=None, gamma=0.99, is_discrete=False, def reset(self, state=None): assert state is None, 'Cannot set iGibson state' - return self._convert_observation(np.atleast_1d(self.env.reset())) + return self._convert_observation(np.atleast_1d(self.env.reset())), {} def step(self, action): action = self._convert_action(action) diff --git a/mushroom_rl/environments/inverted_pendulum.py b/mushroom_rl/environments/inverted_pendulum.py index 97cc34bf7..9f7f773f0 100644 --- a/mushroom_rl/environments/inverted_pendulum.py +++ b/mushroom_rl/environments/inverted_pendulum.py @@ -68,7 +68,7 @@ def reset(self, state=None): self._state[1] = self._bound(self._state[1], -self._max_omega, self._max_omega) self._last_u = 0.0 - return self._state + return self._state, {} def step(self, action): u = self._bound(action[0], -self._max_u, self._max_u) diff --git a/mushroom_rl/environments/lqr.py b/mushroom_rl/environments/lqr.py index 528d9a5b0..8b7fb4046 100644 --- a/mushroom_rl/environments/lqr.py +++ b/mushroom_rl/environments/lqr.py @@ -124,7 +124,7 @@ def reset(self, state=None): else: self._state = state - return self._state + return self._state, {} def step(self, action): x = self._state diff --git a/mushroom_rl/environments/minigrid_env.py b/mushroom_rl/environments/minigrid_env.py index dcf549704..daf688f05 100644 --- a/mushroom_rl/environments/minigrid_env.py +++ b/mushroom_rl/environments/minigrid_env.py @@ -82,7 +82,7 @@ def reset(self, state=None): self._state) for _ in range(self._history_length)], maxlen=self._history_length ) - return LazyFrames(list(self._state), self._history_length) + return LazyFrames(list(self._state), self._history_length), {} def step(self, action): obs, reward, absorbing, info = self.env.step(action) diff --git a/mushroom_rl/environments/mujoco.py b/mushroom_rl/environments/mujoco.py index 2e9ae0457..c68f095a1 100644 --- a/mushroom_rl/environments/mujoco.py +++ b/mushroom_rl/environments/mujoco.py @@ -127,7 +127,7 @@ def reset(self, obs=None): self.setup(obs) self._obs = self._create_observation(self.obs_helper._build_obs(self._data)) - return self._modify_observation(self._obs) + return self._modify_observation(self._obs), {} def step(self, action): cur_obs = self._obs.copy() diff --git a/mushroom_rl/environments/puddle_world.py b/mushroom_rl/environments/puddle_world.py index ce07dbb72..2f12360d1 100644 --- a/mushroom_rl/environments/puddle_world.py +++ b/mushroom_rl/environments/puddle_world.py @@ -68,7 +68,7 @@ def reset(self, state=None): else: self._state = state - return self._state + return self._state, {} def step(self, action): idx = action[0] diff --git a/mushroom_rl/environments/pybullet.py b/mushroom_rl/environments/pybullet.py index 11931ae88..8bb276b8e 100644 --- a/mushroom_rl/environments/pybullet.py +++ b/mushroom_rl/environments/pybullet.py @@ -93,7 +93,7 @@ def reset(self, state=None): self._state = self._indexer.create_sim_state() observation = self._create_observation(self._state) - return observation + return observation, {} def render(self, record=False): frame = self._viewer.display() diff --git 
a/mushroom_rl/environments/segway.py b/mushroom_rl/environments/segway.py index 53be85bf1..bba32c6f6 100644 --- a/mushroom_rl/environments/segway.py +++ b/mushroom_rl/environments/segway.py @@ -66,7 +66,7 @@ def reset(self, state=None): self._last_x = 0 - return self._state + return self._state, {} def step(self, action): u = self._bound(action[0], -self._max_u, self._max_u) diff --git a/mushroom_rl/environments/ship_steering.py b/mushroom_rl/environments/ship_steering.py index ccf09a414..4502f4c8a 100644 --- a/mushroom_rl/environments/ship_steering.py +++ b/mushroom_rl/environments/ship_steering.py @@ -67,7 +67,7 @@ def reset(self, state=None): else: self._state = state - return self._state + return self._state, {} def step(self, action): diff --git a/tests/environments/mujoco_envs/test_ball_in_a_cup.py b/tests/environments/mujoco_envs/test_ball_in_a_cup.py index fd5eebc60..c20982e1b 100644 --- a/tests/environments/mujoco_envs/test_ball_in_a_cup.py +++ b/tests/environments/mujoco_envs/test_ball_in_a_cup.py @@ -14,10 +14,10 @@ def test_ball_in_a_cup(): p_gains = np.array([200, 300, 100, 100, 10, 10, 2.5])/5 d_gains = np.array([7, 15, 5, 2.5, 0.3, 0.3, 0.05])/10 - obs_0 = env.reset() + obs_0, _ = env.reset() for _ in [1,2]: - obs = env.reset() + obs, _ = env.reset() assert np.array_equal(obs, obs_0) done = False From 707adc6d995cbdf5167e8e2189805b2153f69050 Mon Sep 17 00:00:00 2001 From: boris-il-forte Date: Tue, 17 Oct 2023 19:57:35 +0200 Subject: [PATCH 16/24] Added support for vectorial policies - added policy internal state to the datasets classes - fixed some bugs of dataset implementation - updated policies to support internal state shape --- .../actor_critic/deep_actor_critic/sac.py | 5 +- mushroom_rl/core/_impl/list_dataset.py | 30 ++++++-- mushroom_rl/core/_impl/numpy_dataset.py | 74 +++++++++++++++---- mushroom_rl/core/_impl/torch_dataset.py | 58 ++++++++++++++- mushroom_rl/core/core.py | 15 +++- mushroom_rl/core/dataset.py | 46 +++++++++--- mushroom_rl/policy/deterministic_policy.py | 8 +- mushroom_rl/policy/gaussian_policy.py | 36 +++++---- mushroom_rl/policy/noise_policy.py | 8 +- mushroom_rl/policy/policy.py | 28 +++++++ mushroom_rl/policy/promps.py | 15 +++- mushroom_rl/policy/td_policy.py | 16 ++-- mushroom_rl/policy/torch_policy.py | 14 ++-- tests/core/test_core.py | 1 + 14 files changed, 277 insertions(+), 77 deletions(-) diff --git a/mushroom_rl/algorithms/actor_critic/deep_actor_critic/sac.py b/mushroom_rl/algorithms/actor_critic/deep_actor_critic/sac.py index b4236f2b3..b2b0447e2 100644 --- a/mushroom_rl/algorithms/actor_critic/deep_actor_critic/sac.py +++ b/mushroom_rl/algorithms/actor_critic/deep_actor_critic/sac.py @@ -22,7 +22,8 @@ class SACPolicy(Policy): compute_action_and_log_prob_t methods, that are fundamental for the internals calculations of the SAC algorithm. """ - def __init__(self, mu_approximator, sigma_approximator, min_a, max_a, log_std_min, log_std_max): + def __init__(self, mu_approximator, sigma_approximator, min_a, max_a, log_std_min, log_std_max, + policy_state_shape=None): """ Constructor. @@ -35,6 +36,8 @@ def __init__(self, mu_approximator, sigma_approximator, min_a, max_a, log_std_mi log_std_max ([float, Parameter]): max value for the policy log std. 
""" + super().__init__(policy_state_shape) + self._mu_approximator = mu_approximator self._sigma_approximator = sigma_approximator diff --git a/mushroom_rl/core/_impl/list_dataset.py b/mushroom_rl/core/_impl/list_dataset.py index 60a3b7bd1..4455c02d0 100644 --- a/mushroom_rl/core/_impl/list_dataset.py +++ b/mushroom_rl/core/_impl/list_dataset.py @@ -8,19 +8,26 @@ class ListDataset(Serializable): def __init__(self): self._dataset = list() + self._policy_dataset = list() self._add_save_attr( _dataset='pickle' ) @classmethod - def from_array(cls, states, actions, rewards, next_states, absorbings, lasts): + def from_array(cls, states, actions, rewards, next_states, absorbings, lasts, policy_states=None, + policy_next_states=None): dataset = cls() - for s, a, r, ss, ab, last in zip(states, actions, rewards, next_states, - absorbings.astype(bool), lasts.astype(bool) - ): - dataset.append(s, a, r.item(), ss, ab.item(), last.item()) + if policy_states is None: + for s, a, r, ss, ab, last in zip(states, actions, rewards, next_states, + absorbings.astype(bool), lasts.astype(bool)): + dataset.append(s, a, r.item(), ss, ab.item(), last.item()) + else: + for s, a, r, ss, ab, last, ps, pss in zip(states, actions, rewards, next_states, + absorbings.astype(bool), lasts.astype(bool), + policy_states, policy_next_states): + dataset.append(s, a, r.item(), ss, ab.item(), last.item(), ps.item(), pss.item()) return dataset @@ -28,9 +35,10 @@ def __len__(self): return len(self._dataset) def append(self, *step): - assert len(step) == 6 step_copy = deepcopy(step) - self._dataset.append(step_copy) + self._dataset.append(step_copy[:6]) + if len(step_copy) == 8: + self._policy_dataset.append(step_copy[6:]) def clear(self): self._dataset = list() @@ -77,3 +85,11 @@ def absorbing(self): @property def last(self): return [step[5] for step in self._dataset] + + @property + def policy_state(self): + return [step[6] for step in self._dataset] + + @property + def policy_next_state(self): + return [step[7] for step in self._dataset] diff --git a/mushroom_rl/core/_impl/numpy_dataset.py b/mushroom_rl/core/_impl/numpy_dataset.py index 513a68b0f..a00b7a04f 100644 --- a/mushroom_rl/core/_impl/numpy_dataset.py +++ b/mushroom_rl/core/_impl/numpy_dataset.py @@ -4,8 +4,8 @@ class NumpyDataset(Serializable): - def __init__(self, state_type, state_shape, action_type, action_shape, reward_shape): - flags_shape = (action_shape[0],) + def __init__(self, state_type, state_shape, action_type, action_shape, reward_shape, policy_state_shape): + flags_len = action_shape[0] self._state_type = state_type self._action_type = action_type @@ -14,10 +14,17 @@ def __init__(self, state_type, state_shape, action_type, action_shape, reward_sh self._actions = np.empty(action_shape, dtype=self._action_type) self._rewards = np.empty(reward_shape, dtype=float) self._next_states = np.empty(state_shape, dtype=self._state_type) - self._absorbing = np.empty(flags_shape, dtype=bool) - self._last = np.empty(flags_shape, dtype=bool) + self._absorbing = np.empty(flags_len, dtype=bool) + self._last = np.empty(flags_len, dtype=bool) self._len = 0 + if policy_state_shape is None: + self._policy_states = None + self._policy_next_states = None + else: + self._policy_states = np.empty(policy_state_shape, dtype=float) + self._policy_next_states = np.empty(policy_state_shape, dtype=float) + self._add_save_attr( _state_type='primitive', _action_type='primitive', @@ -27,16 +34,19 @@ def __init__(self, state_type, state_shape, action_type, action_shape, reward_sh 
_next_states='numpy', _absorbing='numpy', _last='numpy', + _policy_states='numpy', + _policy_next_states='numpy', _len='primitive' ) @classmethod - def from_array(cls, states, actions, rewards, next_states, absorbings, lasts): + def from_array(cls, states, actions, rewards, next_states, absorbings, lasts, + policy_states=None, policy_next_states=None): if not isinstance(states, np.ndarray): states = states.numpy() - actions = states.numpy() - rewards = states.numpy() - next_states = states.numpy() + actions = actions.numpy() + rewards = rewards.numpy() + next_states = next_states.numpy() absorbings = absorbings.numpy() lasts = lasts.numpy() @@ -53,6 +63,14 @@ def from_array(cls, states, actions, rewards, next_states, absorbings, lasts): dataset._last = lasts dataset._len = len(lasts) + if policy_states is not None and policy_next_states is not None: + if not isinstance(policy_states, np.ndarray): + policy_states = policy_states.numpy() + policy_next_states = policy_next_states.numpy() + + dataset._policy_states = policy_states + dataset._policy_next_states = policy_next_states + dataset._add_save_attr( _state_type='primitive', _action_type='primitive', @@ -62,6 +80,8 @@ def from_array(cls, states, actions, rewards, next_states, absorbings, lasts): _next_states='numpy', _absorbing='numpy', _last='numpy', + _policy_states='numpy', + _policy_next_states='numpy', _len='primitive' ) @@ -70,7 +90,7 @@ def from_array(cls, states, actions, rewards, next_states, absorbings, lasts): def __len__(self): return self._len - def append(self, state, action, reward, next_state, absorbing, last): + def append(self, state, action, reward, next_state, absorbing, last, policy_state=None, policy_next_state=None): i = self._len self._states[i] = state @@ -80,6 +100,10 @@ def append(self, state, action, reward, next_state, absorbing, last): self._absorbing[i] = absorbing self._last[i] = last + if policy_state is not None: + self._policy_states[i] = policy_state + self._policy_next_states[i] = policy_next_state + self._len += 1 def clear(self): @@ -90,19 +114,27 @@ def clear(self): self._absorbing = np.empty_like(self._absorbing) self._last = np.empty_like(self._last) + if self._policy_states is not None: + self._policy_states = np.empty_like(self._policy_states) + self._policy_next_states = np.empty_like(self._policy_next_states) + self._len = 0 def get_view(self, index): view = self.copy() - view._states = self._states[index, ...] - view._actions = self._actions[index, ...] - view._rewards = self._rewards[index, ...] - view._next_states = self._next_states[index, ...] - view._absorbing = self._absorbing[index, ...] - view._last = self._last[index, ...] + view._states = self.state[index, ...] + view._actions = self.action[index, ...] + view._rewards = self.reward[index, ...] + view._next_states = self.next_state[index, ...] + view._absorbing = self.absorbing[index, ...] + view._last = self.last[index, ...] view._len = view._states.shape[0] + if self._policy_states is not None: + view._policy_states = self._policy_states[index, ...] + view._policy_next_states = self._policy_next_states[index, ...] 
+ return view def __getitem__(self, index): @@ -120,6 +152,10 @@ def __add__(self, other): result._last = np.concatenate((self.last, other.last)) result._len = len(self) + len(other) + if result._policy_states is not None: + result._policy_states = np.concatenate((self.policy_state, other.policy_state)) + result._policy_next_states = np.concatenate((self.policy_next_state, other.policy_next_state)) + return result @property @@ -145,3 +181,11 @@ def absorbing(self): @property def last(self): return self._last[:len(self)] + + @property + def policy_state(self): + return self._policy_states[:len(self)] + + @property + def policy_next_state(self): + return self._policy_next_states[:len(self)] diff --git a/mushroom_rl/core/_impl/torch_dataset.py b/mushroom_rl/core/_impl/torch_dataset.py index d304ae9c4..fe9ab683a 100644 --- a/mushroom_rl/core/_impl/torch_dataset.py +++ b/mushroom_rl/core/_impl/torch_dataset.py @@ -4,7 +4,7 @@ class TorchDataset(Serializable): - def __init__(self, state_type, state_shape, action_type, action_shape, reward_shape): + def __init__(self, state_type, state_shape, action_type, action_shape, reward_shape, policy_state_shape): flags_len = action_shape[0] self._state_type = state_type @@ -18,6 +18,13 @@ def __init__(self, state_type, state_shape, action_type, action_shape, reward_sh self._last = torch.empty(flags_len, dtype=torch.bool) self._len = 0 + if policy_state_shape is None: + self._policy_states = None + self._policy_next_states = None + else: + self._policy_states = torch.empty(policy_state_shape, dtype=torch.float) + self._policy_next_states = torch.empty(policy_state_shape, dtype=torch.float) + self._add_save_attr( _state_type='primitive', _action_type='primitive', @@ -27,11 +34,22 @@ def __init__(self, state_type, state_shape, action_type, action_shape, reward_sh _next_states='torch', _absorbing='torch', _last='torch', + _policy_states='numpy', + _policy_next_states='numpy', _len='primitive' ) @classmethod - def from_array(cls, states, actions, rewards, next_states, absorbings, lasts): + def from_array(cls, states, actions, rewards, next_states, absorbings, lasts, + policy_states=None, policy_next_states=None): + if not isinstance(states, torch.Tensor): + states = torch.as_tensor(states) + actions = torch.as_tensor(actions) + rewards = torch.as_tensor(rewards) + next_states = torch.as_tensor(next_states) + absorbings = torch.as_tensor(absorbings) + lasts = torch.as_tensor(lasts) + dataset = cls.__new__(cls) dataset._state_type = states.dtype @@ -45,6 +63,14 @@ def from_array(cls, states, actions, rewards, next_states, absorbings, lasts): dataset._last = torch.as_tensor(lasts, dtype=torch.bool) dataset._len = len(lasts) + if policy_states is not None and policy_next_states is not None: + if not isinstance(policy_states, torch.Tensor): + policy_states = torch.as_tensor(policy_states) + policy_next_states = torch.as_tensor(policy_next_states) + + dataset._policy_states = policy_states + dataset._policy_next_states = policy_next_states + dataset._add_save_attr( _state_type='primitive', _action_type='primitive', @@ -54,6 +80,8 @@ def from_array(cls, states, actions, rewards, next_states, absorbings, lasts): _next_states='torch', _absorbing='torch', _last='torch', + _policy_states='numpy', + _policy_next_states='numpy', _len='primitive' ) @@ -62,7 +90,7 @@ def from_array(cls, states, actions, rewards, next_states, absorbings, lasts): def __len__(self): return self._len - def append(self, state, action, reward, next_state, absorbing, last): + def append(self, state, 
action, reward, next_state, absorbing, last, policy_state=None, policy_next_state=None): i = self._len self._states[i] = state @@ -72,6 +100,10 @@ def append(self, state, action, reward, next_state, absorbing, last): self._absorbing[i] = absorbing self._last[i] = last + if policy_state is not None: + self._policy_states[i] = policy_state + self._policy_next_states[i] = policy_next_state + self._len += 1 def clear(self): @@ -82,6 +114,10 @@ def clear(self): self._absorbing = torch.empty_like(self._absorbing) self._last = torch.empty_like(self._last) + if self._policy_states is not None: + self._policy_states = torch.empty_like(self._policy_states) + self._policy_next_states = torch.empty_like(self._policy_next_states) + self._len = 0 def get_view(self, index): @@ -95,6 +131,10 @@ def get_view(self, index): view._last = self._last[index, ...] view._len = view._states.shape[0] + if self._policy_states is not None: + view._policy_states = self._policy_states[index, ...] + view._policy_next_states = self._policy_next_states[index, ...] + return view def __getitem__(self, index): @@ -112,6 +152,10 @@ def __add__(self, other): result._last = torch.concatenate((self.last, other.last)) result._len = len(self) + len(other) + if result._policy_states is not None: + result._policy_states = torch.concatenate((self.policy_state, other.policy_state)) + result._policy_next_states = torch.concatenate((self.policy_next_state, other.policy_next_state)) + return result @property @@ -137,3 +181,11 @@ def absorbing(self): @property def last(self): return self._last[:len(self)] + + @property + def policy_state(self): + return self._policy_states[:len(self)] + + @property + def policy_next_state(self): + return self._policy_next_states[:len(self)] diff --git a/mushroom_rl/core/core.py b/mushroom_rl/core/core.py index 1b1a11b1b..c8e9c150c 100644 --- a/mushroom_rl/core/core.py +++ b/mushroom_rl/core/core.py @@ -74,7 +74,8 @@ def learn(self, n_steps=None, n_episodes=None, n_steps_per_fit=None, else: fit_condition = lambda: self._current_episodes_counter >= self._n_episodes_per_fit - dataset = Dataset(self.mdp.info, self._n_steps_per_fit, self._n_episodes_per_fit) + dataset = Dataset(self.mdp.info, self.agent.policy.policy_state_shape, + self._n_steps_per_fit, self._n_episodes_per_fit) self._run(dataset, n_steps, n_episodes, fit_condition, render, quiet, record) @@ -103,7 +104,7 @@ def evaluate(self, initial_states=None, n_steps=None, n_episodes=None, render=Fa fit_condition = lambda: False n_episodes_dataset = len(initial_states) if initial_states is not None else n_episodes - dataset = Dataset(self.mdp.info, n_steps, n_episodes_dataset) + dataset = Dataset(self.mdp.info, self.agent.policy.policy_state_shape, n_steps, n_episodes_dataset) return self._run(dataset, n_steps, n_episodes, fit_condition, render, quiet, record, initial_states) @@ -141,16 +142,22 @@ def _run_impl(self, dataset, move_condition, fit_condition, steps_progress_bar, while move_condition(): if last: self.reset(initial_states) + if self.agent.policy.is_stateful: + policy_state = self.agent.policy.get_policy_state() sample, step_info = self._step(render, record) + if self.agent.policy.is_stateful: + policy_next_state = self.agent.policy.get_policy_state() + sample += (policy_state, policy_next_state) + self.callback_step(sample) self._total_steps_counter += 1 self._current_steps_counter += 1 steps_progress_bar.update(1) - if sample[-1]: + if sample[5]: self._total_episodes_counter += 1 self._current_episodes_counter += 1 
episodes_progress_bar.update(1) @@ -167,7 +174,7 @@ def _run_impl(self, dataset, move_condition, fit_condition, steps_progress_bar, dataset.clear() - last = sample[-1] + last = sample[5] self.agent.stop() self.mdp.stop() diff --git a/mushroom_rl/core/dataset.py b/mushroom_rl/core/dataset.py index 390783612..c5317ec23 100644 --- a/mushroom_rl/core/dataset.py +++ b/mushroom_rl/core/dataset.py @@ -8,7 +8,7 @@ class Dataset(Serializable): - def __init__(self, mdp_info, n_steps=None, n_episodes=None): + def __init__(self, mdp_info, policy_state_shape, n_steps=None, n_episodes=None): assert (n_steps is not None and n_episodes is None) or (n_steps is None and n_episodes is not None) if n_steps is not None: @@ -23,6 +23,9 @@ def __init__(self, mdp_info, n_steps=None, n_episodes=None): action_shape = (n_samples,) + mdp_info.action_space.shape reward_shape = (n_samples,) + if policy_state_shape is not None: + policy_state_shape = (n_samples,) + policy_state_shape + state_type = mdp_info.observation_space.data_type action_type = mdp_info.action_space.data_type @@ -30,9 +33,11 @@ def __init__(self, mdp_info, n_steps=None, n_episodes=None): self._episode_info = defaultdict(list) if mdp_info.backend == 'numpy': - self._data = NumpyDataset(state_type, state_shape, action_type, action_shape, reward_shape) + self._data = NumpyDataset(state_type, state_shape, action_type, action_shape, reward_shape, + policy_state_shape) elif mdp_info.backend == 'torch': - self._data = TorchDataset(state_type, state_shape, action_type, action_shape, reward_shape) + self._data = TorchDataset(state_type, state_shape, action_type, action_shape, reward_shape, + policy_state_shape) else: self._data = ListDataset() @@ -45,12 +50,12 @@ def __init__(self, mdp_info, n_steps=None, n_episodes=None): _episode_info='pickle', _data='mushroom', _converter='primitive', - _gamma='primitive' + _gamma='primitive', ) @classmethod - def from_array(cls, states, actions, rewards, next_states, absorbings, lasts, info=None, episode_info=None, - gamma=0.99, backend='numpy'): + def from_array(cls, states, actions, rewards, next_states, absorbings, lasts, policy_state=None, + policy_next_state=None, info=None, episode_info=None, gamma=0.99, backend='numpy'): """ Creates a dataset of transitions from the provided arrays. @@ -60,17 +65,26 @@ def from_array(cls, states, actions, rewards, next_states, absorbings, lasts, in rewards (np.ndarray): array of rewards; next_states (np.ndarray): array of next_states; absorbings (np.ndarray): array of absorbing flags; - lasts (np.ndarray): array of last flags. + lasts (np.ndarray): array of last flags; + policy_state (np.ndarray, None): array of policy internal states; + policy_next_state (np.ndarray, None): array of next policy internal states; + info (dict, None): dictiornay of step info; + episode_info (dict, None): dictiornary of episode info + gamma (float, 0.99): discount factor; + backend (str, 'numpy'): backend to be used by the dataset. Returns: The list of transitions. 
""" - assert (len(states) == len(actions) == len(rewards) - == len(next_states) == len(absorbings) == len(lasts)) + assert len(states) == len(actions) == len(rewards) == len(next_states) == len(absorbings) == len(lasts) + + if policy_state is not None: + assert len(states) == len(policy_state) == len(policy_next_state) dataset = cls.__new__(cls) dataset._gamma = gamma + if info is None: dataset._info = defaultdict(list) else: @@ -102,7 +116,7 @@ def from_array(cls, states, actions, rewards, next_states, absorbings, lasts, in return dataset def append(self, step, info): - self._data.append(*step[:6]) + self._data.append(*step) self._append_info(self._info, info) def append_episode_info(self, info): @@ -182,6 +196,14 @@ def absorbing(self): def last(self): return self._data.last + @property + def policy_state(self): + return self._data.policy_state + + @property + def policy_next_state(self): + return self._data.policy_next_state + @property def info(self): return self._info @@ -230,8 +252,8 @@ def parse(self, to='numpy'): A tuple containing the arrays that define the dataset, i.e. state, action, next state, absorbing and last """ - return self._converter.convert(self.state, self.action, self.reward, - self.next_state, self.absorbing, self.last, to=to) + return self._converter.convert(self.state, self.action, self.reward, self.next_state, + self.absorbing, self.last, to=to) def select_first_episodes(self, n_episodes): """ diff --git a/mushroom_rl/policy/deterministic_policy.py b/mushroom_rl/policy/deterministic_policy.py index 065de0ca1..d033656f2 100644 --- a/mushroom_rl/policy/deterministic_policy.py +++ b/mushroom_rl/policy/deterministic_policy.py @@ -10,7 +10,7 @@ class DeterministicPolicy(ParametricPolicy): differentiable, even if the mean value approximator is differentiable. """ - def __init__(self, mu): + def __init__(self, mu, policy_state_shape=None): """ Constructor. @@ -19,11 +19,13 @@ def __init__(self, mu): in each state. """ + super().__init__(policy_state_shape) + self._approximator = mu self._predict_params = dict() - self._add_save_attr(_approximator='mushroom') - self._add_save_attr(_predict_params='pickle') + self._add_save_attr(_approximator='mushroom', + _predict_params='pickle') def get_regressor(self): """ diff --git a/mushroom_rl/policy/gaussian_policy.py b/mushroom_rl/policy/gaussian_policy.py index c7f8623cf..0990f01eb 100644 --- a/mushroom_rl/policy/gaussian_policy.py +++ b/mushroom_rl/policy/gaussian_policy.py @@ -9,6 +9,13 @@ class AbstractGaussianPolicy(ParametricPolicy): Abstract class of Gaussian policies. """ + def __init__(self, policy_state_shape=None): + """ + Constructor. + + """ + super().__init__(policy_state_shape) + def __call__(self, state, action): mu, sigma = self._compute_multivariate_gaussian(state)[:2] @@ -29,7 +36,7 @@ class GaussianPolicy(AbstractGaussianPolicy): matrix is fixed. """ - def __init__(self, mu, sigma): + def __init__(self, mu, sigma, policy_state_shape=None): """ Constructor. @@ -41,6 +48,8 @@ def __init__(self, mu, sigma): where n is the action dimensionality. """ + super().__init__(policy_state_shape) + self._approximator = mu self._predict_params = dict() self._inv_sigma = np.linalg.inv(sigma) @@ -97,16 +106,12 @@ def _compute_multivariate_gaussian(self, state): class DiagonalGaussianPolicy(AbstractGaussianPolicy): """ - Gaussian policy with learnable standard deviation. - The Covariance matrix is - constrained to be a diagonal matrix, where the diagonal is the squared - standard deviation vector. 
- This is a differentiable policy for continuous action spaces. - This policy is similar to the gaussian policy, but the weights includes - also the standard deviation. + Gaussian policy with learnable standard deviation. The Covariance matrix is constrained to be a diagonal matrix, + where the diagonal is the squared standard deviation vector. This is a differentiable policy for continuous action + spaces. This policy is similar to the gaussian policy, but the weights includes also the standard deviation. """ - def __init__(self, mu, std): + def __init__(self, mu, std, policy_state_shape=None): """ Constructor. @@ -117,6 +122,8 @@ def __init__(self, mu, std): this vector must be equal to the action dimensionality. """ + super().__init__(policy_state_shape) + self._approximator = mu self._predict_params = dict() self._std = std @@ -189,7 +196,7 @@ class StateStdGaussianPolicy(AbstractGaussianPolicy): deviation depends on the current state. """ - def __init__(self, mu, std, eps=1e-6): + def __init__(self, mu, std, eps=1e-6, policy_state_shape=None): """ Constructor. @@ -205,6 +212,8 @@ def __init__(self, mu, std, eps=1e-6): """ assert(eps > 0) + super().__init__(policy_state_shape) + self._mu_approximator = mu self._std_approximator = std self._predict_params = dict() @@ -282,7 +291,7 @@ class StateLogStdGaussianPolicy(AbstractGaussianPolicy): regressor represents the logarithm of the standard deviation. """ - def __init__(self, mu, log_std): + def __init__(self, mu, log_std, policy_state_shape=None): """ Constructor. @@ -294,6 +303,8 @@ def __init__(self, mu, log_std): regressor must be equal to the action dimensionality. """ + super().__init__(policy_state_shape) + self._mu_approximator = mu self._log_std_approximator = log_std self._predict_params = dict() @@ -343,8 +354,7 @@ def get_weights(self): @property def weights_size(self): - return self._mu_approximator.weights_size + \ - self._log_std_approximator.weights_size + return self._mu_approximator.weights_size + self._log_std_approximator.weights_size def _compute_multivariate_gaussian(self, state): mu = np.reshape(self._mu_approximator.predict( diff --git a/mushroom_rl/policy/noise_policy.py b/mushroom_rl/policy/noise_policy.py index 317d4eabd..49cb17c1f 100644 --- a/mushroom_rl/policy/noise_policy.py +++ b/mushroom_rl/policy/noise_policy.py @@ -12,7 +12,7 @@ class OrnsteinUhlenbeckPolicy(ParametricPolicy): algorithm. """ - def __init__(self, mu, sigma, theta, dt, x0=None): + def __init__(self, mu, sigma, theta, dt, x0=None, policy_state_shape=None): """ Constructor. @@ -46,6 +46,8 @@ def __init__(self, mu, sigma, theta, dt, x0=None): _x_prev='numpy' ) + super().__init__(policy_state_shape) + def __call__(self, state, action): raise NotImplementedError @@ -89,7 +91,7 @@ class ClippedGaussianPolicy(ParametricPolicy): if the value is bigger than the boundaries. Thus, the non-differentiability. """ - def __init__(self, mu, sigma, low, high): + def __init__(self, mu, sigma, low, high, policy_state_shape=None): """ Constructor. @@ -105,6 +107,8 @@ def __init__(self, mu, sigma, low, high): component. """ + super().__init__(policy_state_shape) + self._approximator = mu self._predict_params = dict() self._sigma = sigma diff --git a/mushroom_rl/policy/policy.py b/mushroom_rl/policy/policy.py index ef5cf0a3b..0f6426f0e 100644 --- a/mushroom_rl/policy/policy.py +++ b/mushroom_rl/policy/policy.py @@ -9,6 +9,17 @@ class Policy(Serializable): A policy is used by mushroom agents to interact with the environment. 
""" + def __init__(self, policy_state_shape=None): + """ + Constructor. + + Args: + policy_state_shape (tuple, None): the shape of the internal state of the policy. + + """ + self.policy_state_shape = policy_state_shape + self._internal_state = None + def __call__(self, *args): """ Compute the probability of taking action in a certain state following @@ -47,6 +58,13 @@ def reset(self): """ pass + def get_policy_state(self): + return self._internal_state + + @property + def is_stateful(self): + return self.policy_state_shape is not None + class ParametricPolicy(Policy): """ @@ -57,6 +75,16 @@ class ParametricPolicy(Policy): specified state-action pair can be provided. """ + def __init__(self, policy_state_shape=None): + """ + Constructor. + + Args: + policy_state_shape (tuple, None): the shape of the internal state of the policy. + + """ + super().__init__(policy_state_shape) + def diff_log(self, state, action): """ Compute the gradient of the logarithm of the probability density diff --git a/mushroom_rl/policy/promps.py b/mushroom_rl/policy/promps.py index e34d2137f..36e005d60 100644 --- a/mushroom_rl/policy/promps.py +++ b/mushroom_rl/policy/promps.py @@ -30,21 +30,20 @@ def __init__(self, mu, phi, duration, sigma=None, periodic=False): """ assert sigma is None or (len(sigma.shape) == 2 and sigma.shape[0] == sigma.shape[1]) + super().__init__(policy_state_shape=(1,)) + self._approximator = mu self._phi = phi self._duration = duration self._sigma = sigma self._periodic = periodic - self._step = 0 - self._add_save_attr( _approximator='mushroom', _phi='mushroom', _duration='primitive', _sigma='numpy', - _periodic='primitive', - _step='primitive' + _periodic='primitive' ) def __call__(self, state, action): @@ -114,3 +113,11 @@ def set_duration(self, duration): def reset(self): self._step = 0 + + @property + def _step(self): + return self._internal_state + + @_step.setter + def _step(self, value): + self._internal_state = value \ No newline at end of file diff --git a/mushroom_rl/policy/td_policy.py b/mushroom_rl/policy/td_policy.py index 8f79d27c4..d3194e849 100644 --- a/mushroom_rl/policy/td_policy.py +++ b/mushroom_rl/policy/td_policy.py @@ -7,11 +7,13 @@ class TDPolicy(Policy): - def __init__(self): + def __init__(self, policy_state_shape=None): """ Constructor. """ + super().__init__(policy_state_shape) + self._approximator = None self._predict_params = dict() @@ -40,7 +42,7 @@ class EpsGreedy(TDPolicy): Epsilon greedy policy. """ - def __init__(self, epsilon): + def __init__(self, epsilon, policy_state_shape=None): """ Constructor. @@ -50,7 +52,7 @@ def __init__(self, epsilon): step. """ - super().__init__() + super().__init__(policy_state_shape) self._epsilon = to_parameter(epsilon) @@ -116,7 +118,7 @@ class Boltzmann(TDPolicy): Boltzmann softmax policy. """ - def __init__(self, beta): + def __init__(self, beta, policy_state_shape=None): """ Constructor. @@ -127,7 +129,7 @@ def __init__(self, beta): more and more greedy. """ - super().__init__() + super().__init__(policy_state_shape) self._beta = to_parameter(beta) self._add_save_attr(_beta='mushroom') @@ -213,7 +215,7 @@ def f(beta): except ValueError: return 0. - def __init__(self, omega, beta_min=-10., beta_max=10.): + def __init__(self, omega, beta_min=-10., beta_max=10., policy_state_shape=None): """ Constructor. 
@@ -228,7 +230,7 @@ def __init__(self, omega, beta_min=-10., beta_max=10.): """ beta_mellow = self.MellowmaxParameter(self, omega, beta_min, beta_max) - super().__init__(beta_mellow) + super().__init__(beta_mellow, policy_state_shape) def set_beta(self, beta): raise RuntimeError('Cannot change the beta parameter of Mellowmax policy') diff --git a/mushroom_rl/policy/torch_policy.py b/mushroom_rl/policy/torch_policy.py index 3a1a6f32b..f489518cd 100644 --- a/mushroom_rl/policy/torch_policy.py +++ b/mushroom_rl/policy/torch_policy.py @@ -20,7 +20,7 @@ class TorchPolicy(Policy): required. """ - def __init__(self, use_cuda): + def __init__(self, use_cuda, policy_state_shape=None): """ Constructor. @@ -28,6 +28,8 @@ def __init__(self, use_cuda): use_cuda (bool): whether to use cuda or not. """ + super().__init__(policy_state_shape) + self._use_cuda = use_cuda self._add_save_attr(_use_cuda='primitive') @@ -185,7 +187,7 @@ class GaussianTorchPolicy(TorchPolicy): """ def __init__(self, network, input_shape, output_shape, std_0=1., - use_cuda=False, **params): + use_cuda=False, policy_state_shape=None, **params): """ Constructor. @@ -198,7 +200,7 @@ def __init__(self, network, input_shape, output_shape, std_0=1., params (dict): parameters used by the network constructor. """ - super().__init__(use_cuda) + super().__init__(use_cuda, policy_state_shape) self._action_dim = output_shape[0] @@ -260,7 +262,7 @@ class BoltzmannTorchPolicy(TorchPolicy): Torch policy implementing a Boltzmann policy. """ - def __init__(self, network, input_shape, output_shape, beta, use_cuda=False, **params): + def __init__(self, network, input_shape, output_shape, beta, use_cuda=False, policy_state_shape=None, **params): """ Constructor. @@ -276,13 +278,13 @@ def __init__(self, network, input_shape, output_shape, beta, use_cuda=False, **p params (dict): parameters used by the network constructor. 
""" - super().__init__(use_cuda) + super().__init__(use_cuda, policy_state_shape) self._action_dim = output_shape[0] self._predict_params = dict() self._logits = Regressor(TorchApproximator, input_shape, output_shape, - network=network, use_cuda=use_cuda, **params) + network=network, use_cuda=use_cuda, **params) self._beta = to_parameter(beta) self._add_save_attr( diff --git a/tests/core/test_core.py b/tests/core/test_core.py index 20b44168d..eaaeabf04 100644 --- a/tests/core/test_core.py +++ b/tests/core/test_core.py @@ -8,6 +8,7 @@ class RandomDiscretePolicy(Policy): def __init__(self, n): + super().__init__() self._n = n def draw_action(self, state): From b011c57dd89f90b88254907ccb07c3ed151a88ec Mon Sep 17 00:00:00 2001 From: boris-il-forte Date: Wed, 18 Oct 2023 17:25:31 +0200 Subject: [PATCH 17/24] Modified draw action to support stateful policies --- .../classic_actor_critic/stochastic_ac.py | 2 +- .../actor_critic/deep_actor_critic/sac.py | 7 ++- .../black_box_optimization.py | 2 +- mushroom_rl/algorithms/value/td/q_lambda.py | 2 +- .../algorithms/value/td/rq_learning.py | 11 ++-- mushroom_rl/algorithms/value/td/sarsa.py | 2 +- .../algorithms/value/td/sarsa_lambda.py | 4 +- .../value/td/sarsa_lambda_continuous.py | 4 +- .../value/td/true_online_sarsa_lambda.py | 4 +- mushroom_rl/core/agent.py | 12 +++-- mushroom_rl/core/core.py | 18 +++---- mushroom_rl/policy/deterministic_policy.py | 6 +-- mushroom_rl/policy/gaussian_policy.py | 14 ++--- mushroom_rl/policy/noise_policy.py | 28 +++++----- mushroom_rl/policy/policy.py | 52 ++++++++++--------- mushroom_rl/policy/promps.py | 34 +++++------- mushroom_rl/policy/td_policy.py | 11 ++-- mushroom_rl/policy/torch_policy.py | 6 +-- tests/core/test_core.py | 4 +- tests/policy/test_deterministic_policy.py | 3 +- tests/policy/test_gaussian_policy.py | 10 ++-- tests/policy/test_noise_policy.py | 8 +-- tests/policy/test_policy_interface.py | 4 +- tests/policy/test_td_policy.py | 6 +-- tests/policy/test_torch_policy.py | 4 +- 25 files changed, 123 insertions(+), 135 deletions(-) diff --git a/mushroom_rl/algorithms/actor_critic/classic_actor_critic/stochastic_ac.py b/mushroom_rl/algorithms/actor_critic/classic_actor_critic/stochastic_ac.py index 42ab1ec1f..2ea4fe085 100644 --- a/mushroom_rl/algorithms/actor_critic/classic_actor_critic/stochastic_ac.py +++ b/mushroom_rl/algorithms/actor_critic/classic_actor_critic/stochastic_ac.py @@ -62,7 +62,7 @@ def episode_start(self, episode_info): self._e_v = np.zeros(self._V.weights_size) self._e_theta = np.zeros(self.policy.weights_size) - super().episode_start(episode_info) + return super().episode_start(episode_info) def fit(self, dataset, **info): for step in dataset: diff --git a/mushroom_rl/algorithms/actor_critic/deep_actor_critic/sac.py b/mushroom_rl/algorithms/actor_critic/deep_actor_critic/sac.py index b2b0447e2..ae3b9d3cb 100644 --- a/mushroom_rl/algorithms/actor_critic/deep_actor_critic/sac.py +++ b/mushroom_rl/algorithms/actor_critic/deep_actor_critic/sac.py @@ -65,12 +65,11 @@ def __init__(self, mu_approximator, sigma_approximator, min_a, max_a, log_std_mi _eps_log_prob='primitive' ) - def __call__(self, state, action): + def __call__(self, state, action, internal_state=None): raise NotImplementedError - def draw_action(self, state): - return self.compute_action_and_log_prob_t( - state, compute_log_prob=False).detach().cpu().numpy() + def draw_action(self, state, internal_state=None): + return self.compute_action_and_log_prob_t(state, compute_log_prob=False).detach().cpu().numpy(), None def 
compute_action_and_log_prob(self, state): """ diff --git a/mushroom_rl/algorithms/policy_search/black_box_optimization/black_box_optimization.py b/mushroom_rl/algorithms/policy_search/black_box_optimization/black_box_optimization.py index 707b00507..4ea7f883b 100644 --- a/mushroom_rl/algorithms/policy_search/black_box_optimization/black_box_optimization.py +++ b/mushroom_rl/algorithms/policy_search/black_box_optimization/black_box_optimization.py @@ -31,7 +31,7 @@ def episode_start(self, episode_info): self._theta_list.append(theta) self.policy.set_weights(theta) - super().episode_start(episode_info) + return super().episode_start(episode_info) def fit(self, dataset, **info): Jep = dataset.compute_J(self.mdp_info.gamma) diff --git a/mushroom_rl/algorithms/value/td/q_lambda.py b/mushroom_rl/algorithms/value/td/q_lambda.py index 723f388a4..c8e6cd32e 100644 --- a/mushroom_rl/algorithms/value/td/q_lambda.py +++ b/mushroom_rl/algorithms/value/td/q_lambda.py @@ -47,4 +47,4 @@ def _update(self, state, action, reward, next_state, absorbing): def episode_start(self, episode_info): self.e.reset() - super().episode_start(episode_info) + return super().episode_start(episode_info) diff --git a/mushroom_rl/algorithms/value/td/rq_learning.py b/mushroom_rl/algorithms/value/td/rq_learning.py index 4a7da860a..0a986e7f2 100644 --- a/mushroom_rl/algorithms/value/td/rq_learning.py +++ b/mushroom_rl/algorithms/value/td/rq_learning.py @@ -57,16 +57,13 @@ def _update(self, state, action, reward, next_state, absorbing): q_next = self._next_q(next_state) if self.delta is not None: - beta = alpha * self.delta(state, action, target=q_next, - factor=alpha) + beta = alpha * self.delta(state, action, target=q_next, factor=alpha) else: beta = self.beta(state, action, target=q_next) - self.Q_tilde[state, action] += beta * (q_next - self.Q_tilde[ - state, action]) + self.Q_tilde[state, action] += beta * (q_next - self.Q_tilde[state, action]) - self.Q[state, action] = self.R_tilde[ - state, action] + self.mdp_info.gamma * self.Q_tilde[state, action] + self.Q[state, action] = self.R_tilde[state, action] + self.mdp_info.gamma * self.Q_tilde[state, action] def _next_q(self, next_state): """ @@ -81,6 +78,6 @@ def _next_q(self, next_state): if self.off_policy: return np.max(self.Q[next_state, :]) else: - self.next_action = self.draw_action(next_state) + self.next_action, _ = self.draw_action(next_state) return self.Q[next_state, self.next_action] diff --git a/mushroom_rl/algorithms/value/td/sarsa.py b/mushroom_rl/algorithms/value/td/sarsa.py index 253dac401..e376e0e63 100644 --- a/mushroom_rl/algorithms/value/td/sarsa.py +++ b/mushroom_rl/algorithms/value/td/sarsa.py @@ -15,7 +15,7 @@ def __init__(self, mdp_info, policy, learning_rate): def _update(self, state, action, reward, next_state, absorbing): q_current = self.Q[state, action] - self.next_action = self.draw_action(next_state) + self.next_action, _ = self.draw_action(next_state) q_next = self.Q[next_state, self.next_action] if not absorbing else 0. 
self.Q[state, action] = q_current + self._alpha(state, action) * ( diff --git a/mushroom_rl/algorithms/value/td/sarsa_lambda.py b/mushroom_rl/algorithms/value/td/sarsa_lambda.py index 0e04027d9..7c3a3032e 100644 --- a/mushroom_rl/algorithms/value/td/sarsa_lambda.py +++ b/mushroom_rl/algorithms/value/td/sarsa_lambda.py @@ -33,7 +33,7 @@ def __init__(self, mdp_info, policy, learning_rate, lambda_coeff, def _update(self, state, action, reward, next_state, absorbing): q_current = self.Q[state, action] - self.next_action = self.draw_action(next_state) + self.next_action, _ = self.draw_action(next_state) q_next = self.Q[next_state, self.next_action] if not absorbing else 0. delta = reward + self.mdp_info.gamma * q_next - q_current @@ -45,4 +45,4 @@ def _update(self, state, action, reward, next_state, absorbing): def episode_start(self, episode_info): self.e.reset() - super().episode_start(episode_info) + return super().episode_start(episode_info) diff --git a/mushroom_rl/algorithms/value/td/sarsa_lambda_continuous.py b/mushroom_rl/algorithms/value/td/sarsa_lambda_continuous.py index c5795d0e7..f3fc11dc5 100644 --- a/mushroom_rl/algorithms/value/td/sarsa_lambda_continuous.py +++ b/mushroom_rl/algorithms/value/td/sarsa_lambda_continuous.py @@ -39,7 +39,7 @@ def _update(self, state, action, reward, next_state, absorbing): self.e = self.mdp_info.gamma * self._lambda() * self.e + self.Q.diff(phi_state, action) - self.next_action = self.draw_action(next_state) + self.next_action, _ = self.draw_action(next_state) phi_next_state = self.phi(next_state) q_next = self.Q.predict(phi_next_state, self.next_action) if not absorbing else 0. @@ -52,4 +52,4 @@ def _update(self, state, action, reward, next_state, absorbing): def episode_start(self, episode_info): self.e = np.zeros(self.Q.weights_size) - super().episode_start(episode_info) + return super().episode_start(episode_info) diff --git a/mushroom_rl/algorithms/value/td/true_online_sarsa_lambda.py b/mushroom_rl/algorithms/value/td/true_online_sarsa_lambda.py index cf354e0f1..5ca541661 100644 --- a/mushroom_rl/algorithms/value/td/true_online_sarsa_lambda.py +++ b/mushroom_rl/algorithms/value/td/true_online_sarsa_lambda.py @@ -51,7 +51,7 @@ def _update(self, state, action, reward, next_state, absorbing): self.e = self.mdp_info.gamma * self._lambda() * self.e + alpha * ( 1. - self.mdp_info.gamma * self._lambda.get_value() * e_phi) * phi_state_action - self.next_action = self.draw_action(next_state) + self.next_action, _ = self.draw_action(next_state) phi_next_state = self.phi(next_state) q_next = self.Q.predict(phi_next_state, self.next_action) if not absorbing else 0. @@ -67,4 +67,4 @@ def episode_start(self, episode_info): self._q_old = None self.e = np.zeros(self.Q.weights_size) - super().episode_start(episode_info) + return super().episode_start(episode_info) diff --git a/mushroom_rl/core/agent.py b/mushroom_rl/core/agent.py index f323b5845..4ac130eeb 100644 --- a/mushroom_rl/core/agent.py +++ b/mushroom_rl/core/agent.py @@ -57,14 +57,15 @@ def fit(self, dataset, **info): """ raise NotImplementedError('Agent is an abstract class') - def draw_action(self, state): + def draw_action(self, state, policy_state=None): """ Return the action to execute in the given state. It is the action returned by the policy or the action set by the algorithm (e.g. in the case of SARSA). Args: - state (np.ndarray): the state where the agent is. + state: the state where the agent is; + policy_state: the policy internal state. Returns: The action to be executed. 
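# Illustrative sketch (not part of the patch): with the signature documented
# above, draw_action returns an (action, next_policy_state) pair and
# episode_start returns the initial policy state, so the sampling loop threads
# the policy state explicitly instead of reading it back from the policy. The
# toy classes below are hypothetical stand-ins; the loop mirrors what
# Core._step does further down in this patch.

import numpy as np


class ToyStatefulAgent:
    def episode_start(self, episode_info):
        return 0  # initial policy state

    def draw_action(self, state, policy_state):
        action = np.array([policy_state % 2])  # trivial counter-based rule
        return action, policy_state + 1        # action and *next* policy state


class ToyEnv:
    def reset(self):
        return np.zeros(3), {}

    def step(self, action):
        return np.random.randn(3), 1.0, False, {}


agent, mdp = ToyStatefulAgent(), ToyEnv()
state, episode_info = mdp.reset()
policy_state = agent.episode_start(episode_info)

for _ in range(5):
    action, policy_next_state = agent.draw_action(state, policy_state)
    next_state, reward, absorbing, _ = mdp.step(action)
    # environment state and policy state advance together
    state, policy_state = next_state, policy_next_state
    if absorbing:
        break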
@@ -74,12 +75,13 @@ def draw_action(self, state): state = self.phi(state) if self.next_action is None: - action = self.policy.draw_action(state) + action, next_policy_state = self.policy.draw_action(state, policy_state) else: action = self.next_action + next_policy_state = None # FIXME self.next_action = None - return self._convert_to_env_backend(action) + return self._convert_to_env_backend(action), self._convert_to_env_backend(next_policy_state) def episode_start(self, episode_info): """ @@ -89,7 +91,7 @@ def episode_start(self, episode_info): episode_info (dict): a dictionary containing the information at reset, such as context. """ - self.policy.reset() + return self.policy.reset() def stop(self): """ diff --git a/mushroom_rl/core/core.py b/mushroom_rl/core/core.py index c8e9c150c..fbf854395 100644 --- a/mushroom_rl/core/core.py +++ b/mushroom_rl/core/core.py @@ -26,6 +26,7 @@ def __init__(self, agent, mdp, callbacks_fit=None, callback_step=None, record_di self.callback_step = callback_step if callback_step is not None else lambda x: None self._state = None + self._policy_state = None self._total_episodes_counter = 0 self._total_steps_counter = 0 @@ -142,15 +143,9 @@ def _run_impl(self, dataset, move_condition, fit_condition, steps_progress_bar, while move_condition(): if last: self.reset(initial_states) - if self.agent.policy.is_stateful: - policy_state = self.agent.policy.get_policy_state() sample, step_info = self._step(render, record) - if self.agent.policy.is_stateful: - policy_next_state = self.agent.policy.get_policy_state() - sample += (policy_state, policy_next_state) - self.callback_step(sample) self._total_steps_counter += 1 @@ -197,7 +192,7 @@ def _step(self, render, record): state, the absorbing flag of the reached state and the last step flag. """ - action = self.agent.draw_action(self._state) + action, policy_next_state = self.agent.draw_action(self._state, self._policy_state) next_state, reward, absorbing, step_info = self.mdp.step(action) self._episode_steps += 1 @@ -208,14 +203,15 @@ def _step(self, render, record): if record: self._record(frame) - last = not( - self._episode_steps < self.mdp.info.horizon and not absorbing) + last = not(self._episode_steps < self.mdp.info.horizon and not absorbing) state = self._state + policy_state = self._policy_state next_state = self._preprocess(next_state) self._state = next_state + self._policy_state = policy_next_state - return (state, action, reward, next_state, absorbing, last), step_info + return (state, action, reward, next_state, absorbing, last, policy_state, policy_next_state), step_info def reset(self, initial_states=None): """ @@ -228,7 +224,7 @@ def reset(self, initial_states=None): initial_state = initial_states[self._total_episodes_counter] state, episode_info = self.mdp.reset(initial_state) - self.agent.episode_start(episode_info) + self._policy_state = self.agent.episode_start(episode_info) self._state = self._preprocess(state) self.agent.next_action = None diff --git a/mushroom_rl/policy/deterministic_policy.py b/mushroom_rl/policy/deterministic_policy.py index d033656f2..62097de78 100644 --- a/mushroom_rl/policy/deterministic_policy.py +++ b/mushroom_rl/policy/deterministic_policy.py @@ -37,13 +37,13 @@ def get_regressor(self): """ return self._approximator - def __call__(self, state, action): + def __call__(self, state, action, policy_state=None): policy_action = self._approximator.predict(state, **self._predict_params) return 1. if np.array_equal(action, policy_action) else 0. 
- def draw_action(self, state): - return self._approximator.predict(state, **self._predict_params) + def draw_action(self, state, policy_state=None): + return self._approximator.predict(state, **self._predict_params), None def set_weights(self, weights): self._approximator.set_weights(weights) diff --git a/mushroom_rl/policy/gaussian_policy.py b/mushroom_rl/policy/gaussian_policy.py index 0990f01eb..f92f4d90e 100644 --- a/mushroom_rl/policy/gaussian_policy.py +++ b/mushroom_rl/policy/gaussian_policy.py @@ -16,15 +16,15 @@ def __init__(self, policy_state_shape=None): """ super().__init__(policy_state_shape) - def __call__(self, state, action): + def __call__(self, state, action, policy_state=None): mu, sigma = self._compute_multivariate_gaussian(state)[:2] return multivariate_normal.pdf(action, mu, sigma) - def draw_action(self, state): + def draw_action(self, state, policy_state=None): mu, sigma = self._compute_multivariate_gaussian(state)[:2] - return np.random.multivariate_normal(mu, sigma) + return np.random.multivariate_normal(mu, sigma), None class GaussianPolicy(AbstractGaussianPolicy): @@ -74,7 +74,7 @@ def set_sigma(self, sigma): self._sigma = sigma self._inv_sigma = np.linalg.inv(sigma) - def diff_log(self, state, action): + def diff_log(self, state, action, policy_state=None): mu, _, inv_sigma = self._compute_multivariate_gaussian(state) delta = action - mu @@ -145,7 +145,7 @@ def set_std(self, std): """ self._std = std - def diff_log(self, state, action): + def diff_log(self, state, action, policy_state=None): mu, _, inv_sigma = self._compute_multivariate_gaussian(state) delta = action - mu @@ -226,7 +226,7 @@ def __init__(self, mu, std, eps=1e-6, policy_state_shape=None): _eps='primitive' ) - def diff_log(self, state, action): + def diff_log(self, state, action, policy_state=None): mu, sigma, std = self._compute_multivariate_gaussian(state) diag_sigma = np.diag(sigma) @@ -315,7 +315,7 @@ def __init__(self, mu, log_std, policy_state_shape=None): _predict_params='pickle' ) - def diff_log(self, state, action): + def diff_log(self, state, action, policy_state=None): mu, sigma = self._compute_multivariate_gaussian(state) diag_sigma = np.diag(sigma) diff --git a/mushroom_rl/policy/noise_policy.py b/mushroom_rl/policy/noise_policy.py index 49cb17c1f..6f2143c93 100644 --- a/mushroom_rl/policy/noise_policy.py +++ b/mushroom_rl/policy/noise_policy.py @@ -12,7 +12,7 @@ class OrnsteinUhlenbeckPolicy(ParametricPolicy): algorithm. """ - def __init__(self, mu, sigma, theta, dt, x0=None, policy_state_shape=None): + def __init__(self, mu, sigma, theta, dt, x0=None): """ Constructor. 
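# Illustrative sketch (not part of the patch): the hunks that follow turn the
# Ornstein-Uhlenbeck noise into an explicit policy state, returned by reset()
# and threaded through draw_action() instead of being kept in _x_prev. The
# underlying recursion is
#     x_{k+1} = x_k - theta * x_k * dt + sigma * sqrt(dt) * N(0, 1),
# reproduced standalone below; theta, sigma, dt and the action dimension are
# arbitrary example values.

import numpy as np

theta, sigma, dt, action_dim = 0.15, 0.2, 1e-2, 2


def ou_step(noise_state):
    return noise_state - theta * noise_state * dt \
        + sigma * np.sqrt(dt) * np.random.normal(size=action_dim)


noise_state = np.zeros(action_dim)       # what reset() now returns (x0 defaults to zeros)
for _ in range(3):
    noise_state = ou_step(noise_state)   # returned by draw_action() as the next policy state
    action = 0.0 + noise_state           # mu(state) + noise; mu comes from the approximator
print(noise_state)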
@@ -42,25 +42,21 @@ def __init__(self, mu, sigma, theta, dt, x0=None, policy_state_shape=None): _sigma='numpy', _theta='primitive', _dt='primitive', - _x0='numpy', - _x_prev='numpy' + _x0='numpy' ) - super().__init__(policy_state_shape) + super().__init__(self._approximator.output_shape) - def __call__(self, state, action): + def __call__(self, state, action=None, policy_state=None): raise NotImplementedError - def draw_action(self, state): + def draw_action(self, state, policy_state): mu = self._approximator.predict(state, **self._predict_params) - x = self._x_prev - self._theta * self._x_prev * self._dt +\ - self._sigma * np.sqrt(self._dt) * np.random.normal( - size=self._approximator.output_shape - ) - self._x_prev = x + x = policy_state - self._theta * policy_state * self._dt +\ + self._sigma * np.sqrt(self._dt) * np.random.normal(size=self._approximator.output_shape) - return mu + x + return mu + x, x def set_weights(self, weights): self._approximator.set_weights(weights) @@ -73,7 +69,7 @@ def weights_size(self): return self._approximator.weights_size def reset(self): - self._x_prev = self._x0 if self._x0 is not None else np.zeros(self._approximator.output_shape) + return self._x0 if self._x0 is not None else np.zeros(self._approximator.output_shape) class ClippedGaussianPolicy(ParametricPolicy): @@ -123,15 +119,15 @@ def __init__(self, mu, sigma, low, high, policy_state_shape=None): _high='numpy' ) - def __call__(self, state, action): + def __call__(self, state, action=None, policy_state=None): raise NotImplementedError - def draw_action(self, state): + def draw_action(self, state, policy_state=None): mu = np.reshape(self._approximator.predict(np.expand_dims(state, axis=0), **self._predict_params), -1) action_raw = np.random.multivariate_normal(mu, self._sigma) - return np.clip(action_raw, self._low, self._high) + return np.clip(action_raw, self._low, self._high), None def set_weights(self, weights): self._approximator.set_weights(weights) diff --git a/mushroom_rl/policy/policy.py b/mushroom_rl/policy/policy.py index 0f6426f0e..fa1964404 100644 --- a/mushroom_rl/policy/policy.py +++ b/mushroom_rl/policy/policy.py @@ -18,34 +18,35 @@ def __init__(self, policy_state_shape=None): """ self.policy_state_shape = policy_state_shape - self._internal_state = None - def __call__(self, *args): + def __call__(self, state, action=None, policy_state=None): """ Compute the probability of taking action in a certain state following the policy. Args: - *args (list): list containing a state or a state and an action. + state: state where you want to evaluate the policy density; + action: action where you want to evaluate the policy density; + policy_state: internal_state where you want to evaluate the policy density. Returns: - The probability of all actions following the policy in the given - state if the list contains only the state, else the probability - of the given action in the given state following the policy. If - the action space is continuous, state and action must be provided + The probability of all actions following the policy in the given state if the list contains only the state, + else the probability of the given action in the given state following the policy. If the action space is + continuous, state and action must be provided """ raise NotImplementedError - def draw_action(self, state): + def draw_action(self, state, policy_state=None): """ Sample an action in ``state`` using the policy. Args: - state (np.ndarray): the state where the agent is. 
+ state: the state where the agent is; + policy_state: the internal state of the policy. Returns: - The action sampled from the policy. + The action sampled from the policy and optionally the next policy state. """ raise NotImplementedError @@ -55,11 +56,11 @@ def reset(self): Useful when the policy needs a special initialization at the beginning of an episode. - """ - pass + Returns: + The initial policy state (by default None). - def get_policy_state(self): - return self._internal_state + """ + return None @property def is_stateful(self): @@ -69,10 +70,9 @@ def is_stateful(self): class ParametricPolicy(Policy): """ Interface for a generic parametric policy. - A parametric policy is a policy that depends on set of parameters, - called the policy weights. - If the policy is differentiable, the derivative of the probability for a - specified state-action pair can be provided. + A parametric policy is a policy that depends on set of parameters, called the policy weights. + For differentiable policies, the derivative of the probability for a specified state-action pair can be provided. + """ def __init__(self, policy_state_shape=None): @@ -85,7 +85,7 @@ def __init__(self, policy_state_shape=None): """ super().__init__(policy_state_shape) - def diff_log(self, state, action): + def diff_log(self, state, action, policy_state=None): """ Compute the gradient of the logarithm of the probability density function, in the specified state and action pair, i.e.: @@ -95,15 +95,16 @@ def diff_log(self, state, action): Args: - state (np.ndarray): the state where the gradient is computed - action (np.ndarray): the action where the gradient is computed + state: the state where the gradient is computed; + action: the action where the gradient is computed; + policy_state: the internal state of the policy. Returns: The gradient of the logarithm of the pdf w.r.t. the policy weights """ raise RuntimeError('The policy is not differentiable') - def diff(self, state, action): + def diff(self, state, action, policy_state=None): """ Compute the derivative of the probability density function, in the specified state and action pair. Normally it is computed w.r.t. the @@ -115,13 +116,14 @@ def diff(self, state, action): Args: - state (np.ndarray): the state where the derivative is computed - action (np.ndarray): the action where the derivative is computed + state: the state where the derivative is computed; + action: the action where the derivative is computed; + policy_state: the internal state of the policy. Returns: The derivative w.r.t. 
the policy weights """ - return self(state, action) * self.diff_log(state, action) + return self(state, action, policy_state) * self.diff_log(state, action, policy_state) def set_weights(self, weights): """ diff --git a/mushroom_rl/policy/promps.py b/mushroom_rl/policy/promps.py index 36e005d60..d074a3e83 100644 --- a/mushroom_rl/policy/promps.py +++ b/mushroom_rl/policy/promps.py @@ -55,19 +55,19 @@ def __call__(self, state, action): else: return multivariate_normal.pdf(action, mu, self._sigma) - def draw_action(self, state): + def draw_action(self, state, policy_state): z = self._compute_phase(state) - self.update_time(state) - mu = self._approximator(self._phi(z)) + next_policy_state = self.update_time(state, policy_state) + if self._sigma is None: - return mu + return mu, next_policy_state else: - return np.random.multivariate_normal(mu, self._sigma) + return np.random.multivariate_normal(mu, self._sigma), next_policy_state - def update_time(self, state): + def update_time(self, state, policy_state): """ Method that updates the time counter. Can be overridden to introduce complex state-dependant behaviors. @@ -75,12 +75,14 @@ def update_time(self, state): state (np.ndarray): The current state of the system. """ - self._step += 1 + policy_state += 1 + + if not self._periodic and policy_state >= self._duration: + policy_state = self._duration - if not self._periodic and self._step >= self._duration: - self._step = self._duration + return policy_state - def _compute_phase(self, state): + def _compute_phase(self, state, policy_state): """ Method that updates the state variable. It can be overridden to implement state dependent phase. @@ -91,7 +93,7 @@ def _compute_phase(self, state): The current value of the phase variable """ - return self._step / self._duration + return policy_state / self._duration def set_weights(self, weights): self._approximator.set_weights(weights) @@ -112,12 +114,4 @@ def set_duration(self, duration): self._duration = duration - 1 def reset(self): - self._step = 0 - - @property - def _step(self): - return self._internal_state - - @_step.setter - def _step(self, value): - self._internal_state = value \ No newline at end of file + return 0 diff --git a/mushroom_rl/policy/td_policy.py b/mushroom_rl/policy/td_policy.py index d3194e849..fbadca2a0 100644 --- a/mushroom_rl/policy/td_policy.py +++ b/mushroom_rl/policy/td_policy.py @@ -77,7 +77,7 @@ def __call__(self, *args): return probs - def draw_action(self, state): + def draw_action(self, state, policy_state=None): if not np.random.uniform() < self._epsilon(state): q = self._approximator.predict(state, **self._predict_params) max_a = np.argwhere(q == np.max(q)).ravel() @@ -85,9 +85,9 @@ def draw_action(self, state): if len(max_a) > 1: max_a = np.array([np.random.choice(max_a)]) - return max_a + return max_a, None - return np.array([np.random.choice(self._approximator.n_actions)]) + return np.array([np.random.choice(self._approximator.n_actions)]), None def set_epsilon(self, epsilon): """ @@ -147,9 +147,8 @@ def __call__(self, *args): else: return qs / np.sum(qs) - def draw_action(self, state): - return np.array([np.random.choice(self._approximator.n_actions, - p=self(state))]) + def draw_action(self, state, policy_state=None): + return np.array([np.random.choice(self._approximator.n_actions, p=self(state))]), None def set_beta(self, beta): """ diff --git a/mushroom_rl/policy/torch_policy.py b/mushroom_rl/policy/torch_policy.py index f489518cd..b626ba508 100644 --- a/mushroom_rl/policy/torch_policy.py +++ 
b/mushroom_rl/policy/torch_policy.py @@ -34,18 +34,18 @@ def __init__(self, use_cuda, policy_state_shape=None): self._add_save_attr(_use_cuda='primitive') - def __call__(self, state, action): + def __call__(self, state, action, policy_state=None): s = to_float_tensor(np.atleast_2d(state), self._use_cuda) a = to_float_tensor(np.atleast_2d(action), self._use_cuda) return np.exp(self.log_prob_t(s, a).item()) - def draw_action(self, state): + def draw_action(self, state, policy_state=None): with torch.no_grad(): s = to_float_tensor(np.atleast_2d(state), self._use_cuda) a = self.draw_action_t(s) - return torch.squeeze(a, dim=0).detach().cpu().numpy() + return torch.squeeze(a, dim=0).detach().cpu().numpy(), None def distribution(self, state): """ diff --git a/tests/core/test_core.py b/tests/core/test_core.py index eaaeabf04..470a9bab0 100644 --- a/tests/core/test_core.py +++ b/tests/core/test_core.py @@ -11,8 +11,8 @@ def __init__(self, n): super().__init__() self._n = n - def draw_action(self, state): - return [np.random.randint(self._n)] + def draw_action(self, state, policy_state=None): + return [np.random.randint(self._n)], None class DummyAgent(Agent): diff --git a/tests/policy/test_deterministic_policy.py b/tests/policy/test_deterministic_policy.py index c881e2c92..25753183a 100644 --- a/tests/policy/test_deterministic_policy.py +++ b/tests/policy/test_deterministic_policy.py @@ -36,5 +36,6 @@ def test_deterministic_policy(): assert pi(s_test_2, a_test) == 0 a_stored = np.array([-1.86941072, -0.1789696]) - assert np.allclose(pi.draw_action(s_test_1), a_stored) + action, _ = pi.draw_action(s_test_1) + assert np.allclose(action, a_stored) diff --git a/tests/policy/test_gaussian_policy.py b/tests/policy/test_gaussian_policy.py index 54e0b539f..66756703b 100644 --- a/tests/policy/test_gaussian_policy.py +++ b/tests/policy/test_gaussian_policy.py @@ -22,7 +22,7 @@ def test_univariate_gaussian(): for x_i in x: state = np.atleast_1d(x_i) - action = pi.draw_action(state) + action, _ = pi.draw_action(state) exact_diff = pi.diff(state, action) numerical_diff = numerical_diff_policy(pi, state, action) @@ -50,7 +50,7 @@ def test_multivariate_gaussian(): for x_i in x: state = np.atleast_1d(x_i) - action = pi.draw_action(state) + action, _ = pi.draw_action(state) exact_diff = pi.diff(state, action) numerical_diff = numerical_diff_policy(pi, state, action) @@ -76,7 +76,7 @@ def test_multivariate_diagonal_gaussian(): for x_i in x: state = np.atleast_1d(x_i) - action = pi.draw_action(state) + action, _ = pi.draw_action(state) exact_diff = pi.diff(state, action) numerical_diff = numerical_diff_policy(pi, state, action) @@ -104,7 +104,7 @@ def test_multivariate_state_std_gaussian(): for x_i in x: state = np.atleast_1d(x_i) - action = pi.draw_action(state) + action, _ = pi.draw_action(state) exact_diff = pi.diff(state, action) numerical_diff = numerical_diff_policy(pi, state, action) @@ -132,7 +132,7 @@ def test_multivariate_state_log_std_gaussian(): for x_i in x: state = np.atleast_1d(x_i) - action = pi.draw_action(state) + action, _ = pi.draw_action(state) exact_diff = pi.diff(state, action) numerical_diff = numerical_diff_policy(pi, state, action) diff --git a/tests/policy/test_noise_policy.py b/tests/policy/test_noise_policy.py index aff1f3254..85213c2c4 100644 --- a/tests/policy/test_noise_policy.py +++ b/tests/policy/test_noise_policy.py @@ -17,12 +17,14 @@ def test_ornstein_uhlenbeck_policy(): state = np.random.randn(5) - action = pi.draw_action(state) + policy_state = pi.reset() + + action, 
policy_state = pi.draw_action(state, policy_state) action_test = np.array([-1.95896171, 1.91292747]) assert np.allclose(action, action_test) - pi.reset() - action = pi.draw_action(state) + policy_state = pi.reset() + action, policy_state = pi.draw_action(state, policy_state) action_test = np.array([-1.94161061, 1.92233358]) assert np.allclose(action, action_test) diff --git a/tests/policy/test_policy_interface.py b/tests/policy/test_policy_interface.py index a61607b7a..680c2ce7b 100644 --- a/tests/policy/test_policy_interface.py +++ b/tests/policy/test_policy_interface.py @@ -12,8 +12,8 @@ def abstract_method_tester(f, ex, *args): def test_policy_interface(): tmp = Policy() - abstract_method_tester(tmp.__call__, NotImplementedError) - abstract_method_tester(tmp.draw_action, NotImplementedError, None) + abstract_method_tester(tmp.__call__, NotImplementedError, None) + abstract_method_tester(tmp.draw_action, NotImplementedError, None, None) tmp.reset() diff --git a/tests/policy/test_td_policy.py b/tests/policy/test_td_policy.py index e1890f278..0563f7dc8 100644 --- a/tests/policy/test_td_policy.py +++ b/tests/policy/test_td_policy.py @@ -33,7 +33,7 @@ def test_eps_greedy(): p_sa_test = np.array([0.93333333]) assert np.allclose(p_sa, p_sa_test) - a = pi.draw_action(s) + a, _ = pi.draw_action(s) a_test = 1 assert a.item() == a_test @@ -70,7 +70,7 @@ def test_boltzmann(): p_sa_test = np.array([0.36223227]) assert np.allclose(p_sa, p_sa_test) - a = pi.draw_action(s) + a, _ = pi.draw_action(s) a_test = 2 assert a.item() == a_test @@ -106,7 +106,7 @@ def test_mellowmax(): p_sa_test = np.array([0.69215916]) assert np.allclose(p_sa, p_sa_test) - a = pi.draw_action(s) + a, _ = pi.draw_action(s) a_test = 2 assert a.item() == a_test diff --git a/tests/policy/test_torch_policy.py b/tests/policy/test_torch_policy.py index 17614ad08..ce5dac90b 100644 --- a/tests/policy/test_torch_policy.py +++ b/tests/policy/test_torch_policy.py @@ -61,7 +61,7 @@ def test_gaussian_torch_policy(): pi = GaussianTorchPolicy(Network, (3,), (2,), n_features=50) state = np.random.rand(3) - action = pi.draw_action(state) + action, _ = pi.draw_action(state) action_test = np.array([-0.21276927, 0.27437747]) assert np.allclose(action, action_test) @@ -81,7 +81,7 @@ def test_boltzmann_torch_policy(): pi = BoltzmannTorchPolicy(Network, (3,), (2,), beta, n_features=50) state = np.random.rand(3, 3) - action = pi.draw_action(state) + action, _ = pi.draw_action(state) action_test = np.array([1, 0, 0]) assert np.allclose(action, action_test) From be9d13b7e0f136a6d597bb77d448a5d22d990312 Mon Sep 17 00:00:00 2001 From: boris-il-forte Date: Thu, 19 Oct 2023 18:40:52 +0200 Subject: [PATCH 18/24] Major updates - Implemented PPO_BPTT - Implemented recurrent policies - Implemented parse_policy_state in dataset - Refactoring of core in preparation to parallel and vectorized core --- examples/gym_recurrent_ppo.py | 282 ++++++++++++++++++ .../algorithms/actor_critic/__init__.py | 4 +- .../deep_actor_critic/__init__.py | 3 +- .../deep_actor_critic/ppo_bptt.py | 213 +++++++++++++ mushroom_rl/core/_impl/__init__.py | 3 +- mushroom_rl/core/_impl/core_logic.py | 81 +++++ mushroom_rl/core/core.py | 106 ++----- mushroom_rl/core/dataset.py | 12 + mushroom_rl/policy/__init__.py | 1 + mushroom_rl/policy/recurrent_torch_policy.py | 63 ++++ 10 files changed, 683 insertions(+), 85 deletions(-) create mode 100644 examples/gym_recurrent_ppo.py create mode 100644 mushroom_rl/algorithms/actor_critic/deep_actor_critic/ppo_bptt.py create mode 100644 
mushroom_rl/core/_impl/core_logic.py create mode 100644 mushroom_rl/policy/recurrent_torch_policy.py diff --git a/examples/gym_recurrent_ppo.py b/examples/gym_recurrent_ppo.py new file mode 100644 index 000000000..fcde5a209 --- /dev/null +++ b/examples/gym_recurrent_ppo.py @@ -0,0 +1,282 @@ +import os +import numpy as np +import torch +from experiment_launcher.decorators import single_experiment +from experiment_launcher import run_experiment +import torch.optim as optim + +from mushroom_rl.core import Logger, Core +from mushroom_rl.environments import Gym + +from mushroom_rl.algorithms.actor_critic import PPO_BPTT +from mushroom_rl.policy import RecurrentGaussianTorchPolicy + +from tqdm import trange + + +def get_recurrent_network(rnn_type): + if rnn_type == "vanilla": + return torch.nn.RNN + elif rnn_type == "gru": + return torch.nn.GRU + else: + raise ValueError("Unknown RNN type %s." % rnn_type) + + +class PPOCriticBPTTNetwork(torch.nn.Module): + + def __init__(self, input_shape, output_shape, dim_env_state, dim_action, rnn_type, + n_hidden_features=128, n_features=128, num_hidden_layers=1, + hidden_state_treatment="zero_initial", **kwargs): + super().__init__() + + assert hidden_state_treatment in ["zero_initial", "use_policy_hidden_state"] + + self._input_shape = input_shape + self._output_shape = output_shape + self._dim_env_state = dim_env_state + self._dim_action = dim_action + self._use_policy_hidden_states = True if hidden_state_treatment == "use_policy_hidden_state" else False + + rnn = get_recurrent_network(rnn_type) + + # embedder + self._h1_o = torch.nn.Linear(dim_env_state, n_features) + self._h1_o_post_rnn = torch.nn.Linear(dim_env_state, n_features) + + # rnn + self._rnn = rnn(input_size=n_features, + hidden_size=n_hidden_features, + num_layers=num_hidden_layers, + # nonlinearity=hidden_activation, # todo: this is turned off for now to allow for rnn and gru + batch_first=True) + + # post-rnn layer + self._hq_1 = torch.nn.Linear(n_hidden_features+n_features, n_features) + self._hq_2 = torch.nn.Linear(n_features, 1) + self._act_func = torch.nn.ReLU() + + torch.nn.init.xavier_uniform_(self._h1_o.weight, gain=torch.nn.init.calculate_gain("relu")) + torch.nn.init.xavier_uniform_(self._h1_o_post_rnn.weight, gain=torch.nn.init.calculate_gain("relu")) + torch.nn.init.xavier_uniform_(self._hq_1.weight, gain=torch.nn.init.calculate_gain("relu")) + torch.nn.init.xavier_uniform_(self._hq_2.weight, gain=torch.nn.init.calculate_gain("relu")) + + def forward(self, state, policy_state, lengths): + # pre-rnn embedder + input_rnn = self._act_func(self._h1_o(state)) + + # --- forward rnn --- + # the inputs are padded. Based on that and the length, we created a packed sequence + packed_seq = torch.nn.utils.rnn.pack_padded_sequence(input_rnn, lengths, enforce_sorted=False, + batch_first=True) + if self._use_policy_hidden_states: + # hidden state has to have shape (N_layers, N_batch, DIM_hidden), + # so we need to reshape and swap the first two axes. 
+ policy_state_reshaped = policy_state.view(-1, self._num_hidden_layers, self._n_hidden_features) + policy_state_reshaped = torch.swapaxes(policy_state_reshaped, 0, 1) + out_rnn, _ = self._rnn(packed_seq, policy_state_reshaped) + else: + out_rnn, _ = self._rnn(packed_seq) # use zero initial states + + # we only need the last entry in each sequence + features_rnn, _ = torch.nn.utils.rnn.pad_packed_sequence(out_rnn, batch_first=True) + rel_indices = lengths.view(-1, 1, 1) - 1 + features_rnn = torch.squeeze(torch.take_along_dim(features_rnn, rel_indices, dim=1), dim=1) + + # post-rnn embedder. Here we again only need the last state + last_state = torch.squeeze(torch.take_along_dim(state, rel_indices, dim=1), dim=1) + feature_s = self._act_func(self._h1_o_post_rnn(last_state)) + + # last layer + input_last_layer = torch.concat([feature_s, features_rnn], dim=1) + q = self._hq_2(self._act_func(self._hq_1(input_last_layer))) + + return torch.squeeze(q) + + +class PPOActorBPTTNetwork(torch.nn.Module): + + def __init__(self, input_shape, output_shape, n_features, dim_env_state, rnn_type, + n_hidden_features, num_hidden_layers=1, **kwargs): + super().__init__() + + dim_state = input_shape[0] + dim_action = output_shape[0] + self._dim_env_state = dim_env_state + self._num_hidden_layers = num_hidden_layers + self._n_hidden_features = n_hidden_features + + rnn = get_recurrent_network(rnn_type) + + # embedder + self._h1_o = torch.nn.Linear(dim_env_state, n_features) + self._h1_o_post_rnn = torch.nn.Linear(dim_env_state, n_features) + + # rnn + self._rnn = rnn(input_size=n_features, + hidden_size=n_hidden_features, + num_layers=num_hidden_layers, + # nonlinearity=hidden_activation, # todo: this is turned off for now to allow for rnn and gru + batch_first=True) + + # post-rnn layer + self._h3 = torch.nn.Linear(n_hidden_features+n_features, dim_action) + self._act_func = torch.nn.ReLU() + self._tanh = torch.nn.Tanh() + + torch.nn.init.xavier_uniform_(self._h1_o.weight, gain=torch.nn.init.calculate_gain("relu")*0.05) + torch.nn.init.xavier_uniform_(self._h1_o_post_rnn.weight, gain=torch.nn.init.calculate_gain("relu")*0.05) + torch.nn.init.xavier_uniform_(self._h3.weight, gain=torch.nn.init.calculate_gain("relu")*0.05) + + def forward(self, state, policy_state, lengths): + # pre-rnn embedder + input_rnn = self._act_func(self._h1_o(state)) + + # forward rnn + # the inputs are padded. Based on that and the length, we created a packed sequence + packed_seq = torch.nn.utils.rnn.pack_padded_sequence(input_rnn, lengths, enforce_sorted=False, + batch_first=True) + + # hidden state has to have shape (N_layers, N_batch, DIM_hidden), + # so we need to reshape and swap the first two axes. + policy_state_reshaped = policy_state.view(-1, self._num_hidden_layers, self._n_hidden_features) + policy_state_reshaped = torch.swapaxes(policy_state_reshaped, 0, 1) + + out_rnn, next_hidden = self._rnn(packed_seq, policy_state_reshaped) + + # we only need the last entry in each sequence + features_rnn, _ = torch.nn.utils.rnn.pad_packed_sequence(out_rnn, batch_first=True) + rel_indices = lengths.view(-1, 1, 1) - 1 + features_rnn = torch.squeeze(torch.take_along_dim(features_rnn, rel_indices, dim=1), dim=1) + + # post-rnn embedder. 
Here we again only need the last state + last_state = torch.squeeze(torch.take_along_dim(state, rel_indices, dim=1), dim=1) + feature_sa = self._act_func(self._h1_o_post_rnn(last_state)) + + # last layer + input_last_layer = torch.concat([feature_sa, features_rnn], dim=1) + a = self._h3(input_last_layer) + + return a, torch.swapaxes(next_hidden, 0, 1) + + +def get_POMDP_params(pomdp_type): + if pomdp_type == "no_velocities": + return dict(obs_to_hide=("velocities",), random_force_com=False) + elif pomdp_type == "no_positions": + return dict(obs_to_hide=("positions",), random_force_com=False) + elif pomdp_type == "windy": + return dict(obs_to_hide=tuple(), random_force_com=True) + + +@single_experiment +def experiment( + env: str = 'HalfCheetah-v4', + horizon: int = 1000, + gamma: float = 0.99, + n_epochs: int = 300, + n_steps_per_epoch: int = 50000, + n_steps_per_fit: int = 2000, + n_episode_eval: int = 10, + lr_actor: float = 0.001, + lr_critic: float = 0.001, + batch_size_actor: int = 32, + batch_size_critic: int = 32, + n_epochs_policy: int = 10, + clip_eps_ppo: float = 0.05, + gae_lambda: float = 0.95, + seed: int = 0, # This argument is mandatory + results_dir: str = './logs', # This argument is mandatory + use_cuda: bool = False, + std_0: float = 0.5, + rnn_type: str ="gru", + n_hidden_features: int = 128, + num_hidden_layers: int = 1, + truncation_length: int = 5 +): + np.random.seed(seed) + torch.manual_seed(seed) + + # prepare logging + results_dir = os.path.join(results_dir, str(seed)) + logger = Logger(results_dir=results_dir, log_name="stochastic_logging", seed=seed) + + # MDP + mdp = Gym(env, horizon=horizon, gamma=gamma) + + # create the policy + dim_env_state = mdp.info.observation_space.shape[0] + dim_action = mdp.info.action_space.shape[0] + + policy = RecurrentGaussianTorchPolicy(network=PPOActorBPTTNetwork, + policy_state_shape=(n_hidden_features,), + input_shape=(dim_env_state, ), + output_shape=(dim_action,), + n_features=128, + rnn_type=rnn_type, + n_hidden_features=n_hidden_features, + num_hidden_layers=num_hidden_layers, + dim_hidden_state=n_hidden_features, + dim_env_state=dim_env_state, + dim_action=dim_action, + std_0=std_0) + + # setup critic + input_shape_critic = (mdp.info.observation_space.shape[0]+2*n_hidden_features,) + critic_params = dict(network=PPOCriticBPTTNetwork, + optimizer={'class': optim.Adam, + 'params': {'lr': lr_critic, + 'weight_decay': 0.0}}, + loss=torch.nn.MSELoss(), + batch_size=batch_size_critic, + input_shape=input_shape_critic, + output_shape=(1,), + n_features=128, + n_hidden_features=n_hidden_features, + rnn_type=rnn_type, + num_hidden_layers=num_hidden_layers, + dim_env_state=mdp.info.observation_space.shape[0], + dim_hidden_state=n_hidden_features, + dim_action=dim_action, + use_cuda=use_cuda, + ) + + alg_params = dict(actor_optimizer={'class': optim.Adam, + 'params': {'lr': lr_actor, + 'weight_decay': 0.0}}, + n_epochs_policy=n_epochs_policy, + batch_size=batch_size_actor, + dim_env_state=dim_env_state, + eps_ppo=clip_eps_ppo, + lam=gae_lambda, + truncation_length=truncation_length + ) + + # Create the agent + agent = PPO_BPTT(mdp_info=mdp.info, policy=policy, critic_params=critic_params, **alg_params) + + # Create Core + core = Core(agent, mdp) + + # Evaluation + dataset = core.evaluate(n_episodes=5) + J = np.mean(dataset.discounted_return) + R = np.mean(dataset.undiscounted_return) + L = np.mean(dataset.episodes_length) + logger.log_numpy(R=R, J=J, L=L) + logger.epoch_info(0, R=R, J=J, L=L) + + for i in trange(1, n_epochs+1, 1, 
leave=False): + core.learn(n_steps=n_steps_per_epoch, n_steps_per_fit=n_steps_per_fit) + + # Evaluation + dataset = core.evaluate(n_episodes=n_episode_eval) + J = np.mean(dataset.discounted_return) + R = np.mean(dataset.undiscounted_return) + L = np.mean(dataset.episodes_length) + logger.log_numpy(R=R, J=J, L=L) + logger.epoch_info(i, R=R, J=J, L=L) + + +if __name__ == '__main__': + run_experiment(experiment) diff --git a/mushroom_rl/algorithms/actor_critic/__init__.py b/mushroom_rl/algorithms/actor_critic/__init__.py index aea310acf..ab3748f44 100644 --- a/mushroom_rl/algorithms/actor_critic/__init__.py +++ b/mushroom_rl/algorithms/actor_critic/__init__.py @@ -1,5 +1,5 @@ from .classic_actor_critic import StochasticAC, StochasticAC_AVG, COPDAC_Q -from .deep_actor_critic import DeepAC, A2C, DDPG, TD3, SAC, TRPO, PPO +from .deep_actor_critic import DeepAC, A2C, DDPG, TD3, SAC, TRPO, PPO, PPO_BPTT __all__ = ['COPDAC_Q', 'StochasticAC', 'StochasticAC_AVG', - 'DeepAC', 'A2C', 'DDPG', 'TD3', 'SAC', 'TRPO', 'PPO'] + 'DeepAC', 'A2C', 'DDPG', 'TD3', 'SAC', 'TRPO', 'PPO', 'PPO_BPTT'] diff --git a/mushroom_rl/algorithms/actor_critic/deep_actor_critic/__init__.py b/mushroom_rl/algorithms/actor_critic/deep_actor_critic/__init__.py index a2fbf5bb6..9740f682b 100644 --- a/mushroom_rl/algorithms/actor_critic/deep_actor_critic/__init__.py +++ b/mushroom_rl/algorithms/actor_critic/deep_actor_critic/__init__.py @@ -5,5 +5,6 @@ from .sac import SAC from .trpo import TRPO from .ppo import PPO +from .ppo_bptt import PPO_BPTT -__all__ = ['DeepAC', 'A2C', 'DDPG', 'TD3', 'SAC', 'TRPO', 'PPO'] \ No newline at end of file +__all__ = ['DeepAC', 'A2C', 'DDPG', 'TD3', 'SAC', 'TRPO', 'PPO', 'PPO_BPTT'] \ No newline at end of file diff --git a/mushroom_rl/algorithms/actor_critic/deep_actor_critic/ppo_bptt.py b/mushroom_rl/algorithms/actor_critic/deep_actor_critic/ppo_bptt.py new file mode 100644 index 000000000..97bc8d565 --- /dev/null +++ b/mushroom_rl/algorithms/actor_critic/deep_actor_critic/ppo_bptt.py @@ -0,0 +1,213 @@ +import torch + +from mushroom_rl.core import Agent +from mushroom_rl.approximators import Regressor +from mushroom_rl.approximators.parametric import TorchApproximator +from mushroom_rl.utils.torch import update_optimizer_parameters +from mushroom_rl.utils.minibatches import minibatch_generator +from mushroom_rl.utils.parameters import to_parameter +from mushroom_rl.utils.preprocessors import StandardizationPreprocessor + + +class PPO_BPTT(Agent): + """ + Proximal Policy Optimization algorithm. + "Proximal Policy Optimization Algorithms". + Schulman J. et al.. 2017. + + """ + def __init__(self, mdp_info, policy, actor_optimizer, critic_params, + n_epochs_policy, batch_size, eps_ppo, lam, dim_env_state, ent_coeff=0.0, + critic_fit_params=None, truncation_length=5): + """ + Constructor. 
+ + Args: + policy (TorchPolicy): torch policy to be learned by the algorithm + actor_optimizer (dict): parameters to specify the actor optimizer + algorithm; + critic_params (dict): parameters of the critic approximator to + build; + n_epochs_policy ([int, Parameter]): number of policy updates for every dataset; + batch_size ([int, Parameter]): size of minibatches for every optimization step + eps_ppo ([float, Parameter]): value for probability ratio clipping; + lam ([float, Parameter], 1.): lambda coefficient used by generalized + advantage estimation; + ent_coeff ([float, Parameter], 1.): coefficient for the entropy regularization term; + critic_fit_params (dict, None): parameters of the fitting algorithm + of the critic approximator. + + """ + self._critic_fit_params = dict(n_epochs=10) if critic_fit_params is None else critic_fit_params + + self._n_epochs_policy = to_parameter(n_epochs_policy) + self._batch_size = to_parameter(batch_size) + self._eps_ppo = to_parameter(eps_ppo) + + self._optimizer = actor_optimizer['class'](policy.parameters(), **actor_optimizer['params']) + + self._lambda = to_parameter(lam) + self._ent_coeff = to_parameter(ent_coeff) + + self._V = Regressor(TorchApproximator, **critic_params) + + self._truncation_length = truncation_length + self._dim_env_state = dim_env_state + + self._iter = 1 + + self._add_save_attr( + _critic_fit_params='pickle', + _n_epochs_policy='mushroom', + _batch_size='mushroom', + _eps_ppo='mushroom', + _ent_coeff='mushroom', + _optimizer='torch', + _lambda='mushroom', + _V='mushroom', + _iter='primitive', + _dim_env_state='primitive' + ) + + super().__init__(mdp_info, policy, None) + + # add the standardization preprocessor + self._preprocessors.append(StandardizationPreprocessor(mdp_info)) + + def divide_state_to_env_hidden_batch(self, states): + assert len(states.shape) > 1, "This function only divides batches of states." 
+ return states[:, 0:self._dim_env_state], states[:, self._dim_env_state:] + + def fit(self, dataset, **info): + obs, act, r, obs_next, absorbing, last = dataset.parse(to='torch') + policy_state, policy_next_state = dataset.parse_policy_state(to='torch') + obs_seq, policy_state_seq, act_seq, obs_next_seq, policy_next_state_seq, lengths = \ + self.transform_to_sequences(obs, policy_state, act, obs_next, policy_next_state, last, absorbing) + + v_target, adv = self.compute_gae(self._V, obs_seq, policy_state_seq, obs_next_seq, policy_next_state_seq, + lengths, r, absorbing, last, self.mdp_info.gamma, self._lambda()) + adv = (adv - torch.mean(adv)) / (torch.std(adv) + 1e-8) + + old_pol_dist = self.policy.distribution_t(obs_seq, policy_state_seq, lengths) + old_log_p = old_pol_dist.log_prob(act)[:, None].detach() + + self._V.fit(obs_seq, policy_state_seq, lengths, v_target, **self._critic_fit_params) + + self._update_policy(obs_seq, policy_state_seq, act, lengths, adv, old_log_p) + + # Print fit information + self._log_info(dataset, obs_seq, policy_state_seq, lengths, v_target, old_pol_dist) + self._iter += 1 + + def transform_to_sequences(self, states, policy_states, actions, next_states, policy_next_states, last, absorbing): + + s = torch.empty(len(states), self._truncation_length, states.shape[-1]) + ps = torch.empty(len(states), policy_states.shape[-1]) + a = torch.empty(len(actions), self._truncation_length, actions.shape[-1]) + ss = torch.empty(len(states), self._truncation_length, states.shape[-1]) + pss = torch.empty(len(states), policy_states.shape[-1]) + lengths = torch.empty(len(states), dtype=torch.long) + + for i in range(len(states)): + # determine the begin of a sequence + begin_seq = max(i - self._truncation_length + 1, 0) + end_seq = i + 1 + + # maybe the sequence contains more than one trajectory, so we need to cut it so that it contains only one + lasts_absorbing = last[begin_seq - 1: i].int() + absorbing[begin_seq - 1: i].int() + begin_traj = torch.where(lasts_absorbing > 0) + sequence_is_shorter_than_requested = len(*begin_traj) > 0 + if sequence_is_shorter_than_requested: + begin_seq = begin_seq + begin_traj[0][-1] + + # get the sequences + states_seq = states[begin_seq:end_seq] + actions_seq = actions[begin_seq:end_seq] + next_states_seq = next_states[begin_seq:end_seq] + + # apply padding + length_seq = len(states_seq) + padded_states = torch.concatenate([states_seq, + torch.zeros((self._truncation_length - states_seq.shape[0], + states_seq.shape[1]))]) + padded_next_states = torch.concatenate([next_states_seq, + torch.zeros((self._truncation_length - next_states_seq.shape[0], + next_states_seq.shape[1]))]) + padded_action_seq = torch.concatenate([actions_seq, + torch.zeros((self._truncation_length - actions_seq.shape[0], + actions_seq.shape[1]))]) + + s[i] = padded_states + ps[i] = policy_states[begin_seq] + a[i] = padded_action_seq + ss[i] = padded_next_states + pss[i] = policy_next_states[begin_seq] + + lengths[i] = length_seq + + return s.detach(), ps.detach(), a.detach(), ss.detach(), pss.detach(), lengths.detach() + + def _update_policy(self, obs, pi_h, act, lengths, adv, old_log_p): + for epoch in range(self._n_epochs_policy()): + for obs_i, pi_h_i, act_i, length_i, adv_i, old_log_p_i in minibatch_generator( + self._batch_size(), obs, pi_h, act, lengths, adv, old_log_p): + self._optimizer.zero_grad() + prob_ratio = torch.exp( + self.policy.log_prob_t(obs_i, act_i, pi_h_i, length_i) - old_log_p_i + ) + clipped_ratio = torch.clamp(prob_ratio, 1 - self._eps_ppo(), 1 + 
self._eps_ppo.get_value()) + loss = -torch.mean(torch.min(prob_ratio * adv_i, clipped_ratio * adv_i)) + loss -= self._ent_coeff()*self.policy.entropy_t(obs_i) + loss.backward() + self._optimizer.step() + + def _log_info(self, dataset, x, pi_h, lengths, v_target, old_pol_dist): + pass + + def _post_load(self): + if self._optimizer is not None: + update_optimizer_parameters(self._optimizer, list(self.policy.parameters())) + + @staticmethod + def compute_gae(V, s, pi_h, ss, pi_hn, lengths, r, absorbing, last, gamma, lam): + """ + Function to compute Generalized Advantage Estimation (GAE) + and new value function target over a dataset. + + "High-Dimensional Continuous Control Using Generalized + Advantage Estimation". + Schulman J. et al.. 2016. + + Args: + V (Regressor): the current value function regressor; + s (numpy.ndarray): the set of states in which we want + to evaluate the advantage; + ss (numpy.ndarray): the set of next states in which we want + to evaluate the advantage; + r (numpy.ndarray): the reward obtained in each transition + from state s to state ss; + absorbing (numpy.ndarray): an array of boolean flags indicating + if the reached state is absorbing; + last (numpy.ndarray): an array of boolean flags indicating + if the reached state is the last of the trajectory; + gamma (float): the discount factor of the considered problem; + lam (float): the value for the lamba coefficient used by GEA + algorithm. + Returns: + The new estimate for the value function of the next state + and the estimated generalized advantage. + """ + with torch.no_grad(): + v = V(s, pi_h, lengths, output_tensor=True) + v_next = V(ss, pi_hn, lengths, output_tensor=True) + gen_adv = torch.empty_like(v) + for rev_k in range(len(v)): + k = len(v) - rev_k - 1 + if last[k] or rev_k == 0: + gen_adv[k] = r[k] - v[k] + if not absorbing[k]: + gen_adv[k] += gamma * v_next[k] + else: + gen_adv[k] = r[k] + gamma * v_next[k] - v[k] + gamma * lam * gen_adv[k + 1] + + return gen_adv + v, gen_adv diff --git a/mushroom_rl/core/_impl/__init__.py b/mushroom_rl/core/_impl/__init__.py index 385d2bad9..b43e6afa1 100644 --- a/mushroom_rl/core/_impl/__init__.py +++ b/mushroom_rl/core/_impl/__init__.py @@ -1,4 +1,5 @@ from .numpy_dataset import NumpyDataset from .torch_dataset import TorchDataset from .list_dataset import ListDataset -from .type_conversions import DataConversion, NumpyConversion, TorchConversion, ListConversion \ No newline at end of file +from .type_conversions import DataConversion, NumpyConversion, TorchConversion, ListConversion +from .core_logic import CoreLogic \ No newline at end of file diff --git a/mushroom_rl/core/_impl/core_logic.py b/mushroom_rl/core/_impl/core_logic.py new file mode 100644 index 000000000..babb41e27 --- /dev/null +++ b/mushroom_rl/core/_impl/core_logic.py @@ -0,0 +1,81 @@ +from tqdm import tqdm + + +class CoreLogic(object): + def __init__(self): + self.fit_required = None + self.move_required = None + + self._total_episodes_counter = 0 + self._total_steps_counter = 0 + self._current_episodes_counter = 0 + self._current_steps_counter = 0 + + self._n_episodes = None + self._n_steps_per_fit = None + self._n_episodes_per_fit = None + + self._steps_progress_bar = None + self._episodes_progress_bar = None + + def initialize_fit(self, n_steps_per_fit, n_episodes_per_fit): + assert (n_episodes_per_fit is not None and n_steps_per_fit is None) \ + or (n_episodes_per_fit is None and n_steps_per_fit is not None) + + self._n_steps_per_fit = n_steps_per_fit + self._n_episodes_per_fit = 
n_episodes_per_fit + + if n_steps_per_fit is not None: + self.fit_required = lambda: self._current_steps_counter >= self._n_steps_per_fit + else: + self.fit_required = lambda: self._current_episodes_counter >= self._n_episodes_per_fit + + def initialize_evaluate(self): + self.fit_required = lambda: False + + def initialize_run(self, n_steps, n_episodes, initial_states, quiet): + assert n_episodes is not None and n_steps is None and initial_states is None\ + or n_episodes is None and n_steps is not None and initial_states is None\ + or n_episodes is None and n_steps is None and initial_states is not None + + self._n_episodes = len(initial_states) if initial_states is not None else n_episodes + + if n_steps is not None: + self.move_required = lambda: self._total_steps_counter < n_steps + + self._steps_progress_bar = tqdm(total=n_steps, dynamic_ncols=True, disable=quiet, leave=False) + self._episodes_progress_bar = tqdm(disable=True) + else: + self.move_required = lambda: self._total_episodes_counter < self._n_episodes + + self._steps_progress_bar = tqdm(disable=True) + self._episodes_progress_bar = tqdm(total=self._n_episodes, dynamic_ncols=True, disable=quiet, leave=False) + + self._total_episodes_counter = 0 + self._total_steps_counter = 0 + self._current_episodes_counter = 0 + self._current_steps_counter = 0 + + def get_initial_state(self, initial_states): + if initial_states is None or self._total_episodes_counter == self._n_episodes: + return None + else: + return initial_states[self._total_episodes_counter] + + def after_step(self, last): + self._total_steps_counter += 1 + self._current_steps_counter += 1 + self._steps_progress_bar.update(1) + + if last: + self._total_episodes_counter += 1 + self._current_episodes_counter += 1 + self._episodes_progress_bar.update(1) + + def after_fit(self): + self._current_episodes_counter = 0 + self._current_steps_counter = 0 + + def terminate_run(self): + self._steps_progress_bar.close() + self._episodes_progress_bar.close() \ No newline at end of file diff --git a/mushroom_rl/core/core.py b/mushroom_rl/core/core.py index fbf854395..5c76df230 100644 --- a/mushroom_rl/core/core.py +++ b/mushroom_rl/core/core.py @@ -1,8 +1,8 @@ -from tqdm import tqdm - from mushroom_rl.core.dataset import Dataset from mushroom_rl.utils.record import VideoRecorder +from ._impl import CoreLogic + class Core(object): """ @@ -27,15 +27,9 @@ def __init__(self, agent, mdp, callbacks_fit=None, callback_step=None, record_di self._state = None self._policy_state = None - - self._total_episodes_counter = 0 - self._total_steps_counter = 0 - self._current_episodes_counter = 0 - self._current_steps_counter = 0 self._episode_steps = None - self._n_episodes = None - self._n_steps_per_fit = None - self._n_episodes_per_fit = None + + self._core_logic = CoreLogic() if record_dictionary is None: record_dictionary = dict() @@ -62,23 +56,12 @@ def learn(self, n_steps=None, n_episodes=None, n_steps_per_fit=None, should be set to True. 
""" - assert (n_episodes_per_fit is not None and n_steps_per_fit is None)\ - or (n_episodes_per_fit is None and n_steps_per_fit is not None) - assert (render and record) or (not record), "To record, the render flag must be set to true" + self._core_logic.initialize_fit(n_steps_per_fit, n_episodes_per_fit) - self._n_steps_per_fit = n_steps_per_fit - self._n_episodes_per_fit = n_episodes_per_fit - - if n_steps_per_fit is not None: - fit_condition = lambda: self._current_steps_counter >= self._n_steps_per_fit - else: - fit_condition = lambda: self._current_episodes_counter >= self._n_episodes_per_fit - - dataset = Dataset(self.mdp.info, self.agent.policy.policy_state_shape, - self._n_steps_per_fit, self._n_episodes_per_fit) + dataset = Dataset(self.mdp.info, self.agent.policy.policy_state_shape, n_steps_per_fit, n_episodes_per_fit) - self._run(dataset, n_steps, n_episodes, fit_condition, render, quiet, record) + self._run(dataset, n_steps, n_episodes, render, quiet, record) def evaluate(self, initial_states=None, n_steps=None, n_episodes=None, render=False, quiet=False, record=False): """ @@ -96,73 +79,36 @@ def evaluate(self, initial_states=None, n_steps=None, n_episodes=None, render=Fa should be set to True. Returns: - The collected dataset and, optionally, an extra dataset of - environment info, collected at each step. + The collected dataset. """ assert (render and record) or (not record), "To record, the render flag must be set to true" - fit_condition = lambda: False + self._core_logic.initialize_evaluate() n_episodes_dataset = len(initial_states) if initial_states is not None else n_episodes dataset = Dataset(self.mdp.info, self.agent.policy.policy_state_shape, n_steps, n_episodes_dataset) - return self._run(dataset, n_steps, n_episodes, fit_condition, render, quiet, record, initial_states) + return self._run(dataset, n_steps, n_episodes, render, quiet, record, initial_states) - def _run(self, dataset, n_steps, n_episodes, fit_condition, render, quiet, record, initial_states=None): - assert n_episodes is not None and n_steps is None and initial_states is None\ - or n_episodes is None and n_steps is not None and initial_states is None\ - or n_episodes is None and n_steps is None and initial_states is not None - - self._n_episodes = len(initial_states) if initial_states is not None else n_episodes - - if n_steps is not None: - move_condition = lambda: self._total_steps_counter < n_steps - - steps_progress_bar = tqdm(total=n_steps, dynamic_ncols=True, disable=quiet, leave=False) - episodes_progress_bar = tqdm(disable=True) - else: - move_condition = lambda: self._total_episodes_counter < self._n_episodes - - steps_progress_bar = tqdm(disable=True) - episodes_progress_bar = tqdm(total=self._n_episodes, dynamic_ncols=True, disable=quiet, leave=False) - - self._run_impl(dataset, move_condition, fit_condition, steps_progress_bar, episodes_progress_bar, render, - record, initial_states) - - return dataset - - def _run_impl(self, dataset, move_condition, fit_condition, steps_progress_bar, episodes_progress_bar, render, - record, initial_states): - self._total_episodes_counter = 0 - self._total_steps_counter = 0 - self._current_episodes_counter = 0 - self._current_steps_counter = 0 + def _run(self, dataset, n_steps, n_episodes, render, quiet, record, initial_states=None): + self._core_logic.initialize_run(n_steps, n_episodes, initial_states, quiet) last = True - while move_condition(): + while self._core_logic.move_required(): if last: - self.reset(initial_states) + self._reset(initial_states) 
sample, step_info = self._step(render, record) self.callback_step(sample) - - self._total_steps_counter += 1 - self._current_steps_counter += 1 - steps_progress_bar.update(1) - - if sample[5]: - self._total_episodes_counter += 1 - self._current_episodes_counter += 1 - episodes_progress_bar.update(1) + self._core_logic.after_step(sample[5]) dataset.append(sample, step_info) - if fit_condition(): + if self._core_logic.fit_required(): self.agent.fit(dataset) - self._current_episodes_counter = 0 - self._current_steps_counter = 0 + self._core_logic.after_fit() for c in self.callbacks_fit: c(dataset) @@ -177,8 +123,9 @@ def _run_impl(self, dataset, move_condition, fit_condition, steps_progress_bar, if record: self._record.stop() - steps_progress_bar.close() - episodes_progress_bar.close() + self._core_logic.terminate_run() + + return dataset def _step(self, render, record): """ @@ -195,14 +142,14 @@ def _step(self, render, record): action, policy_next_state = self.agent.draw_action(self._state, self._policy_state) next_state, reward, absorbing, step_info = self.mdp.step(action) - self._episode_steps += 1 - if render: frame = self.mdp.render(record) if record: self._record(frame) + self._episode_steps += 1 + last = not(self._episode_steps < self.mdp.info.horizon and not absorbing) state = self._state @@ -213,21 +160,18 @@ def _step(self, render, record): return (state, action, reward, next_state, absorbing, last, policy_state, policy_next_state), step_info - def reset(self, initial_states=None): + def _reset(self, initial_states): """ Reset the state of the agent. """ - if initial_states is None or self._total_episodes_counter == self._n_episodes: - initial_state = None - else: - initial_state = initial_states[self._total_episodes_counter] + initial_state = self._core_logic.get_initial_state(initial_states) state, episode_info = self.mdp.reset(initial_state) self._policy_state = self.agent.episode_start(episode_info) - self._state = self._preprocess(state) self.agent.next_action = None + self._episode_steps = 0 def _preprocess(self, state): diff --git a/mushroom_rl/core/dataset.py b/mushroom_rl/core/dataset.py index c5317ec23..6e94e721d 100644 --- a/mushroom_rl/core/dataset.py +++ b/mushroom_rl/core/dataset.py @@ -255,6 +255,18 @@ def parse(self, to='numpy'): return self._converter.convert(self.state, self.action, self.reward, self.next_state, self.absorbing, self.last, to=to) + def parse_policy_state(self, to='numpy'): + """ + Return the dataset as set of arrays. + + to (str, numpy): the backend to be used for the returned arrays. + + Returns: + A tuple containing the arrays that define the dataset, i.e. state, action, next state, absorbing and last + + """ + return self._converter.convert(self.policy_state, self.policy_next_state, to=to) + def select_first_episodes(self, n_episodes): """ Return the first ``n_episodes`` episodes in the provided dataset. 
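Note on the new dataset interface used throughout this patch series: the parsing helpers formerly imported from mushroom_rl.utils.dataset are replaced by methods on the Dataset object returned by Core. The sketch below is a minimal, hypothetical usage example (agent and env are placeholders for an already constructed agent/environment pair; the episode count is arbitrary), showing how the transitions and the policy states of a stateful policy can be retrieved with a chosen backend.

    # minimal sketch of the new Dataset API, assuming `agent` and `env` already exist
    from mushroom_rl.core import Core

    core = Core(agent, env)
    dataset = core.evaluate(n_episodes=5, quiet=True)   # evaluate() now returns only the Dataset

    # parse the transitions with the desired backend ('numpy' by default, 'torch' here)
    state, action, reward, next_state, absorbing, last = dataset.parse(to='torch')

    # policy states of stateful (e.g. recurrent) policies are parsed separately
    policy_state, policy_next_state = dataset.parse_policy_state(to='torch')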
diff --git a/mushroom_rl/policy/__init__.py b/mushroom_rl/policy/__init__.py index 2bc435dde..503ed676d 100644 --- a/mushroom_rl/policy/__init__.py +++ b/mushroom_rl/policy/__init__.py @@ -5,6 +5,7 @@ StateStdGaussianPolicy, StateLogStdGaussianPolicy from .deterministic_policy import DeterministicPolicy from .torch_policy import TorchPolicy, GaussianTorchPolicy, BoltzmannTorchPolicy +from .recurrent_torch_policy import RecurrentGaussianTorchPolicy from .promps import ProMP diff --git a/mushroom_rl/policy/recurrent_torch_policy.py b/mushroom_rl/policy/recurrent_torch_policy.py new file mode 100644 index 000000000..0849d3cc3 --- /dev/null +++ b/mushroom_rl/policy/recurrent_torch_policy.py @@ -0,0 +1,63 @@ +import torch +import numpy as np + +from mushroom_rl.policy import GaussianTorchPolicy +from mushroom_rl.utils.torch import to_float_tensor +from mushroom_rl.utils.parameters import to_parameter + + +class RecurrentGaussianTorchPolicy(GaussianTorchPolicy): + def __init__(self, policy_state_shape, log_std_min=-20, log_std_max=2, **kwargs): + + super().__init__(policy_state_shape=policy_state_shape, **kwargs) + + self._log_std_min = to_parameter(log_std_min) + self._log_std_max = to_parameter(log_std_max) + + def reset(self): + return torch.zeros(self.policy_state_shape) + + def draw_action(self, state, policy_state): + with torch.no_grad(): + state = to_float_tensor(state) + policy_state = torch.as_tensor(policy_state) + a, policy_state = self.draw_action_t(state, policy_state) + return torch.squeeze(a, dim=0).detach().cpu().numpy(), policy_state + + def draw_action_t(self, state, policy_state): + lengths = torch.tensor([1]) + state = torch.atleast_2d(state).view(1, 1, -1) + policy_state = torch.atleast_2d(policy_state) + + dist, policy_state = self.distribution_and_policy_state_t(state, policy_state, lengths) + action = dist.sample().detach() + + return action, policy_state + + def log_prob_t(self, state, action, policy_state, lengths): + return self.distribution_t(state, policy_state, lengths).log_prob(action.squeeze())[:, None] + + def entropy_t(self, state=None): + return self._action_dim / 2 * np.log(2 * np.pi * np.e) + torch.sum(self._log_sigma) + + def distribution(self, state, policy_state, lengths): + s = to_float_tensor(state, self._use_cuda) + + return self.distribution_t(s, policy_state, lengths) + + def distribution_t(self, state, policy_state, lengths): + mu, sigma, _ = self.get_mean_and_covariance_and_policy_state(state, policy_state, lengths) + return torch.distributions.MultivariateNormal(loc=mu, covariance_matrix=sigma) + + def distribution_and_policy_state_t(self, state, policy_state, lengths): + mu, sigma, policy_state = self.get_mean_and_covariance_and_policy_state(state, policy_state, lengths) + return torch.distributions.MultivariateNormal(loc=mu, covariance_matrix=sigma), policy_state + + def get_mean_and_covariance_and_policy_state(self, state, policy_state, lengths): + mu, next_hidden_state = self._mu(state, policy_state, lengths, **self._predict_params, output_tensor=True) + + # Bound the log_std + log_sigma = torch.clamp(self._log_sigma, self._log_std_min(), self._log_std_max()) + + covariance = torch.diag(torch.exp(2 * log_sigma)) + return mu, covariance, next_hidden_state From f741ea1d96ee03aa8b5d27e8eab76847dbe886e5 Mon Sep 17 00:00:00 2001 From: boris-il-forte Date: Thu, 19 Oct 2023 19:08:06 +0200 Subject: [PATCH 19/24] Removed gradient computation from value functions - now value function methods do not use gradient computations --- 
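For context on the change described above: wrapping the value-function evaluations in torch.no_grad() avoids recording an autograd graph for quantities that are only used as fixed regression targets, which saves memory and time without changing the results. A minimal sketch of the pattern, using a stand-in linear value network and random data purely for illustration:

    import torch
    import torch.nn as nn

    V = nn.Linear(4, 1)            # stand-in for the value-function approximator
    s = torch.randn(10, 4)         # states
    ss = torch.randn(10, 4)        # next states
    r = torch.randn(10)            # rewards
    absorbing = torch.zeros(10)    # absorbing flags
    gamma = 0.99

    with torch.no_grad():          # no gradient graph is recorded here
        v_next = V(ss).squeeze() * (1 - absorbing)
        q = r + gamma * v_next
        adv = q - V(s).squeeze()
    # q and adv carry no gradient and can be fed to the critic fit as plain targets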
mushroom_rl/utils/value_functions.py | 57 +++++++++++++++------------- 1 file changed, 30 insertions(+), 27 deletions(-) diff --git a/mushroom_rl/utils/value_functions.py b/mushroom_rl/utils/value_functions.py index e93ebb539..496aa558d 100644 --- a/mushroom_rl/utils/value_functions.py +++ b/mushroom_rl/utils/value_functions.py @@ -22,18 +22,19 @@ def compute_advantage_montecarlo(V, s, ss, r, absorbing, gamma): The new estimate for the value function of the next state and the advantage function. """ - r = r.squeeze() - q = torch.zeros(len(r)) - v = V(s, output_tensor=True).squeeze() + with torch.no_grad(): + r = r.squeeze() + q = torch.zeros(len(r)) + v = V(s, output_tensor=True).squeeze() - q_next = V(ss[-1]).squeeze().item() - for rev_k in range(len(r)): - k = len(r) - rev_k - 1 - q_next = r[k] + gamma * q_next * (1 - absorbing[k].int()) - q[k] = q_next + q_next = V(ss[-1]).squeeze().item() + for rev_k in range(len(r)): + k = len(r) - rev_k - 1 + q_next = r[k] + gamma * q_next * (1 - absorbing[k].int()) + q[k] = q_next - adv = q - v - return q[:, None], adv[:, None] + adv = q - v + return q[:, None], adv[:, None] def compute_advantage(V, s, ss, r, absorbing, gamma): @@ -56,12 +57,13 @@ def compute_advantage(V, s, ss, r, absorbing, gamma): The new estimate for the value function of the next state and the advantage function. """ - v = V(s, output_tensor=True).squeeze() - v_next = V(ss).squeeze() * (1 - absorbing.int()) + with torch.no_grad(): + v = V(s, output_tensor=True).squeeze() + v_next = V(ss).squeeze() * (1 - absorbing.int()) - q = r + gamma * v_next - adv = q - v - return q[:, None], adv[:, None] + q = r + gamma * v_next + adv = q - v + return q[:, None], adv[:, None] def compute_gae(V, s, ss, r, absorbing, last, gamma, lam): @@ -92,15 +94,16 @@ def compute_gae(V, s, ss, r, absorbing, last, gamma, lam): The new estimate for the value function of the next state and the estimated generalized advantage. """ - v = V(s, output_tensor=True) - v_next = V(ss, output_tensor=True) - gen_adv = torch.empty_like(v) - for rev_k in range(len(v)): - k = len(v) - rev_k - 1 - if last[k] or rev_k == 0: - gen_adv[k] = r[k] - v[k] - if not absorbing[k]: - gen_adv[k] += gamma * v_next[k] - else: - gen_adv[k] = r[k] + gamma * v_next[k] - v[k] + gamma * lam * gen_adv[k + 1] - return gen_adv + v, gen_adv \ No newline at end of file + with torch.no_grad(): + v = V(s, output_tensor=True) + v_next = V(ss, output_tensor=True) + gen_adv = torch.empty_like(v) + for rev_k in range(len(v)): + k = len(v) - rev_k - 1 + if last[k] or rev_k == 0: + gen_adv[k] = r[k] - v[k] + if not absorbing[k]: + gen_adv[k] += gamma * v_next[k] + else: + gen_adv[k] = r[k] + gamma * v_next[k] - v[k] + gamma * lam * gen_adv[k + 1] + return gen_adv + v, gen_adv \ No newline at end of file From e6320ffbbacd8597ed55754ba94222915be204cd Mon Sep 17 00:00:00 2001 From: boris-il-forte Date: Thu, 19 Oct 2023 20:02:43 +0200 Subject: [PATCH 20/24] Added parallel environments --- mushroom_rl/core/parallel_environment.py | 124 +++++++++++++++++++++++ 1 file changed, 124 insertions(+) create mode 100644 mushroom_rl/core/parallel_environment.py diff --git a/mushroom_rl/core/parallel_environment.py b/mushroom_rl/core/parallel_environment.py new file mode 100644 index 000000000..6ed2e793b --- /dev/null +++ b/mushroom_rl/core/parallel_environment.py @@ -0,0 +1,124 @@ +from .environment import Environment + + +class ParallelEnvironment(object): + """ + Basic interface to generate and collect multiple copies of the same environment. 
+    This class assumes that the environments are homogeneous, i.e. have the same type and MDP info.
+
+    """
+    def __init__(self, env_list):
+        """
+        Constructor.
+
+        Args:
+            env_list: list of the environments to be evaluated in parallel.
+
+        """
+        self.envs = env_list
+
+    @property
+    def info(self):
+        """
+        Returns:
+            An object containing the info of all environments.
+
+        """
+        return self.envs[0].info
+
+    def __len__(self):
+        return len(self.envs)
+
+    def __getitem__(self, item):
+        return self.envs[item]
+
+    def seed(self, seeds):
+        """
+        Set the seed of all environments.
+
+        Args:
+            seeds ([int, list]): the value of the seed or a list of seeds for each environment. The list length must be
+                equal to the number of parallel environments.
+
+        """
+        if isinstance(seeds, list):
+            assert len(seeds) == len(self)
+            for env, seed in zip(self.envs, seeds):
+                env.seed(seed)
+        else:
+            for env in self.envs:
+                env.seed(seeds)
+
+    def stop(self):
+        """
+        Method used to stop an MDP. Useful when dealing with real world environments, simulators, or when using
+        OpenAI Gym rendering.
+
+        """
+        for env in self.envs:
+            env.stop()
+
+    @staticmethod
+    def make(env_name, n_envs, use_constructor=False, *args, **kwargs):
+        """
+        Generate multiple copies of a given environment using the specified name and parameters.
+        The environment is created using the generate method, if available. Otherwise, the constructor is used.
+        See the `Environment.make` documentation for more information.
+
+        Args:
+            env_name (str): name of the environment;
+            n_envs (int): number of environments to generate in parallel;
+            use_constructor (bool, False): whether to force the method to use the constructor instead of the generate
+                method;
+            *args: positional arguments to be provided to the environment generator/constructor;
+            **kwargs: keyword arguments to be provided to the environment generator/constructor.
+
+        Returns:
+            An instance of the constructed environment.
+
+        """
+        if '.' in env_name:
+            env_data = env_name.split('.')
+            env_name = env_data[0]
+            args = env_data[1:] + list(args)
+
+        env = Environment._registered_envs[env_name]
+
+        if not use_constructor and hasattr(env, 'generate'):
+            return ParallelEnvironment.generate(env, n_envs, *args, **kwargs)
+        else:
+            return ParallelEnvironment([env(*args, **kwargs) for _ in range(n_envs)])
+
+    @staticmethod
+    def init(env, n_envs, *args, **kwargs):
+        """
+        Method to generate an array of multiple copies of the same environment, calling the constructor n_envs times.
+
+        Args:
+            env (class): the environment to be constructed;
+            n_envs (int): number of environments to generate;
+            *args: positional arguments to be passed to the constructor;
+            **kwargs: keyword arguments to be passed to the constructor.
+
+        Returns:
+            A list containing multiple copies of the environment.
+
+        """
+        return ParallelEnvironment([env(*args, **kwargs) for _ in range(n_envs)])
+
+    @staticmethod
+    def generate(env, n_envs, *args, **kwargs):
+        """
+        Method to generate an array of multiple copies of the same environment, calling the generate method n_envs times.
+
+        Args:
+            env (class): the environment to be constructed;
+            n_envs (int): number of environments to generate;
+            *args: positional arguments to be passed to the generate method;
+            **kwargs: keyword arguments to be passed to the generate method.
+
+        Returns:
+            A list containing multiple copies of the environment.
+ + """ + return ParallelEnvironment([env.generate(*args, **kwargs) for _ in range(n_envs)]) \ No newline at end of file From d1de31cb000a7197ac1e9f6005fc71853b4bb19b Mon Sep 17 00:00:00 2001 From: boris-il-forte Date: Thu, 19 Oct 2023 20:15:26 +0200 Subject: [PATCH 21/24] Fixed some issuesn when adding datasets toghether - now the dataset add operation should set the last flag correctly --- mushroom_rl/core/_impl/list_dataset.py | 9 +++++++-- mushroom_rl/core/_impl/numpy_dataset.py | 1 + mushroom_rl/core/_impl/torch_dataset.py | 1 + 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/mushroom_rl/core/_impl/list_dataset.py b/mushroom_rl/core/_impl/list_dataset.py index 4455c02d0..3f781ba8b 100644 --- a/mushroom_rl/core/_impl/list_dataset.py +++ b/mushroom_rl/core/_impl/list_dataset.py @@ -11,7 +11,8 @@ def __init__(self): self._policy_dataset = list() self._add_save_attr( - _dataset='pickle' + _dataset='pickle', + _policy_dataset='pickle' ) @classmethod @@ -58,7 +59,11 @@ def __getitem__(self, index): def __add__(self, other): result = self.copy() - result._dataset = self._dataset + other._dataset + last_step = result._dataset[-1] + modified_last_step = last_step[:-1] + (True,) + result._dataset[-1] = modified_last_step + result._dataset = result._dataset + other._dataset + result._policy_dataset = result._policy_dataset + other._policy_dataset return result diff --git a/mushroom_rl/core/_impl/numpy_dataset.py b/mushroom_rl/core/_impl/numpy_dataset.py index a00b7a04f..81141c729 100644 --- a/mushroom_rl/core/_impl/numpy_dataset.py +++ b/mushroom_rl/core/_impl/numpy_dataset.py @@ -150,6 +150,7 @@ def __add__(self, other): result._next_states = np.concatenate((self.next_state, other.next_state)) result._absorbing = np.concatenate((self.absorbing, other.absorbing)) result._last = np.concatenate((self.last, other.last)) + result._last[len(self)-1] = True result._len = len(self) + len(other) if result._policy_states is not None: diff --git a/mushroom_rl/core/_impl/torch_dataset.py b/mushroom_rl/core/_impl/torch_dataset.py index fe9ab683a..073aa68ac 100644 --- a/mushroom_rl/core/_impl/torch_dataset.py +++ b/mushroom_rl/core/_impl/torch_dataset.py @@ -150,6 +150,7 @@ def __add__(self, other): result._next_states = torch.concatenate((self.next_state, other.next_state)) result._absorbing = torch.concatenate((self.absorbing, other.absorbing)) result._last = torch.concatenate((self.last, other.last)) + result._last[len(self) - 1] = True result._len = len(self) + len(other) if result._policy_states is not None: From 575bcef90e6dc9ec81904d2128e5694f47361eb7 Mon Sep 17 00:00:00 2001 From: Boris_il_forte Date: Sat, 21 Oct 2023 20:28:15 +0200 Subject: [PATCH 22/24] Removed features from agent, added agent info - features are not used anymore by the agent - features moved to linear approximator - fixed algorithms to use features implicitly --- .../classic_actor_critic/copdac_q.py | 14 +- .../classic_actor_critic/stochastic_ac.py | 31 ++-- .../actor_critic/deep_actor_critic/ppo.py | 2 +- .../actor_critic/deep_actor_critic/trpo.py | 2 +- .../black_box_optimization.py | 4 +- .../constrained_reps.py | 4 +- .../black_box_optimization/more.py | 4 +- .../black_box_optimization/pgpe.py | 5 +- .../black_box_optimization/reps.py | 4 +- .../black_box_optimization/rwr.py | 4 +- .../policy_search/policy_gradient/enac.py | 5 +- .../policy_search/policy_gradient/gpomdp.py | 4 +- .../policy_gradient/policy_gradient.py | 7 +- .../policy_gradient/reinforce.py | 4 +- .../algorithms/value/batch_td/batch_td.py | 5 +- 
mushroom_rl/algorithms/value/batch_td/lspi.py | 12 +- .../algorithms/value/td/sarsa_lambda.py | 3 +- .../value/td/sarsa_lambda_continuous.py | 12 +- mushroom_rl/algorithms/value/td/td.py | 8 +- .../value/td/true_online_sarsa_lambda.py | 16 +- mushroom_rl/approximators/parametric/cmac.py | 6 +- .../approximators/parametric/linear.py | 33 +++-- mushroom_rl/core/__init__.py | 4 +- mushroom_rl/core/agent.py | 67 +++++---- tests/algorithms/helper/utils.py | 16 +- tests/algorithms/test_dpg.py | 8 +- tests/algorithms/test_lspi.py | 6 +- tests/algorithms/test_stochastic_ac.py | 15 +- tests/algorithms/test_td.py | 140 ++++++------------ 29 files changed, 200 insertions(+), 245 deletions(-) diff --git a/mushroom_rl/algorithms/actor_critic/classic_actor_critic/copdac_q.py b/mushroom_rl/algorithms/actor_critic/classic_actor_critic/copdac_q.py index 7787e785d..7086492ed 100644 --- a/mushroom_rl/algorithms/actor_critic/classic_actor_critic/copdac_q.py +++ b/mushroom_rl/algorithms/actor_critic/classic_actor_critic/copdac_q.py @@ -14,8 +14,7 @@ class COPDAC_Q(Agent): Silver D. et al.. 2014. """ - def __init__(self, mdp_info, policy, mu, alpha_theta, alpha_omega, alpha_v, - value_function_features=None, policy_features=None): + def __init__(self, mdp_info, policy, mu, alpha_theta, alpha_omega, alpha_v, value_function_features=None): """ Constructor. @@ -27,7 +26,6 @@ def __init__(self, mdp_info, policy, mu, alpha_theta, alpha_omega, alpha_v, alpha_v ([float, Parameter]): learning rate for the value function; value_function_features (Features, None): features used by the value function approximator; - policy_features (Features, None): features used by the policy. """ self._mu = mu @@ -59,19 +57,18 @@ def __init__(self, mdp_info, policy, mu, alpha_theta, alpha_omega, alpha_v, _A='mushroom' ) - super().__init__(mdp_info, policy, policy_features) + super().__init__(mdp_info, policy) def fit(self, dataset, **info): for step in dataset: s, a, r, ss, absorbing, _ = step - s_phi = self.phi(s) if self.phi is not None else s s_psi = self._psi(s) if self._psi is not None else s ss_psi = self._psi(ss) if self._psi is not None else ss q_next = self._V(ss_psi).item() if not absorbing else 0 - grad_mu_s = np.atleast_2d(self._mu.diff(s_phi)) + grad_mu_s = np.atleast_2d(self._mu.diff(s)) omega = self._A.get_weights() delta = r + self.mdp_info.gamma * q_next - self._Q(s, a) @@ -96,8 +93,7 @@ def _Q(self, state, action): action)).item() def _nu(self, state, action): - state_phi = self.phi(state) if self.phi is not None else state - grad_mu = np.atleast_2d(self._mu.diff(state_phi)) - delta = action - self._mu(state_phi) + grad_mu = np.atleast_2d(self._mu.diff(state)) + delta = action - self._mu(state) return delta.dot(grad_mu) diff --git a/mushroom_rl/algorithms/actor_critic/classic_actor_critic/stochastic_ac.py b/mushroom_rl/algorithms/actor_critic/classic_actor_critic/stochastic_ac.py index 2ea4fe085..03434927a 100644 --- a/mushroom_rl/algorithms/actor_critic/classic_actor_critic/stochastic_ac.py +++ b/mushroom_rl/algorithms/actor_critic/classic_actor_critic/stochastic_ac.py @@ -14,8 +14,7 @@ class StochasticAC(Agent): Degris T. et al.. 2012. """ - def __init__(self, mdp_info, policy, alpha_theta, alpha_v, lambda_par=.9, - value_function_features=None, policy_features=None): + def __init__(self, mdp_info, policy, alpha_theta, alpha_v, lambda_par=.9, value_function_features=None): """ Constructor. 
@@ -23,9 +22,7 @@ def __init__(self, mdp_info, policy, alpha_theta, alpha_v, lambda_par=.9, alpha_theta ([float, Parameter]): learning rate for policy update; alpha_v ([float, Parameter]): learning rate for the value function; lambda_par ([float, Parameter], .9): trace decay parameter; - value_function_features (Features, None): features used by the - value function approximator; - policy_features (Features, None): features used by the policy. + value_function_features (Features, None): features used by the value function approximator. """ self._psi = value_function_features @@ -35,15 +32,14 @@ def __init__(self, mdp_info, policy, alpha_theta, alpha_v, lambda_par=.9, self._lambda = to_parameter(lambda_par) - super().__init__(mdp_info, policy, policy_features) + super().__init__(mdp_info, policy) if self._psi is not None: input_shape = (self._psi.size,) else: input_shape = mdp_info.observation_space.shape - self._V = Regressor(LinearApproximator, input_shape=input_shape, - output_shape=(1,)) + self._V = Regressor(LinearApproximator, input_shape=input_shape, output_shape=(1,)) self._e_v = np.zeros(self._V.weights_size) self._e_theta = np.zeros(self.policy.weights_size) @@ -68,13 +64,12 @@ def fit(self, dataset, **info): for step in dataset: s, a, r, ss, absorbing, _ = step - s_phi = self.phi(s) if self.phi is not None else s s_psi = self._psi(s) if self._psi is not None else s ss_psi = self._psi(ss) if self._psi is not None else ss v_next = self._V(ss_psi) if not absorbing else 0 - delta = self._compute_td_n_traces(a, r, v_next, s_psi, s_phi) + delta = self._compute_td_n_traces(s, a, r, v_next, s_psi) # Update value function delta_v = self._alpha_v(s, a) * delta * self._e_v @@ -86,14 +81,13 @@ def fit(self, dataset, **info): theta_new = self.policy.get_weights() + delta_theta self.policy.set_weights(theta_new) - def _compute_td_n_traces(self, a, r, v_next, s_psi, s_phi): + def _compute_td_n_traces(self, s, a, r, v_next, s_psi): # Compute TD error delta = r + self.mdp_info.gamma * v_next - self._V(s_psi) # Update traces self._e_v = self.mdp_info.gamma * self._lambda() * self._e_v + s_psi - self._e_theta = self.mdp_info.gamma * self._lambda() * \ - self._e_theta + self.policy.diff_log(s_phi, a) + self._e_theta = self.mdp_info.gamma * self._lambda() * self._e_theta + self.policy.diff_log(s, a) return delta @@ -105,9 +99,7 @@ class StochasticAC_AVG(StochasticAC): Degris T. et al.. 2012. """ - def __init__(self, mdp_info, policy, alpha_theta, alpha_v, alpha_r, - lambda_par=.9, value_function_features=None, - policy_features=None): + def __init__(self, mdp_info, policy, alpha_theta, alpha_v, alpha_r, lambda_par=.9, value_function_features=None): """ Constructor. @@ -115,21 +107,20 @@ def __init__(self, mdp_info, policy, alpha_theta, alpha_v, alpha_r, alpha_r (Parameter): learning rate for the reward trace. 
""" - super().__init__(mdp_info, policy, alpha_theta, alpha_v, lambda_par, - value_function_features, policy_features) + super().__init__(mdp_info, policy, alpha_theta, alpha_v, lambda_par, value_function_features) self._alpha_r = to_parameter(alpha_r) self._r_bar = 0 self._add_save_attr(_alpha_r='mushroom', _r_bar='primitive') - def _compute_td_n_traces(self, a, r, v_next, s_psi, s_phi): + def _compute_td_n_traces(self, s, a, r, v_next, s_psi): # Compute TD error delta = r - self._r_bar + v_next - self._V(s_psi) # Update traces self._r_bar += self._alpha_r() * delta self._e_v = self._lambda() * self._e_v + s_psi - self._e_theta = self._lambda() * self._e_theta + self.policy.diff_log(s_phi, a) + self._e_theta = self._lambda() * self._e_theta + self.policy.diff_log(s, a) return delta diff --git a/mushroom_rl/algorithms/actor_critic/deep_actor_critic/ppo.py b/mushroom_rl/algorithms/actor_critic/deep_actor_critic/ppo.py index 13923348d..200c71115 100644 --- a/mushroom_rl/algorithms/actor_critic/deep_actor_critic/ppo.py +++ b/mushroom_rl/algorithms/actor_critic/deep_actor_critic/ppo.py @@ -68,7 +68,7 @@ def __init__(self, mdp_info, policy, actor_optimizer, critic_params, _iter='primitive' ) - super().__init__(mdp_info, policy, None) + super().__init__(mdp_info, policy) def fit(self, dataset, **info): state, action, reward, next_state, absorbing, last = dataset.parse(to='torch') diff --git a/mushroom_rl/algorithms/actor_critic/deep_actor_critic/trpo.py b/mushroom_rl/algorithms/actor_critic/deep_actor_critic/trpo.py index c7b017a1d..fe80e7887 100644 --- a/mushroom_rl/algorithms/actor_critic/deep_actor_critic/trpo.py +++ b/mushroom_rl/algorithms/actor_critic/deep_actor_critic/trpo.py @@ -79,7 +79,7 @@ def __init__(self, mdp_info, policy, critic_params, ent_coeff=0., max_kl=.001, l _iter='primitive' ) - super().__init__(mdp_info, policy, None) + super().__init__(mdp_info, policy) def fit(self, dataset, **info): state, action, reward, next_state, absorbing, last = dataset.parse(to='torch') diff --git a/mushroom_rl/algorithms/policy_search/black_box_optimization/black_box_optimization.py b/mushroom_rl/algorithms/policy_search/black_box_optimization/black_box_optimization.py index 4ea7f883b..cfbda4fc6 100644 --- a/mushroom_rl/algorithms/policy_search/black_box_optimization/black_box_optimization.py +++ b/mushroom_rl/algorithms/policy_search/black_box_optimization/black_box_optimization.py @@ -10,7 +10,7 @@ class BlackBoxOptimization(Agent): do not rely on stochastic and differentiable policies. """ - def __init__(self, mdp_info, distribution, policy, features=None): + def __init__(self, mdp_info, distribution, policy): """ Constructor. @@ -24,7 +24,7 @@ def __init__(self, mdp_info, distribution, policy, features=None): self._add_save_attr(distribution='mushroom', _theta_list='pickle') - super().__init__(mdp_info, policy, features) + super().__init__(mdp_info, policy, is_episodic=True) def episode_start(self, episode_info): theta = self.distribution.sample() diff --git a/mushroom_rl/algorithms/policy_search/black_box_optimization/constrained_reps.py b/mushroom_rl/algorithms/policy_search/black_box_optimization/constrained_reps.py index a3cb7cf2b..e15771546 100644 --- a/mushroom_rl/algorithms/policy_search/black_box_optimization/constrained_reps.py +++ b/mushroom_rl/algorithms/policy_search/black_box_optimization/constrained_reps.py @@ -9,7 +9,7 @@ class ConstrainedREPS(BlackBoxOptimization): Episodic Relative Entropy Policy Search algorithm with constrained policy update. 
""" - def __init__(self, mdp_info, distribution, policy, eps, kappa, features=None): + def __init__(self, mdp_info, distribution, policy, eps, kappa): """ Constructor. @@ -28,7 +28,7 @@ def __init__(self, mdp_info, distribution, policy, eps, kappa, features=None): self._add_save_attr(_eps='mushroom') self._add_save_attr(_kappa='mushroom') - super().__init__(mdp_info, distribution, policy, features) + super().__init__(mdp_info, distribution, policy) def _update(self, Jep, theta): eta_start = np.ones(1) diff --git a/mushroom_rl/algorithms/policy_search/black_box_optimization/more.py b/mushroom_rl/algorithms/policy_search/black_box_optimization/more.py index 50eaf8c5e..5c7517cb1 100644 --- a/mushroom_rl/algorithms/policy_search/black_box_optimization/more.py +++ b/mushroom_rl/algorithms/policy_search/black_box_optimization/more.py @@ -17,7 +17,7 @@ class MORE(BlackBoxOptimization): Peters, Jan R and Lau, Nuno and Pualo Reis, Luis and Neumann, Gerhard. 2015. """ - def __init__(self, mdp_info, distribution, policy, eps, h0=-75, kappa=0.99, features=None): + def __init__(self, mdp_info, distribution, policy, eps, h0=-75, kappa=0.99): """ Constructor. @@ -53,7 +53,7 @@ def __init__(self, mdp_info, distribution, policy, eps, h0=-75, kappa=0.99, feat self._add_save_attr(h0='primitive') self._add_save_attr(kappa='primitive') - super().__init__(mdp_info, distribution, policy, features) + super().__init__(mdp_info, distribution, policy) def _update(self, Jep, theta): diff --git a/mushroom_rl/algorithms/policy_search/black_box_optimization/pgpe.py b/mushroom_rl/algorithms/policy_search/black_box_optimization/pgpe.py index 9f504fca9..a5fde15e1 100644 --- a/mushroom_rl/algorithms/policy_search/black_box_optimization/pgpe.py +++ b/mushroom_rl/algorithms/policy_search/black_box_optimization/pgpe.py @@ -10,8 +10,7 @@ class PGPE(BlackBoxOptimization): Peters J.. 2013. """ - def __init__(self, mdp_info, distribution, policy, optimizer, - features=None): + def __init__(self, mdp_info, distribution, policy, optimizer): """ Constructor. @@ -23,7 +22,7 @@ def __init__(self, mdp_info, distribution, policy, optimizer, self._add_save_attr(optimizer='mushroom') - super().__init__(mdp_info, distribution, policy, features) + super().__init__(mdp_info, distribution, policy) def _update(self, Jep, theta): baseline_num_list = list() diff --git a/mushroom_rl/algorithms/policy_search/black_box_optimization/reps.py b/mushroom_rl/algorithms/policy_search/black_box_optimization/reps.py index 792a261c3..dd91093c3 100644 --- a/mushroom_rl/algorithms/policy_search/black_box_optimization/reps.py +++ b/mushroom_rl/algorithms/policy_search/black_box_optimization/reps.py @@ -13,7 +13,7 @@ class REPS(BlackBoxOptimization): Peters J.. 2013. """ - def __init__(self, mdp_info, distribution, policy, eps, features=None): + def __init__(self, mdp_info, distribution, policy, eps): """ Constructor. 
@@ -27,7 +27,7 @@ def __init__(self, mdp_info, distribution, policy, eps, features=None): self._add_save_attr(_eps='mushroom') - super().__init__(mdp_info, distribution, policy, features) + super().__init__(mdp_info, distribution, policy) def _update(self, Jep, theta): eta_start = np.ones(1) diff --git a/mushroom_rl/algorithms/policy_search/black_box_optimization/rwr.py b/mushroom_rl/algorithms/policy_search/black_box_optimization/rwr.py index e751d62a3..719105c9c 100644 --- a/mushroom_rl/algorithms/policy_search/black_box_optimization/rwr.py +++ b/mushroom_rl/algorithms/policy_search/black_box_optimization/rwr.py @@ -11,7 +11,7 @@ class RWR(BlackBoxOptimization): Peters J.. 2013. """ - def __init__(self, mdp_info, distribution, policy, beta, features=None): + def __init__(self, mdp_info, distribution, policy, beta): """ Constructor. @@ -24,7 +24,7 @@ def __init__(self, mdp_info, distribution, policy, beta, features=None): self._add_save_attr(_beta='mushroom') - super().__init__(mdp_info, distribution, policy, features) + super().__init__(mdp_info, distribution, policy) def _update(self, Jep, theta): Jep -= np.max(Jep) diff --git a/mushroom_rl/algorithms/policy_search/policy_gradient/enac.py b/mushroom_rl/algorithms/policy_search/policy_gradient/enac.py index 24d58573f..a7af25842 100644 --- a/mushroom_rl/algorithms/policy_search/policy_gradient/enac.py +++ b/mushroom_rl/algorithms/policy_search/policy_gradient/enac.py @@ -10,8 +10,7 @@ class eNAC(PolicyGradient): Peters J. 2013. """ - def __init__(self, mdp_info, policy, optimizer, features=None, - critic_features=None): + def __init__(self, mdp_info, policy, optimizer, critic_features=None): """ Constructor. @@ -19,7 +18,7 @@ def __init__(self, mdp_info, policy, optimizer, features=None, critic_features (Features, None): features used by the critic. """ - super().__init__(mdp_info, policy, optimizer, features) + super().__init__(mdp_info, policy, optimizer) self.phi_c = critic_features self.sum_grad_log = None diff --git a/mushroom_rl/algorithms/policy_search/policy_gradient/gpomdp.py b/mushroom_rl/algorithms/policy_search/policy_gradient/gpomdp.py index c0a68eb46..bf2acd6e6 100644 --- a/mushroom_rl/algorithms/policy_search/policy_gradient/gpomdp.py +++ b/mushroom_rl/algorithms/policy_search/policy_gradient/gpomdp.py @@ -10,8 +10,8 @@ class GPOMDP(PolicyGradient): 2001. """ - def __init__(self, mdp_info, policy, optimizer, features=None): - super().__init__(mdp_info, policy, optimizer, features) + def __init__(self, mdp_info, policy, optimizer): + super().__init__(mdp_info, policy, optimizer) self.sum_d_log_pi = None self.list_sum_d_log_pi = list() diff --git a/mushroom_rl/algorithms/policy_search/policy_gradient/policy_gradient.py b/mushroom_rl/algorithms/policy_search/policy_gradient/policy_gradient.py index e7b85d05c..f82d3095b 100644 --- a/mushroom_rl/algorithms/policy_search/policy_gradient/policy_gradient.py +++ b/mushroom_rl/algorithms/policy_search/policy_gradient/policy_gradient.py @@ -11,7 +11,7 @@ class PolicyGradient(Agent): al.. 2011. """ - def __init__(self, mdp_info, policy, optimizer, features): + def __init__(self, mdp_info, policy, optimizer): """ Constructor. 
@@ -29,7 +29,7 @@ def __init__(self, mdp_info, policy, optimizer, features): J_episode='numpy' ) - super().__init__(mdp_info, policy, features) + super().__init__(mdp_info, policy) def fit(self, dataset, **info): J = list() @@ -133,7 +133,4 @@ def _parse(self, sample): absorbing = sample[4] last = sample[5] - if self.phi is not None: - state = self.phi(state) - return state, action, reward, next_state, absorbing, last diff --git a/mushroom_rl/algorithms/policy_search/policy_gradient/reinforce.py b/mushroom_rl/algorithms/policy_search/policy_gradient/reinforce.py index 15d4db55f..f241f0be3 100644 --- a/mushroom_rl/algorithms/policy_search/policy_gradient/reinforce.py +++ b/mushroom_rl/algorithms/policy_search/policy_gradient/reinforce.py @@ -10,8 +10,8 @@ class REINFORCE(PolicyGradient): Reinforcement Learning", Williams R. J.. 1992. """ - def __init__(self, mdp_info, policy, optimizer, features=None): - super().__init__(mdp_info, policy, optimizer, features) + def __init__(self, mdp_info, policy, optimizer): + super().__init__(mdp_info, policy, optimizer) self.sum_d_log_pi = None self.list_sum_d_log_pi = list() self.baseline_num = list() diff --git a/mushroom_rl/algorithms/value/batch_td/batch_td.py b/mushroom_rl/algorithms/value/batch_td/batch_td.py index 34c313bbb..5178ddc89 100644 --- a/mushroom_rl/algorithms/value/batch_td/batch_td.py +++ b/mushroom_rl/algorithms/value/batch_td/batch_td.py @@ -7,8 +7,7 @@ class BatchTD(Agent): Abstract class to implement a generic Batch TD algorithm. """ - def __init__(self, mdp_info, policy, approximator, approximator_params=None, - fit_params=None, features=None): + def __init__(self, mdp_info, policy, approximator, approximator_params=None, fit_params=None): """ Constructor. @@ -33,7 +32,7 @@ def __init__(self, mdp_info, policy, approximator, approximator_params=None, _fit_params='pickle' ) - super().__init__(mdp_info, policy, features) + super().__init__(mdp_info, policy) def _post_load(self): self.policy.set_q(self.approximator) diff --git a/mushroom_rl/algorithms/value/batch_td/lspi.py b/mushroom_rl/algorithms/value/batch_td/lspi.py index 6c741fa56..e1d80aff8 100644 --- a/mushroom_rl/algorithms/value/batch_td/lspi.py +++ b/mushroom_rl/algorithms/value/batch_td/lspi.py @@ -12,8 +12,7 @@ class LSPI(BatchTD): "Least-Squares Policy Iteration". Lagoudakis M. G. and Parr R.. 2003. """ - def __init__(self, mdp_info, policy, approximator_params=None, - epsilon=1e-2, fit_params=None, features=None): + def __init__(self, mdp_info, policy, approximator_params=None, epsilon=1e-2, fit_params=None): """ Constructor. 
@@ -25,20 +24,19 @@ def __init__(self, mdp_info, policy, approximator_params=None, self._add_save_attr(_epsilon='mushroom') - super().__init__(mdp_info, policy, LinearApproximator, - approximator_params, fit_params, features) + super().__init__(mdp_info, policy, LinearApproximator, approximator_params, fit_params) def fit(self, dataset, **info): state, action, reward, next_state, absorbing, _ = dataset.parse() - phi_state = self.phi(state) - phi_next_state = self.phi(next_state) + phi_state = self.approximator.model.phi(state) + phi_next_state = self.approximator.model.phi(next_state) phi_state_action = get_action_features(phi_state, action, self.mdp_info.action_space.n) norm = np.inf while norm > self._epsilon(): - q = self.approximator.predict(phi_next_state) + q = self.approximator.predict(next_state) if np.any(absorbing): q *= 1 - absorbing.reshape(-1, 1) diff --git a/mushroom_rl/algorithms/value/td/sarsa_lambda.py b/mushroom_rl/algorithms/value/td/sarsa_lambda.py index 7c3a3032e..7e82d29a0 100644 --- a/mushroom_rl/algorithms/value/td/sarsa_lambda.py +++ b/mushroom_rl/algorithms/value/td/sarsa_lambda.py @@ -9,8 +9,7 @@ class SARSALambda(TD): The SARSA(lambda) algorithm for finite MDPs. """ - def __init__(self, mdp_info, policy, learning_rate, lambda_coeff, - trace='replacing'): + def __init__(self, mdp_info, policy, learning_rate, lambda_coeff, trace='replacing'): """ Constructor. diff --git a/mushroom_rl/algorithms/value/td/sarsa_lambda_continuous.py b/mushroom_rl/algorithms/value/td/sarsa_lambda_continuous.py index f3fc11dc5..c6e7b7aa4 100644 --- a/mushroom_rl/algorithms/value/td/sarsa_lambda_continuous.py +++ b/mushroom_rl/algorithms/value/td/sarsa_lambda_continuous.py @@ -10,7 +10,7 @@ class SARSALambdaContinuous(TD): Continuous version of SARSA(lambda) algorithm. """ - def __init__(self, mdp_info, policy, approximator, learning_rate, lambda_coeff, features, approximator_params=None): + def __init__(self, mdp_info, policy, approximator, learning_rate, lambda_coeff, approximator_params=None): """ Constructor. @@ -29,19 +29,17 @@ def __init__(self, mdp_info, policy, approximator, learning_rate, lambda_coeff, e='numpy' ) - super().__init__(mdp_info, policy, Q, learning_rate, features) + super().__init__(mdp_info, policy, Q, learning_rate) def _update(self, state, action, reward, next_state, absorbing): - phi_state = self.phi(state) - q_current = self.Q.predict(phi_state, action) + q_current = self.Q.predict(state, action) alpha = self._alpha(state, action) - self.e = self.mdp_info.gamma * self._lambda() * self.e + self.Q.diff(phi_state, action) + self.e = self.mdp_info.gamma * self._lambda() * self.e + self.Q.diff(state, action) self.next_action, _ = self.draw_action(next_state) - phi_next_state = self.phi(next_state) - q_next = self.Q.predict(phi_next_state, self.next_action) if not absorbing else 0. + q_next = self.Q.predict(next_state, self.next_action) if not absorbing else 0. delta = reward + self.mdp_info.gamma * q_next - q_current diff --git a/mushroom_rl/algorithms/value/td/td.py b/mushroom_rl/algorithms/value/td/td.py index 928599f6c..99178b850 100644 --- a/mushroom_rl/algorithms/value/td/td.py +++ b/mushroom_rl/algorithms/value/td/td.py @@ -8,14 +8,12 @@ class TD(Agent): Implements functions to run TD algorithms. """ - def __init__(self, mdp_info, policy, approximator, learning_rate, - features=None): + def __init__(self, mdp_info, policy, approximator, learning_rate): """ Constructor. 
Args: - approximator (object): the approximator to use to fit the - Q-function; + approximator: the approximator to use to fit the Q-function; learning_rate (Parameter): the learning rate. """ @@ -26,7 +24,7 @@ def __init__(self, mdp_info, policy, approximator, learning_rate, self._add_save_attr(_alpha='mushroom', Q='mushroom') - super().__init__(mdp_info, policy, features) + super().__init__(mdp_info, policy) def fit(self, dataset, **info): assert len(dataset) == 1 diff --git a/mushroom_rl/algorithms/value/td/true_online_sarsa_lambda.py b/mushroom_rl/algorithms/value/td/true_online_sarsa_lambda.py index 5ca541661..196765c3e 100644 --- a/mushroom_rl/algorithms/value/td/true_online_sarsa_lambda.py +++ b/mushroom_rl/algorithms/value/td/true_online_sarsa_lambda.py @@ -13,8 +13,7 @@ class TrueOnlineSARSALambda(TD): "True Online TD(lambda)". Seijen H. V. et al.. 2014. """ - def __init__(self, mdp_info, policy, learning_rate, lambda_coeff, - features, approximator_params=None): + def __init__(self, mdp_info, policy, learning_rate, lambda_coeff, approximator_params=None): """ Constructor. @@ -35,12 +34,12 @@ def __init__(self, mdp_info, policy, learning_rate, lambda_coeff, e='numpy' ) - super().__init__(mdp_info, policy, Q, learning_rate, features) + super().__init__(mdp_info, policy, Q, learning_rate) def _update(self, state, action, reward, next_state, absorbing): - phi_state = self.phi(state) + phi_state = self.Q.model.phi(state) phi_state_action = get_action_features(phi_state, action, self.mdp_info.action_space.n) - q_current = self.Q.predict(phi_state, action) + q_current = self.Q.predict(state, action) if self._q_old is None: self._q_old = q_current @@ -48,12 +47,11 @@ def _update(self, state, action, reward, next_state, absorbing): alpha = self._alpha(state, action) e_phi = self.e.dot(phi_state_action) - self.e = self.mdp_info.gamma * self._lambda() * self.e + alpha * ( - 1. - self.mdp_info.gamma * self._lambda.get_value() * e_phi) * phi_state_action + self.e = (self.mdp_info.gamma * self._lambda() * self.e + + alpha * (1. - self.mdp_info.gamma * self._lambda.get_value() * e_phi) * phi_state_action) self.next_action, _ = self.draw_action(next_state) - phi_next_state = self.phi(next_state) - q_next = self.Q.predict(phi_next_state, self.next_action) if not absorbing else 0. + q_next = self.Q.predict(next_state, self.next_action) if not absorbing else 0. delta = reward + self.mdp_info.gamma * q_next - self._q_old diff --git a/mushroom_rl/approximators/parametric/cmac.py b/mushroom_rl/approximators/parametric/cmac.py index 1d4fea2d9..90b7c70c4 100644 --- a/mushroom_rl/approximators/parametric/cmac.py +++ b/mushroom_rl/approximators/parametric/cmac.py @@ -22,12 +22,12 @@ def __init__(self, tilings, weights=None, output_shape=(1,), **kwargs): **kwargs: other params of the approximator. 
""" - self._phi = Features(tilings=tilings) + phi = Features(tilings=tilings) self._n = len(tilings) - super().__init__(weights=weights, input_shape=(self._phi.size,), output_shape=output_shape) + super().__init__(weights=weights, input_shape=(phi.size,), output_shape=output_shape, phi=phi) - self._add_save_attr(_phi='pickle', _n='primitive') + self._add_save_attr(_n='primitive') def fit(self, x, y, alpha=1.0, **kwargs): """ diff --git a/mushroom_rl/approximators/parametric/linear.py b/mushroom_rl/approximators/parametric/linear.py index 004c7988e..55c4b0fcb 100644 --- a/mushroom_rl/approximators/parametric/linear.py +++ b/mushroom_rl/approximators/parametric/linear.py @@ -8,7 +8,7 @@ class LinearApproximator(Serializable): This class implements a linear approximator. """ - def __init__(self, weights=None, input_shape=None, output_shape=(1,), + def __init__(self, weights=None, input_shape=None, output_shape=(1,), phi=None, **kwargs): """ Constructor. @@ -17,6 +17,7 @@ def __init__(self, weights=None, input_shape=None, output_shape=(1,), weights (np.ndarray): array of weights to initialize the weights of the approximator; input_shape (np.ndarray, None): the shape of the input of the model; output_shape (np.ndarray, (1,)): the shape of the output of the model; + phi (object, None): features to extract from the state; **kwargs: other params of the approximator. """ @@ -33,7 +34,11 @@ def __init__(self, weights=None, input_shape=None, output_shape=(1,), raise ValueError('You should specify the initial parameter vector' ' or the input dimension') - self._add_save_attr(_w='numpy') + self._phi = phi + self._add_save_attr( + _w='numpy', + _phi='pickle' + ) def fit(self, x, y, **fit_params): """ @@ -45,7 +50,8 @@ def fit(self, x, y, **fit_params): **fit_params: other parameters used by the fit method of the regressor. """ - self._w = np.atleast_2d(np.linalg.pinv(x).dot(y).T) + phi = np.atleast_2d(self.phi(x)) + self._w = np.atleast_2d(np.linalg.pinv(phi).dot(y).T) def predict(self, x, **predict_params): """ @@ -59,9 +65,11 @@ def predict(self, x, **predict_params): The predictions of the model. """ - prediction = np.ones((x.shape[0], self._w.shape[0])) - for i, x_i in enumerate(x): - prediction[i] = x_i.dot(self._w.T) + phi = np.atleast_2d(self.phi(x)) + + prediction = np.ones((phi.shape[0], self._w.shape[0])) + for i, phi_i in enumerate(phi): + prediction[i] = phi_i.dot(self._w.T) return prediction @@ -94,6 +102,12 @@ def set_weights(self, w): """ self._w = w.reshape(self._w.shape) + def phi(self, x): + if self._phi is not None: + return self._phi(x) + else: + return x + def diff(self, state, action=None): """ Compute the derivative of the output w.r.t. ``state``, and ``action`` if provided. 
@@ -108,7 +122,7 @@ def diff(self, state, action=None): """ if len(self._w.shape) == 1 or self._w.shape[0] == 1: - return state + return self.phi(state) else: n_phi = self._w.shape[1] n_outs = self._w.shape[0] @@ -119,13 +133,14 @@ def diff(self, state, action=None): start = 0 for i in range(n_outs): stop = start + n_phi - df[start:stop, i] = state + df[start:stop, i] = self.phi(state) start = stop else: shape = (n_phi * n_outs) df = np.zeros(shape) start = action[0] * n_phi stop = start + n_phi - df[start:stop] = state + df[start:stop] = self.phi(state) return df + diff --git a/mushroom_rl/core/__init__.py b/mushroom_rl/core/__init__.py index 79e679afc..16d34d519 100644 --- a/mushroom_rl/core/__init__.py +++ b/mushroom_rl/core/__init__.py @@ -1,10 +1,10 @@ from .core import Core from .dataset import Dataset from .environment import Environment, MDPInfo -from .agent import Agent +from .agent import Agent, AgentInfo from .serialization import Serializable from .logger import Logger import mushroom_rl.environments -__all__ = ['Core', 'Dataset', 'Environment', 'MDPInfo', 'Agent', 'Serializable', 'Logger'] +__all__ = ['Core', 'Dataset', 'Environment', 'MDPInfo', 'Agent', 'AgentInfo', 'Serializable', 'Logger'] diff --git a/mushroom_rl/core/agent.py b/mushroom_rl/core/agent.py index 4ac130eeb..d31095831 100644 --- a/mushroom_rl/core/agent.py +++ b/mushroom_rl/core/agent.py @@ -3,45 +3,64 @@ from ._impl import * +class AgentInfo(Serializable): + def __init__(self, is_episodic, policy_state_shape, backend): + assert isinstance(is_episodic, bool) + assert policy_state_shape is None or isinstance(policy_state_shape, tuple) + assert isinstance(backend, str) + + self.is_episodic = is_episodic + self.is_stateful = policy_state_shape is not None + self.policy_state_shape = policy_state_shape + self.backend = backend + + self._add_save_attr( + is_episodic='primitive', + is_stateful='primitive', + policy_state_shape='primitive', + backend='primitive' + ) + + class Agent(Serializable): """ - This class implements the functions to manage the agent (e.g. move the agent - following its policy). + This class implements the functions to manage the agent (e.g. move the agent following its policy). """ - def __init__(self, mdp_info, policy, features=None, backend='numpy'): + def __init__(self, mdp_info, policy, is_episodic=False, backend='numpy'): """ Constructor. Args: mdp_info (MDPInfo): information about the MDP; policy (Policy): the policy followed by the agent; - features (object, None): features to extract from the state; + is_episodic (bool, False): whether the agent is learning in an episodic fashion or not; backend (str, 'numpy'): array backend to be used by the algorithm. 
""" self.mdp_info = mdp_info - self.policy = policy - self.backend = backend - - self.phi = features + self._info = AgentInfo( + is_episodic=is_episodic, + policy_state_shape=policy.policy_state_shape, + backend=backend + ) + self.policy = policy self.next_action = None - self._agent_converter = DataConversion.get_converter(backend) self._env_converter = DataConversion.get_converter(self.mdp_info.backend) self._preprocessors = list() + self._logger = None self._add_save_attr( - mdp_info='pickle', policy='mushroom', - backend='primitive', - phi='pickle', - next_action='numpy', - _agent_converter = 'primitive', + next_action='none', + mdp_info='mushroom', + _info='mushroom', + _agent_converter='primitive', _env_converter='primitive', _preprocessors='mushroom', _logger='none' @@ -59,9 +78,8 @@ def fit(self, dataset, **info): def draw_action(self, state, policy_state=None): """ - Return the action to execute in the given state. It is the action - returned by the policy or the action set by the algorithm (e.g. in the - case of SARSA). + Return the action to execute in the given state. It is the action returned by the policy or the action set by + the algorithm (e.g. in the case of SARSA). Args: state: the state where the agent is; @@ -71,9 +89,6 @@ def draw_action(self, state, policy_state=None): The action to be executed. """ - if self.phi is not None: - state = self.phi(state) - if self.next_action is None: action, next_policy_state = self.policy.draw_action(state, policy_state) else: @@ -95,9 +110,8 @@ def episode_start(self, episode_info): def stop(self): """ - Method used to stop an agent. Useful when dealing with real world - environments, simulators, or to cleanup environments internals after - a core learn/evaluate to enforce consistency. + Method used to stop an agent. Useful when dealing with real world environments, simulators, or to cleanup + environments internals after a core learn/evaluate to enforce consistency. """ pass @@ -114,8 +128,7 @@ def set_logger(self, logger): def add_preprocessor(self, preprocessor): """ - Add preprocessor to the preprocessor list. - The preprocessors are applied in order. + Add preprocessor to the preprocessor list. The preprocessors are applied in order. 
Args: preprocessor (object): state preprocessors to be applied @@ -138,3 +151,7 @@ def _convert_to_env_backend(self, array): def _convert_to_agent_backend(self, array): return self._agent_converter.to_backend_array(self._env_converter, array) + @property + def info(self): + return self._info + diff --git a/tests/algorithms/helper/utils.py b/tests/algorithms/helper/utils.py index 2fed7444a..7f2b9634c 100644 --- a/tests/algorithms/helper/utils.py +++ b/tests/algorithms/helper/utils.py @@ -4,7 +4,7 @@ import itertools import mushroom_rl -from mushroom_rl.core import MDPInfo +from mushroom_rl.core import MDPInfo, AgentInfo from mushroom_rl.policy.td_policy import TDPolicy from mushroom_rl.policy.torch_policy import TorchPolicy from mushroom_rl.policy.policy import ParametricPolicy @@ -60,6 +60,8 @@ def assert_eq(cls, this, that): assert cls.eq_chain(this, that) elif cls._check_type(this, that, MDPInfo): assert cls.eq_mdp_info(this, that) + elif cls._check_type(this, that, AgentInfo): + assert cls.eq_agent_info(this, that) elif cls._check_type(this, that, ReplayMemory): assert cls.eq_replay_memory(this, that) elif cls._check_type(this, that, PrioritizedReplayMemory): @@ -170,6 +172,18 @@ def eq_mdp_info(cls, this, that): res &= this.horizon == that.horizon return res + @classmethod + def eq_agent_info(cls, this, that): + """ + Compare two mdp_info objects for equality + """ + res = this.is_episodic == that.is_episodic + res &= this.is_stateful == that.is_stateful + res &= this.policy_state_shape == that.policy_state_shape + res &= this.backend == that.backend + + return res + @classmethod def eq_ornstein_uhlenbeck_policy(cls, this, that): """ diff --git a/tests/algorithms/test_dpg.py b/tests/algorithms/test_dpg.py index 203427d98..ef8de329e 100644 --- a/tests/algorithms/test_dpg.py +++ b/tests/algorithms/test_dpg.py @@ -35,16 +35,12 @@ def learn_copdac_q(): input_shape = (phi.size,) - mu = Regressor(LinearApproximator, input_shape=input_shape, - output_shape=mdp.info.action_space.shape) + mu = Regressor(LinearApproximator, input_shape=input_shape, output_shape=mdp.info.action_space.shape, phi=phi) sigma = 1e-1 * np.eye(1) policy = GaussianPolicy(mu, sigma) - agent = COPDAC_Q(mdp.info, policy, mu, - alpha_theta, alpha_omega, alpha_v, - value_function_features=phi, - policy_features=phi) + agent = COPDAC_Q(mdp.info, policy, mu, alpha_theta, alpha_omega, alpha_v, value_function_features=phi) # Train core = Core(agent, mdp) diff --git a/tests/algorithms/test_lspi.py b/tests/algorithms/test_lspi.py index 93a37ca98..f47399133 100644 --- a/tests/algorithms/test_lspi.py +++ b/tests/algorithms/test_lspi.py @@ -30,9 +30,9 @@ def learn_lspi(): fit_params = dict() approximator_params = dict(input_shape=(features.size,), output_shape=(mdp.info.action_space.n,), - n_actions=mdp.info.action_space.n) - agent = LSPI(mdp.info, pi, approximator_params=approximator_params, - fit_params=fit_params, features=features) + n_actions=mdp.info.action_space.n, + phi=features) + agent = LSPI(mdp.info, pi, approximator_params=approximator_params, fit_params=fit_params) # Algorithm core = Core(agent, mdp) diff --git a/tests/algorithms/test_stochastic_ac.py b/tests/algorithms/test_stochastic_ac.py index 2a1266c44..503be0423 100644 --- a/tests/algorithms/test_stochastic_ac.py +++ b/tests/algorithms/test_stochastic_ac.py @@ -40,11 +40,9 @@ def learn(alg): input_shape = (phi.size,) - mu = Regressor(LinearApproximator, input_shape=input_shape, - output_shape=mdp.info.action_space.shape) + mu = Regressor(LinearApproximator, 
input_shape=input_shape, output_shape=mdp.info.action_space.shape, phi=phi) - std = Regressor(LinearApproximator, input_shape=input_shape, - output_shape=mdp.info.action_space.shape) + std = Regressor(LinearApproximator, input_shape=input_shape, output_shape=mdp.info.action_space.shape, phi=phi) std_0 = np.sqrt(1.) std.set_weights(np.log(std_0) / n_tilings * np.ones(std.weights_size)) @@ -52,12 +50,11 @@ def learn(alg): policy = StateLogStdGaussianPolicy(mu, std) if alg is StochasticAC: - agent = alg(mdp.info, policy, alpha_theta, alpha_v, lambda_par=.5, - value_function_features=psi, policy_features=phi) + agent = alg(mdp.info, policy, alpha_theta, alpha_v, lambda_par=.5, value_function_features=psi) elif alg is StochasticAC_AVG: - agent = alg(mdp.info, policy, alpha_theta, alpha_v, alpha_r, - lambda_par=.5, value_function_features=psi, - policy_features=phi) + agent = alg(mdp.info, policy, alpha_theta, alpha_v, alpha_r, lambda_par=.5, value_function_features=psi) + else: + assert False core = Core(agent, mdp) diff --git a/tests/algorithms/test_td.py b/tests/algorithms/test_td.py index 9acbbaee2..fb043e733 100644 --- a/tests/algorithms/test_td.py +++ b/tests/algorithms/test_td.py @@ -18,6 +18,14 @@ from mushroom_rl.utils.parameters import Parameter +def assert_properly_loaded(agent_save, agent_load): + for att, method in vars(agent_save).items(): + if att != 'next_action': + save_attr = getattr(agent_save, att) + load_attr = getattr(agent_load, att) + tu.assert_eq(save_attr, load_attr) + + class Network(nn.Module): def __init__(self, input_shape, output_shape, **kwargs): super().__init__() @@ -80,10 +88,7 @@ def test_q_learning_save(tmpdir): agent_save.save(agent_path) agent_load = Agent.load(agent_path) - for att, method in vars(agent_save).items(): - save_attr = getattr(agent_save, att) - load_attr = getattr(agent_load, att) - tu.assert_eq(save_attr, load_attr) + assert_properly_loaded(agent_save, agent_load) def test_double_q_learning(): @@ -122,11 +127,7 @@ def test_double_q_learning_save(tmpdir): agent_save.save(agent_path) agent_load = Agent.load(agent_path) - for att, method in vars(agent_save).items(): - save_attr = getattr(agent_save, att) - load_attr = getattr(agent_load, att) - - tu.assert_eq(save_attr, load_attr) + assert_properly_loaded(agent_save, agent_load) def test_weighted_q_learning(): @@ -160,11 +161,7 @@ def test_weighted_q_learning_save(tmpdir): agent_save.save(agent_path) agent_load = Agent.load(agent_path) - for att, method in vars(agent_save).items(): - save_attr = getattr(agent_save, att) - load_attr = getattr(agent_load, att) - - tu.assert_eq(save_attr, load_attr) + assert_properly_loaded(agent_save, agent_load) def test_maxmin_q_learning(): @@ -198,11 +195,7 @@ def test_maxmin_q_learning_save(tmpdir): agent_save.save(agent_path) agent_load = Agent.load(agent_path) - for att, method in vars(agent_save).items(): - save_attr = getattr(agent_save, att) - load_attr = getattr(agent_load, att) - - tu.assert_eq(save_attr, load_attr) + assert_properly_loaded(agent_save, agent_load) def test_speedy_q_learning(): @@ -236,11 +229,7 @@ def test_speedy_q_learning_save(tmpdir): agent_save.save(agent_path) agent_load = Agent.load(agent_path) - for att, method in vars(agent_save).items(): - save_attr = getattr(agent_save, att) - load_attr = getattr(agent_load, att) - - tu.assert_eq(save_attr, load_attr) + assert_properly_loaded(agent_save, agent_load) def test_sarsa(): @@ -274,11 +263,7 @@ def test_sarsa_save(tmpdir): agent_save.save(agent_path) agent_load = 
Agent.load(agent_path) - for att, method in vars(agent_save).items(): - save_attr = getattr(agent_save, att) - load_attr = getattr(agent_load, att) - - tu.assert_eq(save_attr, load_attr) + assert_properly_loaded(agent_save, agent_load) def test_q_lambda(): @@ -312,11 +297,7 @@ def test_q_lambda_save(tmpdir): agent_save.save(agent_path) agent_load = Agent.load(agent_path) - for att, method in vars(agent_save).items(): - save_attr = getattr(agent_save, att) - load_attr = getattr(agent_load, att) - - tu.assert_eq(save_attr, load_attr) + assert_properly_loaded(agent_save, agent_load) def test_sarsa_lambda_discrete(): @@ -350,11 +331,7 @@ def test_sarsa_lambda_discrete_save(tmpdir): agent_save.save(agent_path) agent_load = Agent.load(agent_path) - for att, method in vars(agent_save).items(): - save_attr = getattr(agent_save, att) - load_attr = getattr(agent_load, att) - - tu.assert_eq(save_attr, load_attr) + assert_properly_loaded(agent_save, agent_load) def test_sarsa_lambda_continuous_linear(): @@ -369,11 +346,11 @@ def test_sarsa_lambda_continuous_linear(): approximator_params = dict( input_shape=(features.size,), output_shape=(mdp_continuous.info.action_space.n,), - n_actions=mdp_continuous.info.action_space.n + n_actions=mdp_continuous.info.action_space.n, + phi=features ) agent = SARSALambdaContinuous(mdp_continuous.info, pi, LinearApproximator, - Parameter(.1), .9, features=features, - approximator_params=approximator_params) + Parameter(.1), .9, approximator_params=approximator_params) core = Core(agent, mdp_continuous) @@ -402,11 +379,11 @@ def test_sarsa_lambda_continuous_linear_save(tmpdir): approximator_params = dict( input_shape=(features.size,), output_shape=(mdp_continuous.info.action_space.n,), - n_actions=mdp_continuous.info.action_space.n + n_actions=mdp_continuous.info.action_space.n, + phi=features, ) - agent_save = SARSALambdaContinuous(mdp_continuous.info, pi, LinearApproximator, - Parameter(.1), .9, features=features, - approximator_params=approximator_params) + agent_save = SARSALambdaContinuous(mdp_continuous.info, pi, LinearApproximator, Parameter(.1), .9, + approximator_params=approximator_params) core = Core(agent_save, mdp_continuous) @@ -416,28 +393,19 @@ def test_sarsa_lambda_continuous_linear_save(tmpdir): agent_save.save(agent_path) agent_load = Agent.load(agent_path) - for att, method in vars(agent_save).items(): - save_attr = getattr(agent_save, att) - load_attr = getattr(agent_load, att) - - tu.assert_eq(save_attr, load_attr) + assert_properly_loaded(agent_save, agent_load) def test_sarsa_lambda_continuous_nn(): pi, _, mdp_continuous = initialize() - - features = Features( - n_outputs=mdp_continuous.info.observation_space.shape[0] - ) approximator_params = dict( - input_shape=(features.size,), + input_shape=mdp_continuous.info.observation_space.shape, output_shape=(mdp_continuous.info.action_space.n,), network=Network, - n_actions=mdp_continuous.info.action_space.n + n_actions=mdp_continuous.info.action_space.n, ) - agent = SARSALambdaContinuous(mdp_continuous.info, pi, TorchApproximator, - Parameter(.1), .9, features=features, + agent = SARSALambdaContinuous(mdp_continuous.info, pi, TorchApproximator, Parameter(.1), .9, approximator_params=approximator_params) core = Core(agent, mdp_continuous) @@ -457,19 +425,14 @@ def test_sarsa_lambda_continuous_nn_save(tmpdir): pi, _, mdp_continuous = initialize() - features = Features( - n_outputs=mdp_continuous.info.observation_space.shape[0] - ) - approximator_params = dict( - input_shape=(features.size,), + 
input_shape=mdp_continuous.info.observation_space.shape, output_shape=(mdp_continuous.info.action_space.n,), network=Network, n_actions=mdp_continuous.info.action_space.n ) - agent_save = SARSALambdaContinuous(mdp_continuous.info, pi, TorchApproximator, - Parameter(.1), .9, features=features, - approximator_params=approximator_params) + agent_save = SARSALambdaContinuous(mdp_continuous.info, pi, TorchApproximator, Parameter(.1), .9, + approximator_params=approximator_params) core = Core(agent_save, mdp_continuous) @@ -479,11 +442,7 @@ def test_sarsa_lambda_continuous_nn_save(tmpdir): agent_save.save(agent_path) agent_load = Agent.load(agent_path) - for att, method in vars(agent_save).items(): - save_attr = getattr(agent_save, att) - load_attr = getattr(agent_load, att) - - tu.assert_eq(save_attr, load_attr) + assert_properly_loaded(agent_save, agent_load) def test_expected_sarsa(): @@ -517,11 +476,7 @@ def test_expected_sarsa_save(tmpdir): agent_save.save(agent_path) agent_load = Agent.load(agent_path) - for att, method in vars(agent_save).items(): - save_attr = getattr(agent_save, att) - load_attr = getattr(agent_load, att) - - tu.assert_eq(save_attr, load_attr) + assert_properly_loaded(agent_save, agent_load) def test_true_online_sarsa_lambda(): @@ -536,10 +491,10 @@ def test_true_online_sarsa_lambda(): approximator_params = dict( input_shape=(features.size,), output_shape=(mdp_continuous.info.action_space.n,), - n_actions=mdp_continuous.info.action_space.n + n_actions=mdp_continuous.info.action_space.n, + phi=features, ) - agent = TrueOnlineSARSALambda(mdp_continuous.info, pi, - Parameter(.1), .9, features=features, + agent = TrueOnlineSARSALambda(mdp_continuous.info, pi, Parameter(.1), .9, approximator_params=approximator_params) core = Core(agent, mdp_continuous) @@ -571,11 +526,11 @@ def test_true_online_sarsa_lambda_save(tmpdir): approximator_params = dict( input_shape=(features.size,), output_shape=(mdp_continuous.info.action_space.n,), - n_actions=mdp_continuous.info.action_space.n + n_actions=mdp_continuous.info.action_space.n, + phi=features, ) - agent_save = TrueOnlineSARSALambda(mdp_continuous.info, pi, - Parameter(.1), .9, features=features, - approximator_params=approximator_params) + agent_save = TrueOnlineSARSALambda(mdp_continuous.info, pi, Parameter(.1), .9, + approximator_params=approximator_params) core = Core(agent_save, mdp_continuous) @@ -585,11 +540,7 @@ def test_true_online_sarsa_lambda_save(tmpdir): agent_save.save(agent_path) agent_load = Agent.load(agent_path) - for att, method in vars(agent_save).items(): - save_attr = getattr(agent_save, att) - load_attr = getattr(agent_load, att) - - tu.assert_eq(save_attr, load_attr) + assert_properly_loaded(agent_save, agent_load) def test_r_learning(): @@ -623,11 +574,7 @@ def test_r_learning_save(tmpdir): agent_save.save(agent_path) agent_load = Agent.load(agent_path) - for att, method in vars(agent_save).items(): - save_attr = getattr(agent_save, att) - load_attr = getattr(agent_load, att) - - tu.assert_eq(save_attr, load_attr) + assert_properly_loaded(agent_save, agent_load) def test_rq_learning(): @@ -707,7 +654,4 @@ def test_rq_learning_save(tmpdir): agent_save.save(agent_path) agent_load = Agent.load(agent_path) - for att, method in vars(agent_save).items(): - save_attr = getattr(agent_save, att) - load_attr = getattr(agent_load, att) - tu.assert_eq(save_attr, load_attr) + assert_properly_loaded(agent_save, agent_load) From 9fb0b3a4a2553902622ae34a2a0d862b76935610 Mon Sep 17 00:00:00 2001 From: Boris_il_forte Date: 
Sat, 21 Oct 2023 22:57:15 +0200 Subject: [PATCH 23/24] Cleanup fit interface - dataset now has everything: transitions, policy state, info, theta_list - removed the info dictionary from fit interface, as everything is inside the dataset - cleanup of some test to be more reasonable and up to date with the interface --- .../classic_actor_critic/copdac_q.py | 2 +- .../classic_actor_critic/stochastic_ac.py | 2 +- .../actor_critic/deep_actor_critic/a2c.py | 2 +- .../actor_critic/deep_actor_critic/ddpg.py | 2 +- .../deep_actor_critic/deep_actor_critic.py | 2 +- .../actor_critic/deep_actor_critic/ppo.py | 2 +- .../deep_actor_critic/ppo_bptt.py | 2 +- .../actor_critic/deep_actor_critic/sac.py | 2 +- .../actor_critic/deep_actor_critic/trpo.py | 2 +- .../black_box_optimization.py | 19 ++++------- .../policy_gradient/policy_gradient.py | 2 +- .../algorithms/value/batch_td/boosted_fqi.py | 2 +- .../algorithms/value/batch_td/double_fqi.py | 2 +- mushroom_rl/algorithms/value/batch_td/fqi.py | 2 +- mushroom_rl/algorithms/value/batch_td/lspi.py | 2 +- .../algorithms/value/dqn/abstract_dqn.py | 2 +- .../algorithms/value/dqn/categorical_dqn.py | 2 +- .../algorithms/value/dqn/maxmin_dqn.py | 16 ++++----- .../algorithms/value/dqn/quantile_dqn.py | 2 +- mushroom_rl/algorithms/value/dqn/rainbow.py | 2 +- mushroom_rl/algorithms/value/td/td.py | 2 +- mushroom_rl/core/_impl/list_dataset.py | 22 ++++++++----- mushroom_rl/core/_impl/numpy_dataset.py | 12 ++++--- mushroom_rl/core/_impl/torch_dataset.py | 12 ++++--- mushroom_rl/core/agent.py | 4 +-- mushroom_rl/core/core.py | 27 ++++++++++----- mushroom_rl/core/dataset.py | 33 +++++++++++++++---- mushroom_rl/policy/policy.py | 6 ++-- mushroom_rl/policy/torch_policy.py | 3 -- tests/core/test_core.py | 2 +- tests/policy/test_policy_interface.py | 6 ++-- tests/utils/test_preprocessors.py | 10 +++--- 32 files changed, 119 insertions(+), 91 deletions(-) diff --git a/mushroom_rl/algorithms/actor_critic/classic_actor_critic/copdac_q.py b/mushroom_rl/algorithms/actor_critic/classic_actor_critic/copdac_q.py index 7086492ed..edb0e9739 100644 --- a/mushroom_rl/algorithms/actor_critic/classic_actor_critic/copdac_q.py +++ b/mushroom_rl/algorithms/actor_critic/classic_actor_critic/copdac_q.py @@ -59,7 +59,7 @@ def __init__(self, mdp_info, policy, mu, alpha_theta, alpha_omega, alpha_v, valu super().__init__(mdp_info, policy) - def fit(self, dataset, **info): + def fit(self, dataset): for step in dataset: s, a, r, ss, absorbing, _ = step diff --git a/mushroom_rl/algorithms/actor_critic/classic_actor_critic/stochastic_ac.py b/mushroom_rl/algorithms/actor_critic/classic_actor_critic/stochastic_ac.py index 03434927a..2c49123b9 100644 --- a/mushroom_rl/algorithms/actor_critic/classic_actor_critic/stochastic_ac.py +++ b/mushroom_rl/algorithms/actor_critic/classic_actor_critic/stochastic_ac.py @@ -60,7 +60,7 @@ def episode_start(self, episode_info): return super().episode_start(episode_info) - def fit(self, dataset, **info): + def fit(self, dataset): for step in dataset: s, a, r, ss, absorbing, _ = step diff --git a/mushroom_rl/algorithms/actor_critic/deep_actor_critic/a2c.py b/mushroom_rl/algorithms/actor_critic/deep_actor_critic/a2c.py index fc6ae6aec..51b127439 100644 --- a/mushroom_rl/algorithms/actor_critic/deep_actor_critic/a2c.py +++ b/mushroom_rl/algorithms/actor_critic/deep_actor_critic/a2c.py @@ -56,7 +56,7 @@ def __init__(self, mdp_info, policy, actor_optimizer, critic_params, super().__init__(mdp_info, policy, actor_optimizer, policy.parameters()) - def fit(self, dataset, 
**info): + def fit(self, dataset): state, action, reward, next_state, absorbing, _ = dataset.parse(to='torch') v, adv = compute_advantage_montecarlo(self._V, state, next_state, diff --git a/mushroom_rl/algorithms/actor_critic/deep_actor_critic/ddpg.py b/mushroom_rl/algorithms/actor_critic/deep_actor_critic/ddpg.py index 54c6abb73..bf3380a6a 100644 --- a/mushroom_rl/algorithms/actor_critic/deep_actor_critic/ddpg.py +++ b/mushroom_rl/algorithms/actor_critic/deep_actor_critic/ddpg.py @@ -97,7 +97,7 @@ def __init__(self, mdp_info, policy_class, policy_params, super().__init__(mdp_info, policy, actor_optimizer, policy_parameters) - def fit(self, dataset, **info): + def fit(self, dataset): self._replay_memory.add(dataset) if self._replay_memory.initialized: state, action, reward, next_state, absorbing, _ =\ diff --git a/mushroom_rl/algorithms/actor_critic/deep_actor_critic/deep_actor_critic.py b/mushroom_rl/algorithms/actor_critic/deep_actor_critic/deep_actor_critic.py index d007b1ccd..f97b1d4d2 100644 --- a/mushroom_rl/algorithms/actor_critic/deep_actor_critic/deep_actor_critic.py +++ b/mushroom_rl/algorithms/actor_critic/deep_actor_critic/deep_actor_critic.py @@ -42,7 +42,7 @@ def __init__(self, mdp_info, policy, actor_optimizer, parameters): super().__init__(mdp_info, policy) - def fit(self, dataset, **info): + def fit(self, dataset): """ Fit step. diff --git a/mushroom_rl/algorithms/actor_critic/deep_actor_critic/ppo.py b/mushroom_rl/algorithms/actor_critic/deep_actor_critic/ppo.py index 200c71115..748ed39f8 100644 --- a/mushroom_rl/algorithms/actor_critic/deep_actor_critic/ppo.py +++ b/mushroom_rl/algorithms/actor_critic/deep_actor_critic/ppo.py @@ -70,7 +70,7 @@ def __init__(self, mdp_info, policy, actor_optimizer, critic_params, super().__init__(mdp_info, policy) - def fit(self, dataset, **info): + def fit(self, dataset): state, action, reward, next_state, absorbing, last = dataset.parse(to='torch') v_target, adv = compute_gae(self._V, state, next_state, reward, absorbing, last, diff --git a/mushroom_rl/algorithms/actor_critic/deep_actor_critic/ppo_bptt.py b/mushroom_rl/algorithms/actor_critic/deep_actor_critic/ppo_bptt.py index 97bc8d565..5efa65448 100644 --- a/mushroom_rl/algorithms/actor_critic/deep_actor_critic/ppo_bptt.py +++ b/mushroom_rl/algorithms/actor_critic/deep_actor_critic/ppo_bptt.py @@ -78,7 +78,7 @@ def divide_state_to_env_hidden_batch(self, states): assert len(states.shape) > 1, "This function only divides batches of states." 
return states[:, 0:self._dim_env_state], states[:, self._dim_env_state:] - def fit(self, dataset, **info): + def fit(self, dataset): obs, act, r, obs_next, absorbing, last = dataset.parse(to='torch') policy_state, policy_next_state = dataset.parse_policy_state(to='torch') obs_seq, policy_state_seq, act_seq, obs_next_seq, policy_next_state_seq, lengths = \ diff --git a/mushroom_rl/algorithms/actor_critic/deep_actor_critic/sac.py b/mushroom_rl/algorithms/actor_critic/deep_actor_critic/sac.py index ae3b9d3cb..95adbf5e3 100644 --- a/mushroom_rl/algorithms/actor_critic/deep_actor_critic/sac.py +++ b/mushroom_rl/algorithms/actor_critic/deep_actor_critic/sac.py @@ -280,7 +280,7 @@ def __init__(self, mdp_info, actor_mu_params, actor_sigma_params, actor_optimize super().__init__(mdp_info, policy, actor_optimizer, policy_parameters) - def fit(self, dataset, **info): + def fit(self, dataset): self._replay_memory.add(dataset) if self._replay_memory.initialized: state, action, reward, next_state, absorbing, _ = self._replay_memory.get(self._batch_size()) diff --git a/mushroom_rl/algorithms/actor_critic/deep_actor_critic/trpo.py b/mushroom_rl/algorithms/actor_critic/deep_actor_critic/trpo.py index fe80e7887..ded0856db 100644 --- a/mushroom_rl/algorithms/actor_critic/deep_actor_critic/trpo.py +++ b/mushroom_rl/algorithms/actor_critic/deep_actor_critic/trpo.py @@ -81,7 +81,7 @@ def __init__(self, mdp_info, policy, critic_params, ent_coeff=0., max_kl=.001, l super().__init__(mdp_info, policy) - def fit(self, dataset, **info): + def fit(self, dataset): state, action, reward, next_state, absorbing, last = dataset.parse(to='torch') v_target, adv = compute_gae(self._V, state, next_state, reward, absorbing, last, diff --git a/mushroom_rl/algorithms/policy_search/black_box_optimization/black_box_optimization.py b/mushroom_rl/algorithms/policy_search/black_box_optimization/black_box_optimization.py index cfbda4fc6..0ea85988a 100644 --- a/mushroom_rl/algorithms/policy_search/black_box_optimization/black_box_optimization.py +++ b/mushroom_rl/algorithms/policy_search/black_box_optimization/black_box_optimization.py @@ -20,32 +20,25 @@ def __init__(self, mdp_info, distribution, policy): """ self.distribution = distribution - self._theta_list = list() - self._add_save_attr(distribution='mushroom', _theta_list='pickle') + self._add_save_attr(distribution='mushroom') super().__init__(mdp_info, policy, is_episodic=True) def episode_start(self, episode_info): theta = self.distribution.sample() - self._theta_list.append(theta) self.policy.set_weights(theta) - return super().episode_start(episode_info) + policy_state, _ = super().episode_start(episode_info) - def fit(self, dataset, **info): - Jep = dataset.compute_J(self.mdp_info.gamma) + return policy_state, theta - Jep = np.array(Jep) - theta = np.array(self._theta_list) + def fit(self, dataset): + Jep = np.array(dataset.discounted_return) + theta = np.array(dataset.theta_list) self._update(Jep, theta) - self._theta_list = list() - - def stop(self): - self._theta_list = list() - def _update(self, Jep, theta): """ Function that implements the update routine of distribution parameters. 
diff --git a/mushroom_rl/algorithms/policy_search/policy_gradient/policy_gradient.py b/mushroom_rl/algorithms/policy_search/policy_gradient/policy_gradient.py index f82d3095b..6a4afef3f 100644 --- a/mushroom_rl/algorithms/policy_search/policy_gradient/policy_gradient.py +++ b/mushroom_rl/algorithms/policy_search/policy_gradient/policy_gradient.py @@ -31,7 +31,7 @@ def __init__(self, mdp_info, policy, optimizer): super().__init__(mdp_info, policy) - def fit(self, dataset, **info): + def fit(self, dataset): J = list() self.df = 1. self.J_episode = 0. diff --git a/mushroom_rl/algorithms/value/batch_td/boosted_fqi.py b/mushroom_rl/algorithms/value/batch_td/boosted_fqi.py index de271fd93..9c75ff757 100644 --- a/mushroom_rl/algorithms/value/batch_td/boosted_fqi.py +++ b/mushroom_rl/algorithms/value/batch_td/boosted_fqi.py @@ -29,7 +29,7 @@ def __init__(self, mdp_info, policy, approximator, n_iterations, super().__init__(mdp_info, policy, approximator, n_iterations, approximator_params, fit_params, quiet) - def fit(self, dataset, **info): + def fit(self, dataset): state, action, reward, next_state, absorbing, _ = dataset.parse() for _ in trange(self._n_iterations(), dynamic_ncols=True, disable=self._quiet, leave=False): if self._target is None: diff --git a/mushroom_rl/algorithms/value/batch_td/double_fqi.py b/mushroom_rl/algorithms/value/batch_td/double_fqi.py index 923b1184f..ed0354988 100644 --- a/mushroom_rl/algorithms/value/batch_td/double_fqi.py +++ b/mushroom_rl/algorithms/value/batch_td/double_fqi.py @@ -18,7 +18,7 @@ def __init__(self, mdp_info, policy, approximator, n_iterations, super().__init__(mdp_info, policy, approximator, n_iterations, approximator_params, fit_params, quiet) - def fit(self, dataset, **info): + def fit(self, dataset): for _ in trange(self._n_iterations(), dynamic_ncols=True, disable=self._quiet, leave=False): state = list() action = list() diff --git a/mushroom_rl/algorithms/value/batch_td/fqi.py b/mushroom_rl/algorithms/value/batch_td/fqi.py index 6b6466a90..d52307961 100644 --- a/mushroom_rl/algorithms/value/batch_td/fqi.py +++ b/mushroom_rl/algorithms/value/batch_td/fqi.py @@ -33,7 +33,7 @@ def __init__(self, mdp_info, policy, approximator, n_iterations, super().__init__(mdp_info, policy, approximator, approximator_params, fit_params) - def fit(self, dataset, **info): + def fit(self, dataset): state, action, reward, next_state, absorbing, _ = dataset.parse() for _ in trange(self._n_iterations(), dynamic_ncols=True, disable=self._quiet, leave=False): if self._target is None: diff --git a/mushroom_rl/algorithms/value/batch_td/lspi.py b/mushroom_rl/algorithms/value/batch_td/lspi.py index e1d80aff8..0e380d79e 100644 --- a/mushroom_rl/algorithms/value/batch_td/lspi.py +++ b/mushroom_rl/algorithms/value/batch_td/lspi.py @@ -26,7 +26,7 @@ def __init__(self, mdp_info, policy, approximator_params=None, epsilon=1e-2, fit super().__init__(mdp_info, policy, LinearApproximator, approximator_params, fit_params) - def fit(self, dataset, **info): + def fit(self, dataset): state, action, reward, next_state, absorbing, _ = dataset.parse() phi_state = self.approximator.model.phi(state) diff --git a/mushroom_rl/algorithms/value/dqn/abstract_dqn.py b/mushroom_rl/algorithms/value/dqn/abstract_dqn.py index 70a9bec10..411eac7d7 100644 --- a/mushroom_rl/algorithms/value/dqn/abstract_dqn.py +++ b/mushroom_rl/algorithms/value/dqn/abstract_dqn.py @@ -81,7 +81,7 @@ def __init__(self, mdp_info, policy, approximator, approximator_params, super().__init__(mdp_info, policy) - def fit(self, 
dataset, **info): + def fit(self, dataset): self._fit(dataset) self._n_updates += 1 diff --git a/mushroom_rl/algorithms/value/dqn/categorical_dqn.py b/mushroom_rl/algorithms/value/dqn/categorical_dqn.py index daaea344a..7dee67c1c 100644 --- a/mushroom_rl/algorithms/value/dqn/categorical_dqn.py +++ b/mushroom_rl/algorithms/value/dqn/categorical_dqn.py @@ -113,7 +113,7 @@ def __init__(self, mdp_info, policy, approximator_params, n_atoms, v_min, super().__init__(mdp_info, policy, TorchApproximator, **params) - def fit(self, dataset, **info): + def fit(self, dataset): self._replay_memory.add(dataset) if self._replay_memory.initialized: state, action, reward, next_state, absorbing, _ =\ diff --git a/mushroom_rl/algorithms/value/dqn/maxmin_dqn.py b/mushroom_rl/algorithms/value/dqn/maxmin_dqn.py index 144c64d91..8f202eef9 100644 --- a/mushroom_rl/algorithms/value/dqn/maxmin_dqn.py +++ b/mushroom_rl/algorithms/value/dqn/maxmin_dqn.py @@ -11,8 +11,7 @@ class MaxminDQN(DQN): Lan Q. et al.. 2020. """ - def __init__(self, mdp_info, policy, approximator, n_approximators, - **params): + def __init__(self, mdp_info, policy, approximator, n_approximators, **params): """ Constructor. @@ -26,17 +25,15 @@ def __init__(self, mdp_info, policy, approximator, n_approximators, super().__init__(mdp_info, policy, approximator, **params) - def fit(self, dataset, **info): + def fit(self, dataset): self._fit_params['idx'] = np.random.randint(self._n_approximators) - super().fit(dataset, **info) + super().fit(dataset) - def _initialize_regressors(self, approximator, apprx_params_train, - apprx_params_target): + def _initialize_regressors(self, approximator, apprx_params_train, apprx_params_target): self.approximator = Regressor(approximator, n_models=self._n_approximators, - prediction='min', - **apprx_params_train) + prediction='min', **apprx_params_train) self.target_approximator = Regressor(approximator, n_models=self._n_approximators, prediction='min', @@ -45,5 +42,4 @@ def _initialize_regressors(self, approximator, apprx_params_train, def _update_target(self): for i in range(len(self.target_approximator)): - self.target_approximator[i].set_weights( - self.approximator[i].get_weights()) + self.target_approximator[i].set_weights(self.approximator[i].get_weights()) diff --git a/mushroom_rl/algorithms/value/dqn/quantile_dqn.py b/mushroom_rl/algorithms/value/dqn/quantile_dqn.py index df60e9767..32a85b0c4 100644 --- a/mushroom_rl/algorithms/value/dqn/quantile_dqn.py +++ b/mushroom_rl/algorithms/value/dqn/quantile_dqn.py @@ -97,7 +97,7 @@ def __init__(self, mdp_info, policy, approximator_params, n_quantiles, **params) super().__init__(mdp_info, policy, TorchApproximator, **params) - def fit(self, dataset, **info): + def fit(self, dataset): self._replay_memory.add(dataset) if self._replay_memory.initialized: state, action, reward, next_state, absorbing, _ =\ diff --git a/mushroom_rl/algorithms/value/dqn/rainbow.py b/mushroom_rl/algorithms/value/dqn/rainbow.py index f167bae98..f058503fb 100644 --- a/mushroom_rl/algorithms/value/dqn/rainbow.py +++ b/mushroom_rl/algorithms/value/dqn/rainbow.py @@ -119,7 +119,7 @@ def __init__(self, mdp_info, policy, approximator_params, n_atoms, v_min, super().__init__(mdp_info, policy, TorchApproximator, **params) - def fit(self, dataset, **info): + def fit(self, dataset): self._replay_memory.add(dataset, np.ones(len(dataset)) * self._replay_memory.max_priority, n_steps_return=self._n_steps_return, gamma=self.mdp_info.gamma) if self._replay_memory.initialized: diff --git 
a/mushroom_rl/algorithms/value/td/td.py b/mushroom_rl/algorithms/value/td/td.py index 99178b850..bf87ae2b0 100644 --- a/mushroom_rl/algorithms/value/td/td.py +++ b/mushroom_rl/algorithms/value/td/td.py @@ -26,7 +26,7 @@ def __init__(self, mdp_info, policy, approximator, learning_rate): super().__init__(mdp_info, policy) - def fit(self, dataset, **info): + def fit(self, dataset): assert len(dataset) == 1 state, action, reward, next_state, absorbing, _ = dataset.item() diff --git a/mushroom_rl/core/_impl/list_dataset.py b/mushroom_rl/core/_impl/list_dataset.py index 3f781ba8b..f88e3c721 100644 --- a/mushroom_rl/core/_impl/list_dataset.py +++ b/mushroom_rl/core/_impl/list_dataset.py @@ -6,29 +6,33 @@ class ListDataset(Serializable): - def __init__(self): + def __init__(self, is_stateful): self._dataset = list() self._policy_dataset = list() + self._is_stateful = is_stateful self._add_save_attr( _dataset='pickle', - _policy_dataset='pickle' + _policy_dataset='pickle', + _is_stateful='primitive' ) @classmethod def from_array(cls, states, actions, rewards, next_states, absorbings, lasts, policy_states=None, policy_next_states=None): - dataset = cls() + is_stateful = (policy_states is not None) and (policy_next_states is not None) - if policy_states is None: - for s, a, r, ss, ab, last in zip(states, actions, rewards, next_states, - absorbings.astype(bool), lasts.astype(bool)): - dataset.append(s, a, r.item(), ss, ab.item(), last.item()) - else: + dataset = cls(is_stateful) + + if dataset._is_stateful: for s, a, r, ss, ab, last, ps, pss in zip(states, actions, rewards, next_states, absorbings.astype(bool), lasts.astype(bool), policy_states, policy_next_states): dataset.append(s, a, r.item(), ss, ab.item(), last.item(), ps.item(), pss.item()) + else: + for s, a, r, ss, ab, last in zip(states, actions, rewards, next_states, + absorbings.astype(bool), lasts.astype(bool)): + dataset.append(s, a, r.item(), ss, ab.item(), last.item()) return dataset @@ -38,7 +42,7 @@ def __len__(self): def append(self, *step): step_copy = deepcopy(step) self._dataset.append(step_copy[:6]) - if len(step_copy) == 8: + if self._is_stateful: self._policy_dataset.append(step_copy[6:]) def clear(self): diff --git a/mushroom_rl/core/_impl/numpy_dataset.py b/mushroom_rl/core/_impl/numpy_dataset.py index 81141c729..d8986d2c5 100644 --- a/mushroom_rl/core/_impl/numpy_dataset.py +++ b/mushroom_rl/core/_impl/numpy_dataset.py @@ -100,7 +100,7 @@ def append(self, state, action, reward, next_state, absorbing, last, policy_stat self._absorbing[i] = absorbing self._last[i] = last - if policy_state is not None: + if self._is_stateful: self._policy_states[i] = policy_state self._policy_next_states[i] = policy_next_state @@ -114,7 +114,7 @@ def clear(self): self._absorbing = np.empty_like(self._absorbing) self._last = np.empty_like(self._last) - if self._policy_states is not None: + if self._is_stateful: self._policy_states = np.empty_like(self._policy_states) self._policy_next_states = np.empty_like(self._policy_next_states) @@ -131,7 +131,7 @@ def get_view(self, index): view._last = self.last[index, ...] view._len = view._states.shape[0] - if self._policy_states is not None: + if self._is_stateful: view._policy_states = self._policy_states[index, ...] view._policy_next_states = self._policy_next_states[index, ...] 
@@ -153,7 +153,7 @@ def __add__(self, other): result._last[len(self)-1] = True result._len = len(self) + len(other) - if result._policy_states is not None: + if self._is_stateful: result._policy_states = np.concatenate((self.policy_state, other.policy_state)) result._policy_next_states = np.concatenate((self.policy_next_state, other.policy_next_state)) @@ -190,3 +190,7 @@ def policy_state(self): @property def policy_next_state(self): return self._policy_next_states[:len(self)] + + @property + def _is_stateful(self): + return self._policy_states is not None \ No newline at end of file diff --git a/mushroom_rl/core/_impl/torch_dataset.py b/mushroom_rl/core/_impl/torch_dataset.py index 073aa68ac..7e7309e88 100644 --- a/mushroom_rl/core/_impl/torch_dataset.py +++ b/mushroom_rl/core/_impl/torch_dataset.py @@ -100,7 +100,7 @@ def append(self, state, action, reward, next_state, absorbing, last, policy_stat self._absorbing[i] = absorbing self._last[i] = last - if policy_state is not None: + if self._is_stateful: self._policy_states[i] = policy_state self._policy_next_states[i] = policy_next_state @@ -114,7 +114,7 @@ def clear(self): self._absorbing = torch.empty_like(self._absorbing) self._last = torch.empty_like(self._last) - if self._policy_states is not None: + if self._is_stateful: self._policy_states = torch.empty_like(self._policy_states) self._policy_next_states = torch.empty_like(self._policy_next_states) @@ -131,7 +131,7 @@ def get_view(self, index): view._last = self._last[index, ...] view._len = view._states.shape[0] - if self._policy_states is not None: + if self._is_stateful: view._policy_states = self._policy_states[index, ...] view._policy_next_states = self._policy_next_states[index, ...] @@ -153,7 +153,7 @@ def __add__(self, other): result._last[len(self) - 1] = True result._len = len(self) + len(other) - if result._policy_states is not None: + if self._is_stateful: result._policy_states = torch.concatenate((self.policy_state, other.policy_state)) result._policy_next_states = torch.concatenate((self.policy_next_state, other.policy_next_state)) @@ -190,3 +190,7 @@ def policy_state(self): @property def policy_next_state(self): return self._policy_next_states[:len(self)] + + @property + def _is_stateful(self): + return self._policy_states is not None diff --git a/mushroom_rl/core/agent.py b/mushroom_rl/core/agent.py index d31095831..136edf874 100644 --- a/mushroom_rl/core/agent.py +++ b/mushroom_rl/core/agent.py @@ -66,7 +66,7 @@ def __init__(self, mdp_info, policy, is_episodic=False, backend='numpy'): _logger='none' ) - def fit(self, dataset, **info): + def fit(self, dataset): """ Fit step. @@ -106,7 +106,7 @@ def episode_start(self, episode_info): episode_info (dict): a dictionary containing the information at reset, such as context. 
""" - return self.policy.reset() + return self.policy.reset(), None def stop(self): """ diff --git a/mushroom_rl/core/core.py b/mushroom_rl/core/core.py index 5c76df230..1dc5154ac 100644 --- a/mushroom_rl/core/core.py +++ b/mushroom_rl/core/core.py @@ -27,6 +27,7 @@ def __init__(self, agent, mdp, callbacks_fit=None, callback_step=None, record_di self._state = None self._policy_state = None + self._current_theta = None self._episode_steps = None self._core_logic = CoreLogic() @@ -59,7 +60,7 @@ def learn(self, n_steps=None, n_episodes=None, n_steps_per_fit=None, assert (render and record) or (not record), "To record, the render flag must be set to true" self._core_logic.initialize_fit(n_steps_per_fit, n_episodes_per_fit) - dataset = Dataset(self.mdp.info, self.agent.policy.policy_state_shape, n_steps_per_fit, n_episodes_per_fit) + dataset = Dataset(self.mdp.info, self.agent.info, n_steps_per_fit, n_episodes_per_fit) self._run(dataset, n_steps, n_episodes, render, quiet, record) @@ -87,7 +88,7 @@ def evaluate(self, initial_states=None, n_steps=None, n_episodes=None, render=Fa self._core_logic.initialize_evaluate() n_episodes_dataset = len(initial_states) if initial_states is not None else n_episodes - dataset = Dataset(self.mdp.info, self.agent.policy.policy_state_shape, n_steps, n_episodes_dataset) + dataset = Dataset(self.mdp.info, self.agent.info, n_steps, n_episodes_dataset) return self._run(dataset, n_steps, n_episodes, render, quiet, record, initial_states) @@ -98,6 +99,8 @@ def _run(self, dataset, n_steps, n_episodes, render, quiet, record, initial_stat while self._core_logic.move_required(): if last: self._reset(initial_states) + if self.agent.info.is_episodic: + dataset.append_theta(self._current_theta) sample, step_info = self._step(render, record) @@ -120,10 +123,7 @@ def _run(self, dataset, n_steps, n_episodes, render, quiet, record, initial_stat self.agent.stop() self.mdp.stop() - if record: - self._record.stop() - - self._core_logic.terminate_run() + self._end(record) return dataset @@ -150,7 +150,7 @@ def _step(self, render, record): self._episode_steps += 1 - last = not(self._episode_steps < self.mdp.info.horizon and not absorbing) + last = self._episode_steps >= self.mdp.info.horizon or absorbing state = self._state policy_state = self._policy_state @@ -168,12 +168,23 @@ def _reset(self, initial_states): initial_state = self._core_logic.get_initial_state(initial_states) state, episode_info = self.mdp.reset(initial_state) - self._policy_state = self.agent.episode_start(episode_info) + self._policy_state, self._current_theta = self.agent.episode_start(episode_info) self._state = self._preprocess(state) self.agent.next_action = None self._episode_steps = 0 + def _end(self, record): + self._state = None + self._policy_state = None + self._current_theta = None + self._episode_steps = None + + if record: + self._record.stop() + + self._core_logic.terminate_run() + def _preprocess(self, state): """ Method to apply state preprocessors. 
diff --git a/mushroom_rl/core/dataset.py b/mushroom_rl/core/dataset.py index 6e94e721d..18deb2192 100644 --- a/mushroom_rl/core/dataset.py +++ b/mushroom_rl/core/dataset.py @@ -8,7 +8,7 @@ class Dataset(Serializable): - def __init__(self, mdp_info, policy_state_shape, n_steps=None, n_episodes=None): + def __init__(self, mdp_info, agent_info, n_steps=None, n_episodes=None): assert (n_steps is not None and n_episodes is None) or (n_steps is None and n_episodes is not None) if n_steps is not None: @@ -23,14 +23,17 @@ def __init__(self, mdp_info, policy_state_shape, n_steps=None, n_episodes=None): action_shape = (n_samples,) + mdp_info.action_space.shape reward_shape = (n_samples,) - if policy_state_shape is not None: - policy_state_shape = (n_samples,) + policy_state_shape + if agent_info.is_stateful: + policy_state_shape = (n_samples,) + agent_info.policy_state_shape + else: + policy_state_shape = None state_type = mdp_info.observation_space.data_type action_type = mdp_info.action_space.data_type self._info = defaultdict(list) self._episode_info = defaultdict(list) + self._theta_list = list() if mdp_info.backend == 'numpy': self._data = NumpyDataset(state_type, state_shape, action_type, action_shape, reward_shape, @@ -48,14 +51,16 @@ def __init__(self, mdp_info, policy_state_shape, n_steps=None, n_episodes=None): self._add_save_attr( _info='pickle', _episode_info='pickle', + _theta_list='pickle', _data='mushroom', _converter='primitive', _gamma='primitive', ) @classmethod - def from_array(cls, states, actions, rewards, next_states, absorbings, lasts, policy_state=None, - policy_next_state=None, info=None, episode_info=None, gamma=0.99, backend='numpy'): + def from_array(cls, states, actions, rewards, next_states, absorbings, lasts, + policy_state=None, policy_next_state=None, info=None, episode_info=None, theta_list=None, + gamma=0.99, backend='numpy'): """ Creates a dataset of transitions from the provided arrays. @@ -69,7 +74,8 @@ def from_array(cls, states, actions, rewards, next_states, absorbings, lasts, po policy_state (np.ndarray, None): array of policy internal states; policy_next_state (np.ndarray, None): array of next policy internal states; info (dict, None): dictiornay of step info; - episode_info (dict, None): dictiornary of episode info + episode_info (dict, None): dictiornary of episode info; + theta_list (list, None): list of policy parameters; gamma (float, 0.99): discount factor; backend (str, 'numpy'): backend to be used by the dataset. 
@@ -95,6 +101,11 @@ def from_array(cls, states, actions, rewards, next_states, absorbings, lasts, po
         else:
             dataset._episode_info = episode_info.copy()
 
+        if theta_list is None:
+            dataset._theta_list = list()
+        else:
+            dataset._theta_list = theta_list
+
         if backend == 'numpy':
             dataset._data = NumpyDataset.from_array(states, actions, rewards, next_states, absorbings, lasts)
             dataset._converter = NumpyConversion
@@ -108,6 +119,7 @@ def from_array(cls, states, actions, rewards, next_states, absorbings, lasts, po
         dataset._add_save_attr(
             _info='pickle',
             _episode_info='pickle',
+            _theta_list='pickle',
             _data='mushroom',
             _converter='primitive',
             _gamma='primitive'
@@ -122,6 +134,9 @@ def append(self, step, info):
     def append_episode_info(self, info):
         self._append_info(self._episode_info, info)
 
+    def append_theta(self, theta):
+        self._theta_list.append(theta)
+
     def get_info(self, field, index=None):
         if index is None:
             return self._info[field]
@@ -165,6 +180,7 @@ def __add__(self, other):
 
         result._info = new_info
         result._episode_info = new_episode_info
+        result._theta_list = self._theta_list + other._theta_list
         result._data = self._data + other._data
 
         return result
@@ -212,6 +228,10 @@ def info(self):
     def episode_info(self):
         return self._episode_info
 
+    @property
+    def theta_list(self):
+        return self._theta_list
+
     @property
     def episodes_length(self):
         """
@@ -372,6 +392,7 @@ def compute_metrics(self, gamma=1.):
         If no episode has been completed, it returns 0 for all values.
 
         """
+        i = 0
         for i in reversed(range(len(self))):
             if self.last[i]:
                 i += 1
diff --git a/mushroom_rl/policy/policy.py b/mushroom_rl/policy/policy.py
index fa1964404..0d55b6c00 100644
--- a/mushroom_rl/policy/policy.py
+++ b/mushroom_rl/policy/policy.py
@@ -19,7 +19,7 @@ def __init__(self, policy_state_shape=None):
         """
         self.policy_state_shape = policy_state_shape
 
-    def __call__(self, state, action=None, policy_state=None):
+    def __call__(self, state, action, policy_state):
         """
         Compute the probability of taking action in a certain state following the policy.
 
@@ -37,7 +37,7 @@ def __call__(self, state, action=None, policy_state=None):
         """
         raise NotImplementedError
 
-    def draw_action(self, state, policy_state=None):
+    def draw_action(self, state, policy_state):
         """
         Sample an action in ``state`` using the policy.
@@ -85,7 +85,7 @@ def __init__(self, policy_state_shape=None): """ super().__init__(policy_state_shape) - def diff_log(self, state, action, policy_state=None): + def diff_log(self, state, action, policy_state): """ Compute the gradient of the logarithm of the probability density function, in the specified state and action pair, i.e.: diff --git a/mushroom_rl/policy/torch_policy.py b/mushroom_rl/policy/torch_policy.py index b626ba508..e9bd9a86a 100644 --- a/mushroom_rl/policy/torch_policy.py +++ b/mushroom_rl/policy/torch_policy.py @@ -169,9 +169,6 @@ def parameters(self): """ raise NotImplementedError - def reset(self): - pass - @property def use_cuda(self): """ diff --git a/tests/core/test_core.py b/tests/core/test_core.py index 470a9bab0..aa6fc8778 100644 --- a/tests/core/test_core.py +++ b/tests/core/test_core.py @@ -20,7 +20,7 @@ def __init__(self, mdp_info): policy = RandomDiscretePolicy(mdp_info.action_space.n) super().__init__(mdp_info, policy) - def fit(self, dataset, **info): + def fit(self, dataset): pass diff --git a/tests/policy/test_policy_interface.py b/tests/policy/test_policy_interface.py index 680c2ce7b..dee48ba63 100644 --- a/tests/policy/test_policy_interface.py +++ b/tests/policy/test_policy_interface.py @@ -12,15 +12,15 @@ def abstract_method_tester(f, ex, *args): def test_policy_interface(): tmp = Policy() - abstract_method_tester(tmp.__call__, NotImplementedError, None) + abstract_method_tester(tmp.__call__, NotImplementedError, None, None, None) abstract_method_tester(tmp.draw_action, NotImplementedError, None, None) tmp.reset() def test_parametric_policy(): tmp = ParametricPolicy() - abstract_method_tester(tmp.diff_log, RuntimeError, None, None) - abstract_method_tester(tmp.diff, RuntimeError, None, None) + abstract_method_tester(tmp.diff_log, RuntimeError, None, None, None) + abstract_method_tester(tmp.diff, RuntimeError, None, None, None) abstract_method_tester(tmp.set_weights, NotImplementedError, None) abstract_method_tester(tmp.get_weights, NotImplementedError) try: diff --git a/tests/utils/test_preprocessors.py b/tests/utils/test_preprocessors.py index dbe584f10..ed7987dac 100644 --- a/tests/utils/test_preprocessors.py +++ b/tests/utils/test_preprocessors.py @@ -64,20 +64,18 @@ def test_normalizing_preprocessor(tmpdir): alg_params = dict(batch_size=5, initial_replay_size=10, max_replay_size=500, target_update_frequency=50) - agent = DQN(mdp.info, pi, TorchApproximator, - approximator_params=approximator_params, **alg_params) + agent = DQN(mdp.info, pi, TorchApproximator, approximator_params=approximator_params, **alg_params) - norm_box = MinMaxPreprocessor(mdp_info=mdp.info, - clip_obs=5.0, alpha=0.001) + norm_box = MinMaxPreprocessor(mdp_info=mdp.info, clip_obs=5.0, alpha=0.001) agent.add_preprocessor(norm_box) core = Core(agent, mdp) core.learn(n_steps=100, n_steps_per_fit=1, quiet=True) + dataset = core.evaluate(n_steps=1000) # training correctly - assert (core._state.min() >= -norm_box._clip_obs - and core._state.max() <= norm_box._clip_obs) + assert (dataset.state.min() >= -norm_box._clip_obs and dataset.state.max() <= norm_box._clip_obs) # save current dict state_dict1 = deepcopy(norm_box.__dict__) From 85875464f39220cb4fd9a92e4ec87355b693c4cc Mon Sep 17 00:00:00 2001 From: boris-il-forte Date: Tue, 24 Oct 2023 19:53:09 +0200 Subject: [PATCH 24/24] First stub of vectorized and parallel environments - created the stub for parallel and vectorized environments interfaces - still to be tested - requires vectorized core --- 
mushroom_rl/core/parallel_environment.py | 175 ++++++++++++----------- mushroom_rl/core/vectorized_env.py | 29 ++++ 2 files changed, 118 insertions(+), 86 deletions(-) create mode 100644 mushroom_rl/core/vectorized_env.py diff --git a/mushroom_rl/core/parallel_environment.py b/mushroom_rl/core/parallel_environment.py index 6ed2e793b..a1bc90d3e 100644 --- a/mushroom_rl/core/parallel_environment.py +++ b/mushroom_rl/core/parallel_environment.py @@ -1,124 +1,127 @@ -from .environment import Environment - - -class ParallelEnvironment(object): +from multiprocessing import Pipe +from multiprocessing import Process + +from .vectorized_env import VectorizedEnvironment + + +def _parallel_env_worker(remote, env_class, use_generator, args, kwargs): + + if use_generator: + env = env_class.generate(*args, **kwargs) + else: + env = env_class(*args, **kwargs) + + try: + while True: + cmd, data = remote.recv() + if cmd == 'step': + action = data[0] + res = env.step(action) + remote.send(res) + elif cmd == 'reset': + init_states = data[0] + res = env.reset(init_states) + remote.send(res) + elif cmd in 'stop': + env.stop() + elif cmd == 'info': + remote.send(env.info) + elif cmd == 'seed': + env.seed(int(data)) + else: + raise NotImplementedError() + finally: + remote.close() + + +class ParallelEnvironment(VectorizedEnvironment): """ - Basic interface to generate and collect multiple copies of the same environment. + Basic interface to run in parallel multiple copies of the same environment. This class assumes that the environments are homogeneus, i.e. have the same type and MDP info. """ - def __init__(self, env_list): + def __init__(self, env_class, *args, n_envs=-1, use_generator=False, **kwargs): """ Constructor. Args: - env_list: list of the environments to be evaluated in parallel. + env_class (class): The environment class to be used; + *args: the positional arguments to give to the constructor or to the generator of the class; + n_envs (int, -1): number of parallel copies of environment to construct; + use_generator (bool, False): wheather to use the generator to build the environment or not; + **kwargs: keyword arguments to set to the constructor or to the generator; """ - self.envs = env_list + assert n_envs > 1 - @property - def info(self): - """ - Returns: - An object containing the info of all environments. + self._remotes, self._work_remotes = zip(*[Pipe() for _ in range(n_envs)]) + self._processes = [Process(target=_parallel_env_worker, + args=(work_remote, env_class, use_generator, args, kwargs)) + for work_remote in self._work_remotes] - """ - return self.envs[0].info + for p in self._processes: + p.start() - def __len__(self): - return len(self.envs) + self._remotes[0].send(('info', None)) + mdp_info = self._remotes[0].recv() - def __getitem__(self, item): - return self.envs[item] + super().__init__(mdp_info, n_envs) - def seed(self, seeds): - """ - Set the seed of all environments. + def step_all(self, env_mask, action): + for i, remote in enumerate(self._remotes): + if env_mask[i]: + remote.send(('step', action[i, :])) - Args: - seeds ([int, list]): the value of the seed or a list of seeds for each environment. The list lenght must be - equal to the number of parallel environments. 
+ results = [] + for i, remote in enumerate(self._remotes): + if env_mask[i]: + results.extend(remote.recv()) - """ - if isinstance(seeds, list): - assert len(seeds) == len(self) - for env, seed in zip(self.envs,seeds): - env.seed(seed) - else: - for env in self.envs: - env.seed(seeds) + return zip(*results) # FIXME!!! - def stop(self): - """ - Method used to stop an mdp. Useful when dealing with real world environments, simulators, or when using - openai-gym rendering + def reset_all(self, env_mask, state=None): + for i in range(self._n_envs): + state_i = state[i, :] if state is not None else None + self._remotes[i].send(('reset', state_i)) - """ - for env in self.envs: - env.stop() + results = [] + for i, remote in enumerate(self._remotes): + if env_mask[i]: + results.extend(remote.recv()) - @staticmethod - def make(env_name, n_envs, use_constructor=False, *args, **kwargs): - """ - Generate multiple copies of a given environment using the specified name and parameters. - The environment is created using the generate method, if available. Otherwise, the constructor is used. - See the `Environment.make` documentation for more information. + return zip(*results) # FIXME!!! - Args: - env_name (str): Name of the environment; - n_envs (int): Number of environments in parallel to generate; - use_constructor (bool, False): whether to force the method to use the constructor instead of the generate - method; - *args: positional arguments to be provided to the environment generator/constructor; - **kwargs: keyword arguments to be provided to the environment generator/constructor. + def seed(self, seed): + for remote in self._remotes: + remote.send(('seed', seed)) - Returns: - An instance of the constructed environment. + for remote in self._remotes: + remote.recv() - """ - if '.' in env_name: - env_data = env_name.split('.') - env_name = env_data[0] - args = env_data[1:] + list(args) - - env = Environment._registered_envs[env_name] - - if not use_constructor and hasattr(env, 'generate'): - return ParallelEnvironment.generate(env, *args, **kwargs) - else: - return ParallelEnvironment([env(*args, **kwargs) for _ in range(n_envs)]) - - @staticmethod - def init(env, n_envs, *args, **kwargs): - """ - Method to generate an array of multiple copies of the same environment, calling the constructor n_envs times - - Args: - env (class): the environment to be constructed; - *args: positional arguments to be passed to the constructor; - n_envs (int, 1): number of environments to generate; - **kwargs: keywords arguments to be passed to the constructor - - Returns: - A list containing multiple copies of the environment. + def stop(self): + for remote in self._remotes: + remote.send(('stop', None)) - """ - return + def __del__(self): + for remote in self._remotes: + remote.send(('close', None)) + for p in self._processes: + p.join() @staticmethod - def generate(env, n_envs, *args, **kwargs): + def generate(env, *args, n_envs=-1, **kwargs): """ Method to generate an array of multiple copies of the same environment, calling the generate method n_envs times Args: env (class): the environment to be constructed; *args: positional arguments to be passed to the constructor; - n_envs (int, 1): number of environments to generate; + n_envs (int, -1): number of environments to generate; **kwargs: keywords arguments to be passed to the constructor Returns: A list containing multiple copies of the environment. 
""" - return ParallelEnvironment([env.generate(*args, **kwargs) for _ in range(n_envs)]) \ No newline at end of file + use_generator = hasattr(env, 'generate') + return ParallelEnvironment(env, *args, n_envs=n_envs, use_generator=use_generator, **kwargs) \ No newline at end of file diff --git a/mushroom_rl/core/vectorized_env.py b/mushroom_rl/core/vectorized_env.py new file mode 100644 index 000000000..ad5bf38ab --- /dev/null +++ b/mushroom_rl/core/vectorized_env.py @@ -0,0 +1,29 @@ +import numpy as np + +from .environment import Environment + + +class VectorizedEnvironment(Environment): + """ + Class to create a Mushroom environment using the PyBullet simulator. + + """ + def __init__(self, mdp_info, n_envs): + self._n_envs = n_envs + super().__init__(mdp_info) + + def reset(self, state=None): + env_mask = np.zeros(dtype=bool) + env_mask[0] = True + return self.reset_all(env_mask, state) + + def step(self, action): + env_mask = np.zeros(dtype=bool) + env_mask[0] = True + return self.step_all(env_mask, action) + + def step_all(self, env_mask, action): + raise NotImplementedError + + def reset_all(self, env_mask, state=None): + raise NotImplementedError