From dea712f8af1ab56188e0661f0bc78e42dc3fbc90 Mon Sep 17 00:00:00 2001 From: Noah Farr Date: Sat, 28 Dec 2024 23:58:09 +0100 Subject: [PATCH] Finish locomotion envs --- examples/mujoco_ant_ppo.py | 121 ++++++++++++ examples/mujoco_ant_sac.py | 159 ---------------- examples/mujoco_half_cheetah_ppo.py | 121 ++++++++++++ examples/mujoco_half_cheetah_sac.py | 159 ---------------- examples/mujoco_hopper_ppo.py | 121 ++++++++++++ examples/mujoco_hopper_sac.py | 159 ---------------- examples/mujoco_walker_2d_ppo.py | 121 ++++++++++++ examples/mujoco_walker_2d_sac.py | 159 ---------------- mushroom_rl/environments/mujoco.py | 177 +++++++++++++----- .../mujoco_envs/air_hockey/double.py | 122 ++++++++---- .../mujoco_envs/air_hockey/single.py | 82 +++++--- mushroom_rl/environments/mujoco_envs/ant.py | 63 +++---- .../environments/mujoco_envs/half_cheetah.py | 40 ++-- .../environments/mujoco_envs/hopper.py | 112 +++++------ .../environments/mujoco_envs/walker_2d.py | 91 +++------ 15 files changed, 878 insertions(+), 929 deletions(-) create mode 100644 examples/mujoco_ant_ppo.py delete mode 100644 examples/mujoco_ant_sac.py create mode 100644 examples/mujoco_half_cheetah_ppo.py delete mode 100644 examples/mujoco_half_cheetah_sac.py create mode 100644 examples/mujoco_hopper_ppo.py delete mode 100644 examples/mujoco_hopper_sac.py create mode 100644 examples/mujoco_walker_2d_ppo.py delete mode 100644 examples/mujoco_walker_2d_sac.py diff --git a/examples/mujoco_ant_ppo.py b/examples/mujoco_ant_ppo.py new file mode 100644 index 00000000..362ecaf5 --- /dev/null +++ b/examples/mujoco_ant_ppo.py @@ -0,0 +1,121 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim + +from mushroom_rl.algorithms.actor_critic import PPO +from mushroom_rl.core import Core, Logger +from mushroom_rl.environments.mujoco_envs.ant import Ant +from mushroom_rl.policy import GaussianTorchPolicy + +from tqdm import trange + + +class Network(nn.Module): + def __init__(self, input_shape, output_shape, n_features, **kwargs): + super(Network, self).__init__() + + n_input = input_shape[-1] + n_output = output_shape[0] + + self._h1 = nn.Linear(n_input, n_features) + self._h2 = nn.Linear(n_features, n_features) + self._h3 = nn.Linear(n_features, n_output) + + nn.init.xavier_uniform_( + self._h1.weight, gain=nn.init.calculate_gain("relu") / 10 + ) + nn.init.xavier_uniform_( + self._h2.weight, gain=nn.init.calculate_gain("relu") / 10 + ) + nn.init.xavier_uniform_( + self._h3.weight, gain=nn.init.calculate_gain("linear") / 10 + ) + + def forward(self, state, **kwargs): + features1 = F.relu(self._h1(torch.squeeze(state, 1).float())) + features2 = F.relu(self._h2(features1)) + a = self._h3(features2) + + return a + + +def experiment(n_epochs, n_steps, n_episodes_test, seed=0): + + np.random.seed(seed) + torch.manual_seed(seed) + + logger = Logger(PPO.__name__, results_dir=None) + logger.strong_line() + logger.info("Experiment Algorithm: " + PPO.__name__) + + mdp = Ant() + + actor_lr = 3e-4 + critic_lr = 3e-4 + n_features = 32 + batch_size = 64 + n_epochs_policy = 10 + eps = 0.2 + lam = 0.95 + std_0 = 1.0 + n_steps_per_fit = 2000 + + critic_params = dict( + network=Network, + optimizer={"class": optim.Adam, "params": {"lr": critic_lr}}, + loss=F.mse_loss, + n_features=n_features, + batch_size=batch_size, + input_shape=mdp.info.observation_space.shape, + output_shape=(1,), + ) + + alg_params = dict( + actor_optimizer={"class": optim.Adam, "params": {"lr": actor_lr}}, + 
n_epochs_policy=n_epochs_policy, + batch_size=batch_size, + eps_ppo=eps, + lam=lam, + critic_params=critic_params, + ) + + policy_params = dict(std_0=std_0, n_features=n_features) + + policy = GaussianTorchPolicy( + Network, + mdp.info.observation_space.shape, + mdp.info.action_space.shape, + **policy_params, + ) + + agent = PPO(mdp.info, policy, **alg_params) + + core = Core(agent, mdp) + + dataset = core.evaluate(n_episodes=n_episodes_test, render=False) + + J = np.mean(dataset.discounted_return) + R = np.mean(dataset.undiscounted_return) + E = agent.policy.entropy() + + logger.epoch_info(0, J=J, R=R, entropy=E) + + for it in trange(n_epochs, leave=False): + core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit) + dataset = core.evaluate(n_episodes=n_episodes_test, render=False) + + J = np.mean(dataset.discounted_return) + R = np.mean(dataset.undiscounted_return) + E = agent.policy.entropy() + + logger.epoch_info(it + 1, J=J, R=R, entropy=E) + + logger.info("Press a button to visualize") + input() + core.evaluate(n_episodes=5, render=True) + + +if __name__ == "__main__": + experiment(n_epochs=50, n_steps=30000, n_episodes_test=10) diff --git a/examples/mujoco_ant_sac.py b/examples/mujoco_ant_sac.py deleted file mode 100644 index e633add2..00000000 --- a/examples/mujoco_ant_sac.py +++ /dev/null @@ -1,159 +0,0 @@ -import numpy as np - -import torch -import torch.nn as nn -import torch.optim as optim -import torch.nn.functional as F - -from mushroom_rl.algorithms.actor_critic import SAC -from mushroom_rl.core import Core, Logger -from mushroom_rl.environments.mujoco_envs.ant import Ant - -from tqdm import trange - - -class CriticNetwork(nn.Module): - def __init__(self, input_shape, output_shape, n_features, **kwargs): - super().__init__() - - n_input = input_shape[-1] - n_output = output_shape[0] - - self._h1 = nn.Linear(n_input, n_features) - self._h2 = nn.Linear(n_features, n_features) - self._h3 = nn.Linear(n_features, n_output) - - nn.init.xavier_uniform_(self._h1.weight, gain=nn.init.calculate_gain("relu")) - nn.init.xavier_uniform_(self._h2.weight, gain=nn.init.calculate_gain("relu")) - nn.init.xavier_uniform_(self._h3.weight, gain=nn.init.calculate_gain("linear")) - - def forward(self, state, action): - state_action = torch.cat((state.float(), action.float()), dim=1) - features1 = F.relu(self._h1(state_action)) - features2 = F.relu(self._h2(features1)) - q = self._h3(features2) - - return torch.squeeze(q) - - -class ActorNetwork(nn.Module): - def __init__(self, input_shape, output_shape, n_features, **kwargs): - super(ActorNetwork, self).__init__() - - n_input = input_shape[-1] - n_output = output_shape[0] - - self._h1 = nn.Linear(n_input, n_features) - self._h2 = nn.Linear(n_features, n_features) - self._h3 = nn.Linear(n_features, n_output) - - nn.init.xavier_uniform_(self._h1.weight, gain=nn.init.calculate_gain("relu")) - nn.init.xavier_uniform_(self._h2.weight, gain=nn.init.calculate_gain("relu")) - nn.init.xavier_uniform_(self._h3.weight, gain=nn.init.calculate_gain("linear")) - - def forward(self, state): - features1 = F.relu(self._h1(torch.squeeze(state, 1).float())) - features2 = F.relu(self._h2(features1)) - a = self._h3(features2) - - return a - - -def experiment(alg, n_epochs, n_steps, n_episodes_test): - np.random.seed() - - logger = Logger(alg.__name__, results_dir=None) - logger.strong_line() - logger.info("Experiment Algorithm: " + alg.__name__) - - # MDP - mdp = Ant() - - # Settings - initial_replay_size = 5_000 - max_replay_size = 500_000 - batch_size = 256 - 
n_features = 256 - warmup_transitions = 10_000 - tau = 5e-3 - lr_alpha = 3e-4 - - # Approximator - actor_input_shape = mdp.info.observation_space.shape - actor_mu_params = dict( - network=ActorNetwork, - n_features=n_features, - input_shape=actor_input_shape, - output_shape=mdp.info.action_space.shape, - ) - actor_sigma_params = dict( - network=ActorNetwork, - n_features=n_features, - input_shape=actor_input_shape, - output_shape=mdp.info.action_space.shape, - ) - - actor_optimizer = {"class": optim.Adam, "params": {"lr": 1e-4}} - - critic_input_shape = (actor_input_shape[0] + mdp.info.action_space.shape[0],) - critic_params = dict( - network=CriticNetwork, - optimizer={"class": optim.Adam, "params": {"lr": 3e-4}}, - loss=F.mse_loss, - n_features=n_features, - input_shape=critic_input_shape, - output_shape=(1,), - ) - - # Agent - agent = alg( - mdp.info, - actor_mu_params, - actor_sigma_params, - actor_optimizer, - critic_params, - batch_size, - initial_replay_size, - max_replay_size, - warmup_transitions, - tau, - lr_alpha, - critic_fit_params=None, - ) - - # Algorithm - core = Core(agent, mdp) - - # RUN - dataset = core.evaluate(n_episodes=n_episodes_test, render=False) - - J = np.mean(dataset.discounted_return) - R = np.mean(dataset.undiscounted_return) - E = agent.policy.entropy(dataset.state) - - logger.epoch_info(0, J=J, R=R, entropy=E) - - core.learn( - n_steps=initial_replay_size, n_steps_per_fit=initial_replay_size, quiet=True - ) - - for n in trange(n_epochs, leave=False): - core.learn(n_steps=n_steps, n_steps_per_fit=1, quiet=True) - dataset = core.evaluate(n_episodes=n_episodes_test, render=False, quiet=True) - - J = np.mean(dataset.discounted_return) - R = np.mean(dataset.undiscounted_return) - E = agent.policy.entropy(dataset.state) - - logger.epoch_info(n + 1, J=J, R=R, entropy=E) - - logger.info("Press a button to visualize") - input() - core.evaluate(n_episodes=5, render=True) - - -if __name__ == "__main__": - algs = [SAC] - - for alg in algs: - experiment(alg=alg, n_epochs=50, n_steps=30000, n_episodes_test=10) diff --git a/examples/mujoco_half_cheetah_ppo.py b/examples/mujoco_half_cheetah_ppo.py new file mode 100644 index 00000000..643e09d3 --- /dev/null +++ b/examples/mujoco_half_cheetah_ppo.py @@ -0,0 +1,121 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim + +from mushroom_rl.algorithms.actor_critic import PPO +from mushroom_rl.core import Core, Logger +from mushroom_rl.environments.mujoco_envs.half_cheetah import HalfCheetah +from mushroom_rl.policy import GaussianTorchPolicy + +from tqdm import trange + + +class Network(nn.Module): + def __init__(self, input_shape, output_shape, n_features, **kwargs): + super(Network, self).__init__() + + n_input = input_shape[-1] + n_output = output_shape[0] + + self._h1 = nn.Linear(n_input, n_features) + self._h2 = nn.Linear(n_features, n_features) + self._h3 = nn.Linear(n_features, n_output) + + nn.init.xavier_uniform_( + self._h1.weight, gain=nn.init.calculate_gain("relu") / 10 + ) + nn.init.xavier_uniform_( + self._h2.weight, gain=nn.init.calculate_gain("relu") / 10 + ) + nn.init.xavier_uniform_( + self._h3.weight, gain=nn.init.calculate_gain("linear") / 10 + ) + + def forward(self, state, **kwargs): + features1 = F.relu(self._h1(torch.squeeze(state, 1).float())) + features2 = F.relu(self._h2(features1)) + a = self._h3(features2) + + return a + + +def experiment(n_epochs, n_steps, n_episodes_test, seed=0): + + np.random.seed(seed) + torch.manual_seed(seed) 
+ + logger = Logger(PPO.__name__, results_dir=None) + logger.strong_line() + logger.info("Experiment Algorithm: " + PPO.__name__) + + mdp = HalfCheetah() + + actor_lr = 3e-4 + critic_lr = 3e-4 + n_features = 32 + batch_size = 64 + n_epochs_policy = 10 + eps = 0.2 + lam = 0.95 + std_0 = 1.0 + n_steps_per_fit = 2000 + + critic_params = dict( + network=Network, + optimizer={"class": optim.Adam, "params": {"lr": critic_lr}}, + loss=F.mse_loss, + n_features=n_features, + batch_size=batch_size, + input_shape=mdp.info.observation_space.shape, + output_shape=(1,), + ) + + alg_params = dict( + actor_optimizer={"class": optim.Adam, "params": {"lr": actor_lr}}, + n_epochs_policy=n_epochs_policy, + batch_size=batch_size, + eps_ppo=eps, + lam=lam, + critic_params=critic_params, + ) + + policy_params = dict(std_0=std_0, n_features=n_features) + + policy = GaussianTorchPolicy( + Network, + mdp.info.observation_space.shape, + mdp.info.action_space.shape, + **policy_params, + ) + + agent = PPO(mdp.info, policy, **alg_params) + + core = Core(agent, mdp) + + dataset = core.evaluate(n_episodes=n_episodes_test, render=False) + + J = np.mean(dataset.discounted_return) + R = np.mean(dataset.undiscounted_return) + E = agent.policy.entropy() + + logger.epoch_info(0, J=J, R=R, entropy=E) + + for it in trange(n_epochs, leave=False): + core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit) + dataset = core.evaluate(n_episodes=n_episodes_test, render=False) + + J = np.mean(dataset.discounted_return) + R = np.mean(dataset.undiscounted_return) + E = agent.policy.entropy() + + logger.epoch_info(it + 1, J=J, R=R, entropy=E) + + logger.info("Press a button to visualize") + input() + core.evaluate(n_episodes=5, render=True) + + +if __name__ == "__main__": + experiment(n_epochs=50, n_steps=30000, n_episodes_test=10) diff --git a/examples/mujoco_half_cheetah_sac.py b/examples/mujoco_half_cheetah_sac.py deleted file mode 100644 index d3b56226..00000000 --- a/examples/mujoco_half_cheetah_sac.py +++ /dev/null @@ -1,159 +0,0 @@ -import numpy as np - -import torch -import torch.nn as nn -import torch.optim as optim -import torch.nn.functional as F - -from mushroom_rl.algorithms.actor_critic import SAC -from mushroom_rl.core import Core, Logger -from mushroom_rl.environments.mujoco_envs.half_cheetah import HalfCheetah - -from tqdm import trange - - -class CriticNetwork(nn.Module): - def __init__(self, input_shape, output_shape, n_features, **kwargs): - super().__init__() - - n_input = input_shape[-1] - n_output = output_shape[0] - - self._h1 = nn.Linear(n_input, n_features) - self._h2 = nn.Linear(n_features, n_features) - self._h3 = nn.Linear(n_features, n_output) - - nn.init.xavier_uniform_(self._h1.weight, gain=nn.init.calculate_gain("relu")) - nn.init.xavier_uniform_(self._h2.weight, gain=nn.init.calculate_gain("relu")) - nn.init.xavier_uniform_(self._h3.weight, gain=nn.init.calculate_gain("linear")) - - def forward(self, state, action): - state_action = torch.cat((state.float(), action.float()), dim=1) - features1 = F.relu(self._h1(state_action)) - features2 = F.relu(self._h2(features1)) - q = self._h3(features2) - - return torch.squeeze(q) - - -class ActorNetwork(nn.Module): - def __init__(self, input_shape, output_shape, n_features, **kwargs): - super(ActorNetwork, self).__init__() - - n_input = input_shape[-1] - n_output = output_shape[0] - - self._h1 = nn.Linear(n_input, n_features) - self._h2 = nn.Linear(n_features, n_features) - self._h3 = nn.Linear(n_features, n_output) - - nn.init.xavier_uniform_(self._h1.weight, 
gain=nn.init.calculate_gain("relu")) - nn.init.xavier_uniform_(self._h2.weight, gain=nn.init.calculate_gain("relu")) - nn.init.xavier_uniform_(self._h3.weight, gain=nn.init.calculate_gain("linear")) - - def forward(self, state): - features1 = F.relu(self._h1(torch.squeeze(state, 1).float())) - features2 = F.relu(self._h2(features1)) - a = self._h3(features2) - - return a - - -def experiment(alg, n_epochs, n_steps, n_episodes_test): - np.random.seed() - - logger = Logger(alg.__name__, results_dir=None) - logger.strong_line() - logger.info("Experiment Algorithm: " + alg.__name__) - - # MDP - mdp = HalfCheetah() - - # Settings - initial_replay_size = 5_000 - max_replay_size = 500_000 - batch_size = 256 - n_features = 256 - warmup_transitions = 10_000 - tau = 5e-3 - lr_alpha = 3e-4 - - # Approximator - actor_input_shape = mdp.info.observation_space.shape - actor_mu_params = dict( - network=ActorNetwork, - n_features=n_features, - input_shape=actor_input_shape, - output_shape=mdp.info.action_space.shape, - ) - actor_sigma_params = dict( - network=ActorNetwork, - n_features=n_features, - input_shape=actor_input_shape, - output_shape=mdp.info.action_space.shape, - ) - - actor_optimizer = {"class": optim.Adam, "params": {"lr": 1e-4}} - - critic_input_shape = (actor_input_shape[0] + mdp.info.action_space.shape[0],) - critic_params = dict( - network=CriticNetwork, - optimizer={"class": optim.Adam, "params": {"lr": 3e-4}}, - loss=F.mse_loss, - n_features=n_features, - input_shape=critic_input_shape, - output_shape=(1,), - ) - - # Agent - agent = alg( - mdp.info, - actor_mu_params, - actor_sigma_params, - actor_optimizer, - critic_params, - batch_size, - initial_replay_size, - max_replay_size, - warmup_transitions, - tau, - lr_alpha, - critic_fit_params=None, - ) - - # Algorithm - core = Core(agent, mdp) - - # RUN - dataset = core.evaluate(n_episodes=n_episodes_test, render=False) - - J = np.mean(dataset.discounted_return) - R = np.mean(dataset.undiscounted_return) - E = agent.policy.entropy(dataset.state) - - logger.epoch_info(0, J=J, R=R, entropy=E) - - core.learn( - n_steps=initial_replay_size, n_steps_per_fit=initial_replay_size, quiet=True - ) - - for n in trange(n_epochs, leave=False): - core.learn(n_steps=n_steps, n_steps_per_fit=1, quiet=True) - dataset = core.evaluate(n_episodes=n_episodes_test, render=False, quiet=True) - - J = np.mean(dataset.discounted_return) - R = np.mean(dataset.undiscounted_return) - E = agent.policy.entropy(dataset.state) - - logger.epoch_info(n + 1, J=J, R=R, entropy=E) - - logger.info("Press a button to visualize") - input() - core.evaluate(n_episodes=5, render=True) - - -if __name__ == "__main__": - algs = [SAC] - - for alg in algs: - experiment(alg=alg, n_epochs=50, n_steps=30_000, n_episodes_test=10) diff --git a/examples/mujoco_hopper_ppo.py b/examples/mujoco_hopper_ppo.py new file mode 100644 index 00000000..cabcf636 --- /dev/null +++ b/examples/mujoco_hopper_ppo.py @@ -0,0 +1,121 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim + +from mushroom_rl.algorithms.actor_critic import PPO +from mushroom_rl.core import Core, Logger +from mushroom_rl.environments.mujoco_envs.hopper import Hopper +from mushroom_rl.policy import GaussianTorchPolicy + +from tqdm import trange + + +class Network(nn.Module): + def __init__(self, input_shape, output_shape, n_features, **kwargs): + super(Network, self).__init__() + + n_input = input_shape[-1] + n_output = output_shape[0] + + self._h1 = nn.Linear(n_input, 
n_features) + self._h2 = nn.Linear(n_features, n_features) + self._h3 = nn.Linear(n_features, n_output) + + nn.init.xavier_uniform_( + self._h1.weight, gain=nn.init.calculate_gain("relu") / 10 + ) + nn.init.xavier_uniform_( + self._h2.weight, gain=nn.init.calculate_gain("relu") / 10 + ) + nn.init.xavier_uniform_( + self._h3.weight, gain=nn.init.calculate_gain("linear") / 10 + ) + + def forward(self, state, **kwargs): + features1 = F.relu(self._h1(torch.squeeze(state, 1).float())) + features2 = F.relu(self._h2(features1)) + a = self._h3(features2) + + return a + + +def experiment(n_epochs, n_steps, n_episodes_test, seed=0): + + np.random.seed(seed) + torch.manual_seed(seed) + + logger = Logger(PPO.__name__, results_dir=None) + logger.strong_line() + logger.info("Experiment Algorithm: " + PPO.__name__) + + mdp = Hopper() + + actor_lr = 3e-4 + critic_lr = 3e-4 + n_features = 32 + batch_size = 64 + n_epochs_policy = 10 + eps = 0.2 + lam = 0.95 + std_0 = 1.0 + n_steps_per_fit = 2000 + + critic_params = dict( + network=Network, + optimizer={"class": optim.Adam, "params": {"lr": critic_lr}}, + loss=F.mse_loss, + n_features=n_features, + batch_size=batch_size, + input_shape=mdp.info.observation_space.shape, + output_shape=(1,), + ) + + alg_params = dict( + actor_optimizer={"class": optim.Adam, "params": {"lr": actor_lr}}, + n_epochs_policy=n_epochs_policy, + batch_size=batch_size, + eps_ppo=eps, + lam=lam, + critic_params=critic_params, + ) + + policy_params = dict(std_0=std_0, n_features=n_features) + + policy = GaussianTorchPolicy( + Network, + mdp.info.observation_space.shape, + mdp.info.action_space.shape, + **policy_params, + ) + + agent = PPO(mdp.info, policy, **alg_params) + + core = Core(agent, mdp) + + dataset = core.evaluate(n_episodes=n_episodes_test, render=False) + + J = np.mean(dataset.discounted_return) + R = np.mean(dataset.undiscounted_return) + E = agent.policy.entropy() + + logger.epoch_info(0, J=J, R=R, entropy=E) + + for it in trange(n_epochs, leave=False): + core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit) + dataset = core.evaluate(n_episodes=n_episodes_test, render=False) + + J = np.mean(dataset.discounted_return) + R = np.mean(dataset.undiscounted_return) + E = agent.policy.entropy() + + logger.epoch_info(it + 1, J=J, R=R, entropy=E) + + logger.info("Press a button to visualize") + input() + core.evaluate(n_episodes=5, render=True) + + +if __name__ == "__main__": + experiment(n_epochs=50, n_steps=30000, n_episodes_test=10) diff --git a/examples/mujoco_hopper_sac.py b/examples/mujoco_hopper_sac.py deleted file mode 100644 index 6cf3a995..00000000 --- a/examples/mujoco_hopper_sac.py +++ /dev/null @@ -1,159 +0,0 @@ -import numpy as np - -import torch -import torch.nn as nn -import torch.optim as optim -import torch.nn.functional as F - -from mushroom_rl.algorithms.actor_critic import SAC -from mushroom_rl.core import Core, Logger -from mushroom_rl.environments.mujoco_envs.hopper import Hopper - -from tqdm import trange - - -class CriticNetwork(nn.Module): - def __init__(self, input_shape, output_shape, n_features, **kwargs): - super().__init__() - - n_input = input_shape[-1] - n_output = output_shape[0] - - self._h1 = nn.Linear(n_input, n_features) - self._h2 = nn.Linear(n_features, n_features) - self._h3 = nn.Linear(n_features, n_output) - - nn.init.xavier_uniform_(self._h1.weight, gain=nn.init.calculate_gain("relu")) - nn.init.xavier_uniform_(self._h2.weight, gain=nn.init.calculate_gain("relu")) - nn.init.xavier_uniform_(self._h3.weight, 
gain=nn.init.calculate_gain("linear")) - - def forward(self, state, action): - state_action = torch.cat((state.float(), action.float()), dim=1) - features1 = F.relu(self._h1(state_action)) - features2 = F.relu(self._h2(features1)) - q = self._h3(features2) - - return torch.squeeze(q) - - -class ActorNetwork(nn.Module): - def __init__(self, input_shape, output_shape, n_features, **kwargs): - super(ActorNetwork, self).__init__() - - n_input = input_shape[-1] - n_output = output_shape[0] - - self._h1 = nn.Linear(n_input, n_features) - self._h2 = nn.Linear(n_features, n_features) - self._h3 = nn.Linear(n_features, n_output) - - nn.init.xavier_uniform_(self._h1.weight, gain=nn.init.calculate_gain("relu")) - nn.init.xavier_uniform_(self._h2.weight, gain=nn.init.calculate_gain("relu")) - nn.init.xavier_uniform_(self._h3.weight, gain=nn.init.calculate_gain("linear")) - - def forward(self, state): - features1 = F.relu(self._h1(torch.squeeze(state, 1).float())) - features2 = F.relu(self._h2(features1)) - a = self._h3(features2) - - return a - - -def experiment(alg, n_epochs, n_steps, n_episodes_test): - np.random.seed() - - logger = Logger(alg.__name__, results_dir=None) - logger.strong_line() - logger.info("Experiment Algorithm: " + alg.__name__) - - # MDP - mdp = Hopper() - - # Settings - initial_replay_size = 5_000 - max_replay_size = 500_000 - batch_size = 256 - n_features = 256 - warmup_transitions = 10_000 - tau = 5e-3 - lr_alpha = 3e-4 - - # Approximator - actor_input_shape = mdp.info.observation_space.shape - actor_mu_params = dict( - network=ActorNetwork, - n_features=n_features, - input_shape=actor_input_shape, - output_shape=mdp.info.action_space.shape, - ) - actor_sigma_params = dict( - network=ActorNetwork, - n_features=n_features, - input_shape=actor_input_shape, - output_shape=mdp.info.action_space.shape, - ) - - actor_optimizer = {"class": optim.Adam, "params": {"lr": 1e-4}} - - critic_input_shape = (actor_input_shape[0] + mdp.info.action_space.shape[0],) - critic_params = dict( - network=CriticNetwork, - optimizer={"class": optim.Adam, "params": {"lr": 3e-4}}, - loss=F.mse_loss, - n_features=n_features, - input_shape=critic_input_shape, - output_shape=(1,), - ) - - # Agent - agent = alg( - mdp.info, - actor_mu_params, - actor_sigma_params, - actor_optimizer, - critic_params, - batch_size, - initial_replay_size, - max_replay_size, - warmup_transitions, - tau, - lr_alpha, - critic_fit_params=None, - ) - - # Algorithm - core = Core(agent, mdp) - - # RUN - dataset = core.evaluate(n_episodes=n_episodes_test, render=False) - - J = np.mean(dataset.discounted_return) - R = np.mean(dataset.undiscounted_return) - E = agent.policy.entropy(dataset.state) - - logger.epoch_info(0, J=J, R=R, entropy=E) - - core.learn( - n_steps=initial_replay_size, n_steps_per_fit=initial_replay_size, quiet=True - ) - - for n in trange(n_epochs, leave=False): - core.learn(n_steps=n_steps, n_steps_per_fit=1, quiet=True) - dataset = core.evaluate(n_episodes=n_episodes_test, render=False, quiet=True) - - J = np.mean(dataset.discounted_return) - R = np.mean(dataset.undiscounted_return) - E = agent.policy.entropy(dataset.state) - - logger.epoch_info(n + 1, J=J, R=R, entropy=E) - - logger.info("Press a button to visualize") - input() - core.evaluate(n_episodes=5, render=True) - - -if __name__ == "__main__": - algs = [SAC] - - for alg in algs: - experiment(alg=alg, n_epochs=50, n_steps=30000, n_episodes_test=10) diff --git a/examples/mujoco_walker_2d_ppo.py b/examples/mujoco_walker_2d_ppo.py new file mode 100644 index 
00000000..bfcb2eca --- /dev/null +++ b/examples/mujoco_walker_2d_ppo.py @@ -0,0 +1,121 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim + +from mushroom_rl.algorithms.actor_critic import PPO +from mushroom_rl.core import Core, Logger +from mushroom_rl.environments.mujoco_envs.walker_2d import Walker2D +from mushroom_rl.policy import GaussianTorchPolicy + +from tqdm import trange + + +class Network(nn.Module): + def __init__(self, input_shape, output_shape, n_features, **kwargs): + super(Network, self).__init__() + + n_input = input_shape[-1] + n_output = output_shape[0] + + self._h1 = nn.Linear(n_input, n_features) + self._h2 = nn.Linear(n_features, n_features) + self._h3 = nn.Linear(n_features, n_output) + + nn.init.xavier_uniform_( + self._h1.weight, gain=nn.init.calculate_gain("relu") / 10 + ) + nn.init.xavier_uniform_( + self._h2.weight, gain=nn.init.calculate_gain("relu") / 10 + ) + nn.init.xavier_uniform_( + self._h3.weight, gain=nn.init.calculate_gain("linear") / 10 + ) + + def forward(self, state, **kwargs): + features1 = F.relu(self._h1(torch.squeeze(state, 1).float())) + features2 = F.relu(self._h2(features1)) + a = self._h3(features2) + + return a + + +def experiment(n_epochs, n_steps, n_episodes_test, seed=0): + + np.random.seed(seed) + torch.manual_seed(seed) + + logger = Logger(PPO.__name__, results_dir=None) + logger.strong_line() + logger.info("Experiment Algorithm: " + PPO.__name__) + + mdp = Walker2D() + + actor_lr = 3e-4 + critic_lr = 3e-4 + n_features = 32 + batch_size = 64 + n_epochs_policy = 10 + eps = 0.2 + lam = 0.95 + std_0 = 1.0 + n_steps_per_fit = 2000 + + critic_params = dict( + network=Network, + optimizer={"class": optim.Adam, "params": {"lr": critic_lr}}, + loss=F.mse_loss, + n_features=n_features, + batch_size=batch_size, + input_shape=mdp.info.observation_space.shape, + output_shape=(1,), + ) + + alg_params = dict( + actor_optimizer={"class": optim.Adam, "params": {"lr": actor_lr}}, + n_epochs_policy=n_epochs_policy, + batch_size=batch_size, + eps_ppo=eps, + lam=lam, + critic_params=critic_params, + ) + + policy_params = dict(std_0=std_0, n_features=n_features) + + policy = GaussianTorchPolicy( + Network, + mdp.info.observation_space.shape, + mdp.info.action_space.shape, + **policy_params, + ) + + agent = PPO(mdp.info, policy, **alg_params) + + core = Core(agent, mdp) + + dataset = core.evaluate(n_episodes=n_episodes_test, render=False) + + J = np.mean(dataset.discounted_return) + R = np.mean(dataset.undiscounted_return) + E = agent.policy.entropy() + + logger.epoch_info(0, J=J, R=R, entropy=E) + + for it in trange(n_epochs, leave=False): + core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit) + dataset = core.evaluate(n_episodes=n_episodes_test, render=False) + + J = np.mean(dataset.discounted_return) + R = np.mean(dataset.undiscounted_return) + E = agent.policy.entropy() + + logger.epoch_info(it + 1, J=J, R=R, entropy=E) + + logger.info("Press a button to visualize") + input() + core.evaluate(n_episodes=5, render=True) + + +if __name__ == "__main__": + experiment(n_epochs=50, n_steps=30000, n_episodes_test=10) diff --git a/examples/mujoco_walker_2d_sac.py b/examples/mujoco_walker_2d_sac.py deleted file mode 100644 index 793a0d72..00000000 --- a/examples/mujoco_walker_2d_sac.py +++ /dev/null @@ -1,159 +0,0 @@ -import numpy as np - -import torch -import torch.nn as nn -import torch.optim as optim -import torch.nn.functional as F - -from mushroom_rl.algorithms.actor_critic import 
SAC -from mushroom_rl.core import Core, Logger -from mushroom_rl.environments.mujoco_envs.walker_2d import Walker2D - -from tqdm import trange - - -class CriticNetwork(nn.Module): - def __init__(self, input_shape, output_shape, n_features, **kwargs): - super().__init__() - - n_input = input_shape[-1] - n_output = output_shape[0] - - self._h1 = nn.Linear(n_input, n_features) - self._h2 = nn.Linear(n_features, n_features) - self._h3 = nn.Linear(n_features, n_output) - - nn.init.xavier_uniform_(self._h1.weight, gain=nn.init.calculate_gain("relu")) - nn.init.xavier_uniform_(self._h2.weight, gain=nn.init.calculate_gain("relu")) - nn.init.xavier_uniform_(self._h3.weight, gain=nn.init.calculate_gain("linear")) - - def forward(self, state, action): - state_action = torch.cat((state.float(), action.float()), dim=1) - features1 = F.relu(self._h1(state_action)) - features2 = F.relu(self._h2(features1)) - q = self._h3(features2) - - return torch.squeeze(q) - - -class ActorNetwork(nn.Module): - def __init__(self, input_shape, output_shape, n_features, **kwargs): - super(ActorNetwork, self).__init__() - - n_input = input_shape[-1] - n_output = output_shape[0] - - self._h1 = nn.Linear(n_input, n_features) - self._h2 = nn.Linear(n_features, n_features) - self._h3 = nn.Linear(n_features, n_output) - - nn.init.xavier_uniform_(self._h1.weight, gain=nn.init.calculate_gain("relu")) - nn.init.xavier_uniform_(self._h2.weight, gain=nn.init.calculate_gain("relu")) - nn.init.xavier_uniform_(self._h3.weight, gain=nn.init.calculate_gain("linear")) - - def forward(self, state): - features1 = F.relu(self._h1(torch.squeeze(state, 1).float())) - features2 = F.relu(self._h2(features1)) - a = self._h3(features2) - - return a - - -def experiment(alg, n_epochs, n_steps, n_episodes_test): - np.random.seed() - - logger = Logger(alg.__name__, results_dir=None) - logger.strong_line() - logger.info("Experiment Algorithm: " + alg.__name__) - - # MDP - mdp = Walker2D() - - # Settings - initial_replay_size = 5_000 - max_replay_size = 500_000 - batch_size = 256 - n_features = 256 - warmup_transitions = 10_000 - tau = 5e-3 - lr_alpha = 3e-4 - - # Approximator - actor_input_shape = mdp.info.observation_space.shape - actor_mu_params = dict( - network=ActorNetwork, - n_features=n_features, - input_shape=actor_input_shape, - output_shape=mdp.info.action_space.shape, - ) - actor_sigma_params = dict( - network=ActorNetwork, - n_features=n_features, - input_shape=actor_input_shape, - output_shape=mdp.info.action_space.shape, - ) - - actor_optimizer = {"class": optim.Adam, "params": {"lr": 1e-4}} - - critic_input_shape = (actor_input_shape[0] + mdp.info.action_space.shape[0],) - critic_params = dict( - network=CriticNetwork, - optimizer={"class": optim.Adam, "params": {"lr": 3e-4}}, - loss=F.mse_loss, - n_features=n_features, - input_shape=critic_input_shape, - output_shape=(1,), - ) - - # Agent - agent = alg( - mdp.info, - actor_mu_params, - actor_sigma_params, - actor_optimizer, - critic_params, - batch_size, - initial_replay_size, - max_replay_size, - warmup_transitions, - tau, - lr_alpha, - critic_fit_params=None, - ) - - # Algorithm - core = Core(agent, mdp) - - # RUN - dataset = core.evaluate(n_episodes=n_episodes_test, render=False) - - J = np.mean(dataset.discounted_return) - R = np.mean(dataset.undiscounted_return) - E = agent.policy.entropy(dataset.state) - - logger.epoch_info(0, J=J, R=R, entropy=E) - - core.learn( - n_steps=initial_replay_size, n_steps_per_fit=initial_replay_size, quiet=True - ) - - for n in trange(n_epochs, 
leave=False): - core.learn(n_steps=n_steps, n_steps_per_fit=1, quiet=True) - dataset = core.evaluate(n_episodes=n_episodes_test, render=False, quiet=True) - - J = np.mean(dataset.discounted_return) - R = np.mean(dataset.undiscounted_return) - E = agent.policy.entropy(dataset.state) - - logger.epoch_info(n + 1, J=J, R=R, entropy=E) - - logger.info("Press a button to visualize") - input() - core.evaluate(n_episodes=5, render=True) - - -if __name__ == "__main__": - algs = [SAC] - - for alg in algs: - experiment(alg=alg, n_epochs=50, n_steps=30000, n_episodes_test=10) diff --git a/mushroom_rl/environments/mujoco.py b/mushroom_rl/environments/mujoco.py index e76bec38..a75adf2c 100644 --- a/mushroom_rl/environments/mujoco.py +++ b/mushroom_rl/environments/mujoco.py @@ -11,9 +11,21 @@ class MuJoCo(Environment): Class to create a Mushroom environment using the MuJoCo simulator. """ - def __init__(self, xml_file, actuation_spec, observation_spec, gamma, horizon, timestep=None, n_substeps=1, - n_intermediate_steps=1, additional_data_spec=None, collision_groups=None, max_joint_vel=None, - **viewer_params): + def __init__( + self, + xml_file, + actuation_spec, + observation_spec, + gamma, + horizon, + timestep=None, + n_substeps=1, + n_intermediate_steps=1, + additional_data_spec=None, + collision_groups=None, + max_joint_vel=None, + **viewer_params, + ): """ Constructor. @@ -66,13 +78,17 @@ def __init__(self, xml_file, actuation_spec, observation_spec, gamma, horizon, t # Read the actuation spec and build the mapping between actions and ids # as well as their limits - self._action_indices = self.get_action_indices(self._model, self._data, actuation_spec) + self._action_indices = self.get_action_indices( + self._model, self._data, actuation_spec + ) action_space = self.get_action_space(self._action_indices, self._model) # Read the observation spec to build a mapping at every step. It is # ensured that the values appear in the order they are specified. - self.obs_helper = ObservationHelper(observation_spec, self._model, self._data, max_joint_velocity=max_joint_vel) + self.obs_helper = ObservationHelper( + observation_spec, self._model, self._data, max_joint_velocity=max_joint_vel + ) observation_space = Box(*self.obs_helper.get_obs_limits()) @@ -89,8 +105,12 @@ def __init__(self, xml_file, actuation_spec, observation_spec, gamma, horizon, t for name, geom_names in collision_groups: col_group = list() for geom_name in geom_names: - mj_id = mujoco.mj_name2id(self._model, mujoco.mjtObj.mjOBJ_GEOM, geom_name) - assert mj_id != -1, f"geom \"{geom_name}\" not found! Can't be used for collision-checking." + mj_id = mujoco.mj_name2id( + self._model, mujoco.mjtObj.mjOBJ_GEOM, geom_name + ) + assert ( + mj_id != -1 + ), f'geom "{geom_name}" not found! Can\'t be used for collision-checking.' 
col_group.append(mj_id) self.collision_groups[name] = set(col_group) @@ -119,7 +139,9 @@ def reset(self, obs=None): mujoco.mj_resetData(self._model, self._data) self.setup(obs) - self._obs = self._create_observation(self.obs_helper._build_obs(self._model, self._data)) + self._obs = self._create_observation( + self.obs_helper._build_obs(self._model, self._data) + ) return self._modify_observation(self._obs), {} def step(self, action): @@ -144,16 +166,20 @@ def step(self, action): self._simulation_post_step() if self._recompute_action_per_step: - cur_obs = self._create_observation(self.obs_helper._build_obs(self._model, self._data)) + cur_obs = self._create_observation( + self.obs_helper._build_obs(self._model, self._data) + ) if not self._recompute_action_per_step: - cur_obs = self._create_observation(self.obs_helper._build_obs(self._model, self._data)) + cur_obs = self._create_observation( + self.obs_helper._build_obs(self._model, self._data) + ) self._step_finalize() absorbing = self.is_absorbing(cur_obs) reward = self.reward(self._obs, action, cur_obs, absorbing) - info = self._create_info_dictionary(cur_obs) + info = self._create_info_dictionary(cur_obs, action) self._obs = cur_obs @@ -161,7 +187,9 @@ def step(self, action): def render(self, record=False): if self._viewer is None: - self._viewer = MujocoViewer(self._model, self.dt, record=record, **self._viewer_params) + self._viewer = MujocoViewer( + self._model, self.dt, record=record, **self._viewer_params + ) return self._viewer.render(self._data, record) @@ -199,12 +227,13 @@ def _create_observation(self, obs): """ return obs - def _create_info_dictionary(self, obs): + def _create_info_dictionary(self, obs, action): """ This method can be overridden to create a custom info dictionary. Args: obs (np.ndarray): the generated observation + action (np.ndarray): the action taken Returns: The information dictionary. @@ -305,7 +334,9 @@ def _read_data(self, name): """ data_id, otype = self.additional_data[name] - return np.array(self.obs_helper.get_state(self._model, self._data, data_id, otype)) + return np.array( + self.obs_helper.get_state(self._model, self._data, data_id, otype) + ) def _write_data(self, name, value): """ @@ -324,7 +355,9 @@ def _write_data(self, name, value): elif otype == ObservationType.JOINT_VEL: self._data.joint(data_id).qvel = value else: - data_buffer = self.obs_helper.get_state(self._model, self._data, data_id, otype) + data_buffer = self.obs_helper.get_state( + self._model, self._data, data_id, otype + ) data_buffer[:] = value def _check_collision(self, group1, group2): @@ -381,11 +414,14 @@ def _get_collision_force(self, group1, group2): for con_i in range(0, self._data.ncon): con = self._data.contact[con_i] - if (con.geom1 in ids1 and con.geom2 in ids2 or - con.geom1 in ids2 and con.geom2 in ids1): + if ( + con.geom1 in ids1 + and con.geom2 in ids2 + or con.geom1 in ids2 + and con.geom2 in ids1 + ): - mujoco.mj_contactForce(self._model, self._data, - con_i, c_array) + mujoco.mj_contactForce(self._model, self._data, con_i, c_array) return c_array return c_array @@ -478,7 +514,7 @@ def get_action_space(action_indices, model): Returns: A bounding box for the action space. - """ + """ low = [] high = [] for index in action_indices: @@ -500,14 +536,14 @@ def user_warning_raise_exception(warning): warning: Mujoco warning. 
""" - if 'Pre-allocated constraint buffer is full' in warning: - raise RuntimeError(warning + 'Increase njmax in mujoco XML') - elif 'Pre-allocated contact buffer is full' in warning: - raise RuntimeError(warning + 'Increase njconmax in mujoco XML') - elif 'Unknown warning type' in warning: - raise RuntimeError(warning + 'Check for NaN in simulation.') + if "Pre-allocated constraint buffer is full" in warning: + raise RuntimeError(warning + "Increase njmax in mujoco XML") + elif "Pre-allocated contact buffer is full" in warning: + raise RuntimeError(warning + "Increase njconmax in mujoco XML") + elif "Unknown warning type" in warning: + raise RuntimeError(warning + "Check for NaN in simulation.") else: - raise RuntimeError('Got MuJoCo Warning: ' + warning) + raise RuntimeError("Got MuJoCo Warning: " + warning) @staticmethod def load_model(xml_file): @@ -523,8 +559,9 @@ def load_model(xml_file): """ if type(xml_file) == mjcf.element.RootElement: # load from xml handle - model = mujoco.MjModel.from_xml_string(xml=xml_file.to_xml_string(), - assets=xml_file.get_assets()) + model = mujoco.MjModel.from_xml_string( + xml=xml_file.to_xml_string(), assets=xml_file.get_assets() + ) elif type(xml_file) == str: # load from path model = mujoco.MjModel.from_xml_path(xml_file) @@ -542,9 +579,22 @@ class MultiMuJoCo(MuJoCo): """ - def __init__(self, xml_files, actuation_spec, observation_spec, gamma, horizon, timestep=None, - n_substeps=1, n_intermediate_steps=1, additional_data_spec=None, collision_groups=None, - max_joint_vel=None, random_env_reset=True, **viewer_params): + def __init__( + self, + xml_files, + actuation_spec, + observation_spec, + gamma, + horizon, + timestep=None, + n_substeps=1, + n_intermediate_steps=1, + additional_data_spec=None, + collision_groups=None, + max_joint_vel=None, + random_env_reset=True, + **viewer_params, + ): """ Constructor. @@ -603,7 +653,9 @@ def __init__(self, xml_files, actuation_spec, observation_spec, gamma, horizon, # Read the actuation spec and build the mapping between actions and ids # as well as their limits - self._action_indices = self.get_action_indices(self._model, self._data, actuation_spec) + self._action_indices = self.get_action_indices( + self._model, self._data, actuation_spec + ) action_space = self.get_action_space(self._action_indices, self._model) @@ -611,17 +663,27 @@ def __init__(self, xml_files, actuation_spec, observation_spec, gamma, horizon, for m, d in zip(self._models, self._datas): action_ind = self.get_action_indices(m, d, actuation_spec) action_sp = self.get_action_space(action_ind, m) - if not np.array_equal(action_ind, self._action_indices) or \ - not np.array_equal(action_space.low, action_sp.low) or\ - not np.array_equal(action_space.high, action_sp.high): - raise ValueError("The provided environments differ in the their action spaces. " - "This is not allowed.") + if ( + not np.array_equal(action_ind, self._action_indices) + or not np.array_equal(action_space.low, action_sp.low) + or not np.array_equal(action_space.high, action_sp.high) + ): + raise ValueError( + "The provided environments differ in the their action spaces. " + "This is not allowed." + ) # Read the observation spec to build a mapping at every step. It is # ensured that the values appear in the order they are specified. 
- self.obs_helpers = [ObservationHelper(observation_spec, self._model, self._data, - max_joint_velocity=max_joint_vel) - for m, d in zip(self._models, self._datas)] + self.obs_helpers = [ + ObservationHelper( + observation_spec, + self._model, + self._data, + max_joint_velocity=max_joint_vel, + ) + for m, d in zip(self._models, self._datas) + ] self.obs_helper = self.obs_helpers[self._current_model_idx] observation_space = Box(*self.obs_helper.get_obs_limits()) @@ -629,9 +691,13 @@ def __init__(self, xml_files, actuation_spec, observation_spec, gamma, horizon, # multi envs with different obs limits are now allowed, do sanity check for oh in self.obs_helpers: low, high = self.obs_helper.get_obs_limits() - if not np.array_equal(low, observation_space.low) or not np.array_equal(high, observation_space.high): - raise ValueError("The provided environments differ in the their observation limits. " - "This is not allowed.") + if not np.array_equal(low, observation_space.low) or not np.array_equal( + high, observation_space.high + ): + raise ValueError( + "The provided environments differ in the their observation limits. " + "This is not allowed." + ) # Pre-process the additional data to allow easier writing and reading # to and from arrays in MuJoCo @@ -646,8 +712,12 @@ def __init__(self, xml_files, actuation_spec, observation_spec, gamma, horizon, for name, geom_names in collision_groups: col_group = list() for geom_name in geom_names: - mj_id = mujoco.mj_name2id(self._model, mujoco.mjtObj.mjOBJ_GEOM, geom_name) - assert mj_id != -1, f"geom \"{geom_name}\" not found! Can't be used for collision-checking." + mj_id = mujoco.mj_name2id( + self._model, mujoco.mjtObj.mjOBJ_GEOM, geom_name + ) + assert ( + mj_id != -1 + ), f'geom "{geom_name}" not found! Can\'t be used for collision-checking.' 
col_group.append(mj_id) self.collision_groups[name] = set(col_group) @@ -676,8 +746,11 @@ def reset(self, obs=None): if self._random_env_reset: self._current_model_idx = np.random.randint(0, len(self._models)) else: - self._current_model_idx = self._current_model_idx + 1 \ - if self._current_model_idx < len(self._models) - 1 else 0 + self._current_model_idx = ( + self._current_model_idx + 1 + if self._current_model_idx < len(self._models) - 1 + else 0 + ) self._model = self._models[self._current_model_idx] self._data = self._datas[self._current_model_idx] @@ -687,7 +760,9 @@ def reset(self, obs=None): if self._viewer is not None and self.more_than_one_env: self._viewer.load_new_model(self._model) - self._obs = self._create_observation(self.obs_helper._build_obs(self._model, self._data)) + self._obs = self._create_observation( + self.obs_helper._build_obs(self._model, self._data) + ) return self._modify_observation(self._obs) @property @@ -708,11 +783,11 @@ def _get_env_id_map(current_model_idx, n_models): """ n_models = np.maximum(n_models, 2) - bits_needed = 1+int(np.log((n_models-1))/np.log(2)) + bits_needed = 1 + int(np.log((n_models - 1)) / np.log(2)) id_mask = np.zeros(bits_needed) bin_rep = np.binary_repr(current_model_idx)[::-1] for i, b in enumerate(bin_rep): - idx = bits_needed - 1 - i # reverse idx + idx = bits_needed - 1 - i # reverse idx if int(b): id_mask[idx] = 1.0 else: diff --git a/mushroom_rl/environments/mujoco_envs/air_hockey/double.py b/mushroom_rl/environments/mujoco_envs/air_hockey/double.py index 27308b08..a82f4c7e 100644 --- a/mushroom_rl/environments/mujoco_envs/air_hockey/double.py +++ b/mushroom_rl/environments/mujoco_envs/air_hockey/double.py @@ -5,21 +5,38 @@ from mushroom_rl.environments.mujoco_envs.air_hockey.base import AirHockeyBase from mushroom_rl.utils.mujoco import forward_kinematics + class AirHockeyDouble(AirHockeyBase): """ Base class for two agents air hockey tasks. """ - def __init__(self, gamma=0.99, horizon=120, env_noise=False, obs_noise=False, timestep=1 / 240., - n_intermediate_steps=1, **viewer_params): + def __init__( + self, + gamma=0.99, + horizon=120, + env_noise=False, + obs_noise=False, + timestep=1 / 240.0, + n_intermediate_steps=1, + **viewer_params, + ): """ Constructor. 
""" self.init_state = np.array([-0.9273, 0.9273, np.pi / 2]) - super().__init__(gamma=gamma, horizon=horizon, env_noise=env_noise, n_agents=2, obs_noise=obs_noise, - timestep=timestep, n_intermediate_steps=n_intermediate_steps, **viewer_params) + super().__init__( + gamma=gamma, + horizon=horizon, + env_noise=env_noise, + n_agents=2, + obs_noise=obs_noise, + timestep=timestep, + n_intermediate_steps=n_intermediate_steps, + **viewer_params, + ) # Remove z position and quaternion from puck pos self.obs_helper.remove_obs("puck_pos", 2) @@ -68,14 +85,26 @@ def get_ee(self, robot=1): def _modify_observation(self, obs): new_obs = obs.copy() - self._puck_2d_in_robot_frame(self.obs_helper.get_from_obs(new_obs, "puck_pos"), self.agents[0]["frame"]) - - self._puck_2d_in_robot_frame(self.obs_helper.get_from_obs(new_obs, "puck_vel"), self.agents[0]["frame"], type='vel') - - self._puck_2d_in_robot_frame(self.obs_helper.get_from_obs(new_obs, "robot_2/puck_pos"), self.agents[1]["frame"]) - - self._puck_2d_in_robot_frame(self.obs_helper.get_from_obs(new_obs, "robot_2/puck_vel"), self.agents[1]["frame"], - type='vel') + self._puck_2d_in_robot_frame( + self.obs_helper.get_from_obs(new_obs, "puck_pos"), self.agents[0]["frame"] + ) + + self._puck_2d_in_robot_frame( + self.obs_helper.get_from_obs(new_obs, "puck_vel"), + self.agents[0]["frame"], + type="vel", + ) + + self._puck_2d_in_robot_frame( + self.obs_helper.get_from_obs(new_obs, "robot_2/puck_pos"), + self.agents[1]["frame"], + ) + + self._puck_2d_in_robot_frame( + self.obs_helper.get_from_obs(new_obs, "robot_2/puck_vel"), + self.agents[1]["frame"], + type="vel", + ) if self.obs_noise: noise = np.random.randn(2) * 0.001 @@ -93,10 +122,14 @@ def setup(self, obs): self.has_bounce = False for i in range(3): - self._data.joint("planar_robot_1/joint_" + str(i+1)).qpos = self.init_state[i] + self._data.joint("planar_robot_1/joint_" + str(i + 1)).qpos = ( + self.init_state[i] + ) for i in range(3): - self._data.joint("planar_robot_2/joint_" + str(i+1)).qpos = self.init_state[i] + self._data.joint("planar_robot_2/joint_" + str(i + 1)).qpos = ( + self.init_state[i] + ) super().setup(obs) mujoco.mj_fwdPosition(self._model, self._data) @@ -115,15 +148,17 @@ def _create_observation(self, state): obs = super(AirHockeyDouble, self)._create_observation(state) return np.append(obs, [self.robot_1_hit, self.robot_2_hit, self.has_bounce]) - def _create_info_dictionary(self, obs): - constraints = {"agent-1": {}, "agent-2":{}} + def _create_info_dictionary(self, obs, action): + constraints = {"agent-1": {}, "agent-2": {}} for i, key in enumerate(constraints.keys()): - q_pos = self.obs_helper.get_joint_pos_from_obs(obs)[i * 3: (i+1) * 3] - q_vel = self.obs_helper.get_joint_vel_from_obs(obs)[i * 3: (i+1) * 3] + q_pos = self.obs_helper.get_joint_pos_from_obs(obs)[i * 3 : (i + 1) * 3] + q_vel = self.obs_helper.get_joint_vel_from_obs(obs)[i * 3 : (i + 1) * 3] - x_pos, _ = forward_kinematics(self.robot_model, self.robot_data, q_pos, "planar_robot_1/body_ee") + x_pos, _ = forward_kinematics( + self.robot_model, self.robot_data, q_pos, "planar_robot_1/body_ee" + ) # Translate to table space ee_pos = x_pos + self.agents[0]["frame"][:3, 3] @@ -131,33 +166,48 @@ def _create_info_dictionary(self, obs): # ee_constraint: force the ee to stay within the bounds of the table # 1 Dimension on x direction: x > x_lb # 2 Dimension on y direction: y > y_lb, y < y_ub - x_lb = - (self.env_spec['table']['length'] / 2 + self.env_spec['mallet']['radius']) - y_lb = - (self.env_spec['table']['width'] / 
2 - self.env_spec['mallet']['radius']) - y_ub = (self.env_spec['table']['width'] / 2 - self.env_spec['mallet']['radius']) - - constraints[key]["ee_constraints"] = np.array([-ee_pos[0] + x_lb, - -ee_pos[1] + y_lb, ee_pos[1] - y_ub]) + x_lb = -( + self.env_spec["table"]["length"] / 2 + self.env_spec["mallet"]["radius"] + ) + y_lb = -( + self.env_spec["table"]["width"] / 2 - self.env_spec["mallet"]["radius"] + ) + y_ub = ( + self.env_spec["table"]["width"] / 2 - self.env_spec["mallet"]["radius"] + ) + + constraints[key]["ee_constraints"] = np.array( + [-ee_pos[0] + x_lb, -ee_pos[1] + y_lb, ee_pos[1] - y_ub] + ) # joint_pos_constraint: stay within the robots joint position limits constraints[key]["joint_pos_constraints"] = np.zeros(6) - constraints[key]["joint_pos_constraints"][:3] = q_vel - self.obs_helper.get_joint_pos_limits()[1][i * 3: (i+1) * 3] - constraints[key]["joint_pos_constraints"][3:] = self.obs_helper.get_joint_pos_limits()[0][i * 3: (i+1) * 3] - q_vel + constraints[key]["joint_pos_constraints"][:3] = ( + q_vel - self.obs_helper.get_joint_pos_limits()[1][i * 3 : (i + 1) * 3] + ) + constraints[key]["joint_pos_constraints"][3:] = ( + self.obs_helper.get_joint_pos_limits()[0][i * 3 : (i + 1) * 3] - q_vel + ) # joint_vel_constraint: stay within the robots joint velocity limits constraints[key]["joint_vel_constraints"] = np.zeros(6) - constraints[key]["joint_vel_constraints"][:3] = q_vel - self.obs_helper.get_joint_vel_limits()[1][i * 3: (i+1) * 3] - constraints[key]["joint_vel_constraints"][3:] = self.obs_helper.get_joint_vel_limits()[0][i * 3: (i+1) * 3] - q_vel + constraints[key]["joint_vel_constraints"][:3] = ( + q_vel - self.obs_helper.get_joint_vel_limits()[1][i * 3 : (i + 1) * 3] + ) + constraints[key]["joint_vel_constraints"][3:] = ( + self.obs_helper.get_joint_vel_limits()[0][i * 3 : (i + 1) * 3] - q_vel + ) return constraints -if __name__ == '__main__': +if __name__ == "__main__": env = AirHockeyDouble(env_noise=False, obs_noise=False, n_intermediate_steps=4) env.reset() - R = 0. - J = 0. - gamma = 1. + R = 0.0 + J = 0.0 + gamma = 1.0 steps = 0 while True: action = np.random.randn(6) * 5 @@ -169,8 +219,8 @@ def _create_info_dictionary(self, obs): steps += 1 if done or steps > env.info.horizon: print("J: ", J, " R: ", R) - R = 0. - J = 0. - gamma = 1. + R = 0.0 + J = 0.0 + gamma = 1.0 steps = 0 env.reset() diff --git a/mushroom_rl/environments/mujoco_envs/air_hockey/single.py b/mushroom_rl/environments/mujoco_envs/air_hockey/single.py index 5d138a93..2f5c77e5 100644 --- a/mushroom_rl/environments/mujoco_envs/air_hockey/single.py +++ b/mushroom_rl/environments/mujoco_envs/air_hockey/single.py @@ -11,16 +11,32 @@ class AirHockeySingle(AirHockeyBase): Base class for single agent air hockey tasks. """ - def __init__(self, gamma=0.99, horizon=120, env_noise=False, obs_noise=False, timestep=1 / 240., - n_intermediate_steps=1, **viewer_params): + def __init__( + self, + gamma=0.99, + horizon=120, + env_noise=False, + obs_noise=False, + timestep=1 / 240.0, + n_intermediate_steps=1, + **viewer_params, + ): """ Constructor. 
""" self.init_state = np.array([-0.9273, 0.9273, np.pi / 2]) - super().__init__(gamma=gamma, horizon=horizon, env_noise=env_noise, n_agents=1, obs_noise=obs_noise, - timestep=timestep, n_intermediate_steps=n_intermediate_steps, **viewer_params) + super().__init__( + gamma=gamma, + horizon=horizon, + env_noise=env_noise, + n_agents=1, + obs_noise=obs_noise, + timestep=timestep, + n_intermediate_steps=n_intermediate_steps, + **viewer_params, + ) # Remove z position and quaternion from puck pos self.obs_helper.remove_obs("puck_pos", 2) @@ -73,12 +89,20 @@ def get_ee(self): def _modify_observation(self, obs): new_obs = obs.copy() - self._puck_2d_in_robot_frame(self.obs_helper.get_from_obs(new_obs, "puck_pos"), self.agents[0]["frame"]) + self._puck_2d_in_robot_frame( + self.obs_helper.get_from_obs(new_obs, "puck_pos"), self.agents[0]["frame"] + ) - self._puck_2d_in_robot_frame(self.obs_helper.get_from_obs(new_obs, "puck_vel"), self.agents[0]["frame"], type='vel') + self._puck_2d_in_robot_frame( + self.obs_helper.get_from_obs(new_obs, "puck_vel"), + self.agents[0]["frame"], + type="vel", + ) if self.obs_noise: - self.obs_helper.get_from_obs(new_obs, "puck_pos")[:] += np.random.randn(2) * 0.001 + self.obs_helper.get_from_obs(new_obs, "puck_pos")[:] += ( + np.random.randn(2) * 0.001 + ) return new_obs @@ -87,7 +111,9 @@ def setup(self, obs): self.has_bounce = False for i in range(3): - self._data.joint("planar_robot_1/joint_" + str(i+1)).qpos = self.init_state[i] + self._data.joint("planar_robot_1/joint_" + str(i + 1)).qpos = ( + self.init_state[i] + ) super().setup(obs) mujoco.mj_fwdPosition(self._model, self._data) @@ -103,12 +129,14 @@ def _create_observation(self, state): obs = super(AirHockeySingle, self)._create_observation(state) return np.append(obs, [self.has_hit, self.has_bounce]) - def _create_info_dictionary(self, obs): + def _create_info_dictionary(self, obs, action): constraints = {} q_pos = self.obs_helper.get_joint_pos_from_obs(obs) q_vel = self.obs_helper.get_joint_vel_from_obs(obs) - x_pos, _ = forward_kinematics(self.robot_model, self.robot_data, q_pos, "planar_robot_1/body_ee") + x_pos, _ = forward_kinematics( + self.robot_model, self.robot_data, q_pos, "planar_robot_1/body_ee" + ) # Translate to table space ee_pos = x_pos + self.agents[0]["frame"][:3, 3] @@ -116,22 +144,34 @@ def _create_info_dictionary(self, obs): # ee_constraint: force the ee to stay within the bounds of the table # 1 Dimension on x direction: x > x_lb # 2 Dimension on y direction: y > y_lb, y < y_ub - x_lb = - (self.env_spec['table']['length'] / 2 + self.env_spec['mallet']['radius']) - y_lb = - (self.env_spec['table']['width'] / 2 - self.env_spec['mallet']['radius']) - y_ub = (self.env_spec['table']['width'] / 2 - self.env_spec['mallet']['radius']) - - constraints["ee_constraints"] = np.array([-ee_pos[0] + x_lb, - -ee_pos[1] + y_lb, ee_pos[1] - y_ub]) + x_lb = -( + self.env_spec["table"]["length"] / 2 + self.env_spec["mallet"]["radius"] + ) + y_lb = -( + self.env_spec["table"]["width"] / 2 - self.env_spec["mallet"]["radius"] + ) + y_ub = self.env_spec["table"]["width"] / 2 - self.env_spec["mallet"]["radius"] + + constraints["ee_constraints"] = np.array( + [-ee_pos[0] + x_lb, -ee_pos[1] + y_lb, ee_pos[1] - y_ub] + ) # joint_pos_constraint: stay within the robots joint position limits constraints["joint_pos_constraints"] = np.zeros(6) - constraints["joint_pos_constraints"][:3] = q_vel - self.obs_helper.get_joint_pos_limits()[1] - constraints["joint_pos_constraints"][3:] = 
self.obs_helper.get_joint_pos_limits()[0] - q_vel + constraints["joint_pos_constraints"][:3] = ( + q_vel - self.obs_helper.get_joint_pos_limits()[1] + ) + constraints["joint_pos_constraints"][3:] = ( + self.obs_helper.get_joint_pos_limits()[0] - q_vel + ) # joint_vel_constraint: stay within the robots joint velocity limits constraints["joint_vel_constraints"] = np.zeros(6) - constraints["joint_vel_constraints"][:3] = q_vel - self.obs_helper.get_joint_vel_limits()[1] - constraints["joint_vel_constraints"][3:] = self.obs_helper.get_joint_vel_limits()[0] - q_vel + constraints["joint_vel_constraints"][:3] = ( + q_vel - self.obs_helper.get_joint_vel_limits()[1] + ) + constraints["joint_vel_constraints"][3:] = ( + self.obs_helper.get_joint_vel_limits()[0] - q_vel + ) return constraints - diff --git a/mushroom_rl/environments/mujoco_envs/ant.py b/mushroom_rl/environments/mujoco_envs/ant.py index 3f5c3fe0..bb55342e 100644 --- a/mushroom_rl/environments/mujoco_envs/ant.py +++ b/mushroom_rl/environments/mujoco_envs/ant.py @@ -1,5 +1,5 @@ from pathlib import Path - +from typing import Tuple import numpy as np from mushroom_rl.environments.mujoco import MuJoCo, ObservationType @@ -11,23 +11,24 @@ class Ant(MuJoCo): """ The Ant MuJoCo environment as presented in: "High-Dimensional Continuous Control Using Generalized Advantage Estimation". John Schulman et. al.. 2015. + and implemented in Gymnasium """ def __init__( self, - gamma=0.99, - horizon=1000, - forward_reward_weight=1.0, - ctrl_cost_weight=0.5, - contact_cost_weight=5e-4, - healthy_reward=1.0, - terminate_when_unhealthy=True, - healthy_z_range=(0.2, 1.0), - contact_force_range=(-1.0, 1.0), - reset_noise_scale=0.1, - n_substeps=5, - exclude_current_positions_from_observation=True, - use_contact_forces=False, + gamma: float = 0.99, + horizon: int = 1000, + forward_reward_weight: float = 1.0, + ctrl_cost_weight: float = 0.5, + contact_cost_weight: float = 5e-4, + healthy_reward: float = 1.0, + terminate_when_unhealthy: bool = True, + healthy_z_range: Tuple[float, float] = (0.2, 1.0), + contact_force_range: Tuple[float, float] = (-1.0, 1.0), + reset_noise_scale: float = 0.1, + n_substeps: int = 5, + exclude_current_positions_from_observation: bool = True, + use_contact_forces: bool = False, **viewer_params, ): """ @@ -72,10 +73,8 @@ def __init__( ] additional_data_spec = [ - # Include torso vel for reward calculation ("torso_pos", "torso", ObservationType.BODY_POS), ("torso_vel", "torso", ObservationType.BODY_VEL_WORLD), - ("root_pose", "root", ObservationType.JOINT_POS), ] collision_groups = [ @@ -129,28 +128,26 @@ def _is_finite(self): states = self.get_states() return np.isfinite(states).all() - def _is_within_z_range(self, z_pos): - z_pos = self._read_data("root_pose")[2] + def _is_within_z_range(self): + z_pos = self._read_data("torso_pos")[2] min_z, max_z = self._healthy_z_range return min_z <= z_pos <= max_z - def _is_healthy(self, obs): - is_healthy = self._is_finite() and self._is_within_z_range(obs) + def _is_healthy(self): + is_healthy = self._is_finite() and self._is_within_z_range() return is_healthy def is_absorbing(self, obs): - absorbing = self._terminate_when_unhealthy and not self._is_healthy(obs) + absorbing = self._terminate_when_unhealthy and not self._is_healthy() return absorbing def _get_healthy_reward(self, obs): return ( - self._healthy_reward - if self._is_healthy(obs) or self._terminate_when_unhealthy - else 0.0 - ) + self._terminate_when_unhealthy and self._is_healthy() + ) * self._healthy_reward def 
diff --git a/mushroom_rl/environments/mujoco_envs/half_cheetah.py b/mushroom_rl/environments/mujoco_envs/half_cheetah.py
index 3a9c3937..9fa02cea 100644
--- a/mushroom_rl/environments/mujoco_envs/half_cheetah.py
+++ b/mushroom_rl/environments/mujoco_envs/half_cheetah.py
@@ -55,6 +55,7 @@ def __init__(
         additional_data_spec = [
             ("x_pos", "rootx", ObservationType.JOINT_POS),
+            ("torso_vel", "torso", ObservationType.BODY_VEL_WORLD),
         ]

         self._forward_reward_weight = forward_reward_weight
@@ -92,11 +93,8 @@ def _create_observation(self, obs):
     def is_absorbing(self, obs):
         return False

-    def _get_x_vel(self):
-        return (self._next_x_pos - self._x_pos) / self.dt
-
     def _get_forward_reward(self):
-        forward_reward = self._get_x_vel()
+        forward_reward = self._read_data("torso_vel")[3]
         return self._forward_reward_weight * forward_reward

     def _get_ctrl_cost(self, action):
@@ -109,34 +107,24 @@ def reward(self, obs, action, next_obs, absorbing):
         reward = forward_reward - ctrl_cost
         return reward

+    def _generate_noise(self):
+        self._data.qpos[:] = self._data.qpos + np.random.uniform(
+            -self._reset_noise_scale, self._reset_noise_scale, self._model.nq
+        )
+        self._data.qvel[:] = self._data.qvel + np.random.uniform(
+            -self._reset_noise_scale, self._reset_noise_scale, self._model.nv
+        )
+
     def setup(self, obs):
         super().setup(obs)
-        self._data.qpos[:] = (
-            self._data.qpos
-            + np.random.uniform(
-                -self._reset_noise_scale, self._reset_noise_scale, self._model.nq
-            )
-        ).copy()
-        self._data.qvel[:] = (
-            self._data.qvel
-            + self._reset_noise_scale * np.random.standard_normal(self._model.nv).copy()
-        ).copy()
+
+        self._generate_noise()

         mujoco.mj_forward(self._model, self._data)  # type: ignore

-    def _create_info_dictionary(self, obs):
+    def _create_info_dictionary(self, obs, action):
         info = {
-            "x_vel": self._get_x_vel(),
             "forward_reward": self._get_forward_reward(),
         }
-        # if action is not None:
-        #     info["ctrl_cost"] = self._get_ctrl_cost(action, ctrl_cost_weight=1)
+        info["ctrl_cost"] = self._get_ctrl_cost(action)
         return info
-
-    def _step_init(self, obs, action):
-        super()._step_init(obs, action)
-        self._x_pos = self._read_data("x_pos").item()
-
-    def _step_finalize(self):
-        super()._step_finalize()
-        self._next_x_pos = self._read_data("x_pos").item()
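Editor's note: all of the locomotion environments in this patch now read the forward velocity from the "torso_vel" entry instead of finite-differencing "x_pos" across a step. Assuming ObservationType.BODY_VEL_WORLD follows MuJoCo's 6D body-velocity layout (three angular components followed by three linear components), index 3 is the linear velocity along x, which is why _get_forward_reward indexes element 3. A self-contained sketch of that indexing with a made-up sample reading:

import numpy as np

# Assumed layout: [wx, wy, wz, vx, vy, vz]
torso_vel = np.array([0.0, 0.1, 0.0, 1.3, 0.02, -0.01])

angular, linear = torso_vel[:3], torso_vel[3:]
forward_velocity = torso_vel[3]            # == linear[0], the value used in the reward
forward_reward = 1.0 * forward_velocity    # forward_reward_weight defaults to 1.0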
diff --git a/mushroom_rl/environments/mujoco_envs/hopper.py b/mushroom_rl/environments/mujoco_envs/hopper.py
index 491b6332..a079a0e6 100644
--- a/mushroom_rl/environments/mujoco_envs/hopper.py
+++ b/mushroom_rl/environments/mujoco_envs/hopper.py
@@ -53,6 +53,7 @@ def __init__(
         additional_data_spec = [
             ("x_pos", "rootx", ObservationType.JOINT_POS),
+            ("torso_vel", "torso", ObservationType.BODY_VEL_WORLD),
         ]

         self._forward_reward_weight = forward_reward_weight
@@ -87,32 +88,51 @@ def _modify_mdp_info(self, mdp_info):
     def _create_observation(self, obs):
         obs = super()._create_observation(obs)
+        # Clip the qvels
         obs[5:] = np.clip(obs[5:], -10, 10)
         if not self._exclude_current_positions_from_observation:
             x_pos = self._read_data("x_pos")
             obs = np.concatenate([obs, x_pos])
         return obs

+    def _is_within_state_range(self) -> bool:
+        """Check if state variables are within the healthy range."""
+        state = self.get_states()
+        min_state, max_state = self._healthy_state_range
+        return np.all(np.logical_and(min_state < state, state < max_state))
+
+    def _is_within_z_range(self, obs: np.ndarray) -> bool:
+        """Check if Z position of torso is within the healthy range."""
+        z_pos = self.obs_helper.get_from_obs(obs, "z_pos").item()
+        min_z, max_z = self._healthy_z_range
+        return min_z < z_pos < max_z
+
+    def _is_within_angle_range(self, obs):
+        """Check if Y angle of torso is within the healthy range."""
+        y_angle = self.obs_helper.get_from_obs(obs, "y_pos").item()
+        min_angle, max_angle = self._healthy_angle_range
+        return min_angle < y_angle < max_angle
+
+    def _is_healthy(self, obs: np.ndarray) -> bool:
+        """Check if the agent is healthy."""
+        is_within_state_range = self._is_within_state_range()
+        is_within_z_range = self._is_within_z_range(obs)
+        is_within_angle_range = self._is_within_angle_range(obs)
+        return is_within_state_range and is_within_z_range and is_within_angle_range
+
     def is_absorbing(self, obs):
         """Return True if the agent is unhealthy and terminate_when_unhealthy is True."""
         return self._terminate_when_unhealthy and not self._is_healthy(obs)

-    def _get_x_vel(self):
-        x_pos = self._x_pos
-        next_x_pos = self._next_x_pos
-        return (next_x_pos - x_pos) / self.dt
-
-    def _get_forward_reward(self):
-        forward_reward = self._get_x_vel()
-        return self._forward_reward_weight * forward_reward
-
     def _get_healthy_reward(self, obs):
         """Return the healthy reward if the agent is healthy, else 0."""
         return (
-            self._healthy_reward
-            if self._is_healthy(obs) or self._terminate_when_unhealthy
-            else 0
-        )
+            self._is_healthy(obs) or self._terminate_when_unhealthy
+        ) * self._healthy_reward
+
+    def _get_forward_reward(self):
+        forward_reward = self._read_data("torso_vel")[3]
+        return self._forward_reward_weight * forward_reward

     def _get_ctrl_cost(self, action):
         """Return the control cost."""
@@ -126,69 +146,29 @@ def reward(self, obs, action, next_obs, absorbing):
         reward = healthy_reward + forward_reward - ctrl_cost
         return reward

-    def _is_healthy(self, obs: np.ndarray) -> bool:
-        """Check if the agent is healthy."""
-        is_within_state_range = self._is_within_state_range(obs)
-        is_within_z_range = self._is_within_z_range(obs)
-        is_within_angle_range = self._is_within_angle_range(obs)
-        return is_within_state_range and is_within_z_range and is_within_angle_range
-
-    def _is_within_state_range(self, obs: np.ndarray) -> bool:
-        """Check if state variables are within the healthy range."""
-        idx = 1 if self._exclude_current_positions_from_observation else 2
-        state_values = obs[idx:]
-        min_state, max_state = self._healthy_state_range
-        return all(min_state < value.item() < max_state for value in state_values)
-
-    def _is_within_z_range(self, obs: np.ndarray) -> bool:
-        """Check if Z position of torso is within the healthy range."""
-        z_pos = self.obs_helper.get_from_obs(obs, "z_pos").item()
-        min_z, max_z = self._healthy_z_range
-        return min_z < z_pos < max_z
-
-    def _is_within_angle_range(self, obs):
-        """Check if Y angle of torso is within the healthy range."""
-        y_angle = self.obs_helper.get_from_obs(obs, "y_pos").item()
-        min_angle, max_angle = self._healthy_angle_range
-        return min_angle < y_angle < max_angle
-
-    def _get_error(
-        self,
-    ):
-        x_vel = self._get_x_vel()
-        return np.abs(x_vel - self.target_vel)
+    def _generate_noise(self):
+        self._data.qpos[:] = self._data.qpos + np.random.uniform(
+            -self._reset_noise_scale, self._reset_noise_scale, self._model.nq
+        )
+        self._data.qvel[:] = self._data.qvel + np.random.uniform(
+            -self._reset_noise_scale, self._reset_noise_scale, self._model.nv
+        )

     def setup(self, obs):
         super().setup(obs)

-        self._data.qpos[:] = (
-            self._data.qpos
-            + np.random.uniform(
-                -self._reset_noise_scale, self._reset_noise_scale, self._model.nq
-            )
-        ).copy()
-        self._data.qvel[:] = (
-            self._data.qvel
-            + np.random.uniform(
-                -self._reset_noise_scale, self._reset_noise_scale, self._model.nv
-            )
-        ).copy()
+        self._generate_noise()

         mujoco.mj_forward(self._model, self._data)  # type: ignore

-    def _create_info_dictionary(self, obs):
+    def _create_info_dictionary(self, obs, action):
         info = {
             "healthy_reward": self._get_healthy_reward(obs),
             "forward_reward": self._get_forward_reward(),
         }
-        # if action is not None:
-        #     info["ctrl_cost"] = self._get_ctrl_cost(action, ctrl_cost_weight=1)
+        info["ctrl_cost"] = self._get_ctrl_cost(action)
         return info

-    def _step_init(self, obs, action):
-        super()._step_init(obs, action)
-        self._x_pos = self._read_data("x_pos").item()
-
-    def _step_finalize(self):
-        super()._step_finalize()
-        self._next_x_pos = self._read_data("x_pos").item()
+    def get_states(self):
+        """Return the position and velocity joint states of the model"""
+        return np.concatenate([self._data.qpos.flat, self._data.qvel.flat])
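Editor's note: the Hopper health check now compares the full [qpos, qvel] vector from get_states against healthy_state_range in a single vectorized comparison, replacing the removed per-element loop over the observation. A numpy-only illustration with made-up values (the range shown is the Gymnasium Hopper default):

import numpy as np

healthy_state_range = (-100.0, 100.0)
state = np.concatenate([np.array([0.0, 1.25, -0.003, 0.02, 0.01, 0.4]),   # stand-in qpos
                        np.array([0.1, -0.2, 0.0, 0.05, 0.01, 0.0])])     # stand-in qvel

min_state, max_state = healthy_state_range
is_within_state_range = np.all(np.logical_and(min_state < state, state < max_state))
print(is_within_state_range)   # True for these values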
diff --git a/mushroom_rl/environments/mujoco_envs/walker_2d.py b/mushroom_rl/environments/mujoco_envs/walker_2d.py
index 204147b1..0025355d 100644
--- a/mushroom_rl/environments/mujoco_envs/walker_2d.py
+++ b/mushroom_rl/environments/mujoco_envs/walker_2d.py
@@ -25,7 +25,6 @@ def __init__(
         reset_noise_scale=5e-3,
         exclude_current_positions_from_observation=True,
         n_substeps=4,
-        normalize_reward=False,
         **viewer_params,
     ):
         """
@@ -66,6 +65,7 @@ def __init__(
         additional_data_spec = [
             ("x_pos", "rootx", ObservationType.JOINT_POS),
+            ("torso_vel", "torso", ObservationType.BODY_VEL_WORLD),
         ]

         self._forward_reward_weight = forward_reward_weight
@@ -99,39 +99,42 @@ def _modify_mdp_info(self, mdp_info):
     def _create_observation(self, obs):
         obs = super()._create_observation(obs)
+        # Clip qvels
         obs[9:] = np.clip(obs[9:], -10, 10)
         if not self._exclude_current_positions_from_observation:
             x_pos = self._read_data("x_pos")
             obs = np.concatenate([obs, x_pos])
         return obs

+    def _is_within_z_range(self, obs):
+        """Check if Z position of torso is within the healthy range."""
+        z_position = self.obs_helper.get_from_obs(obs, "z_pos").item()
+        min_z, max_z = self._healthy_z_range
+        return min_z < z_position < max_z
+
+    def _is_within_angle_range(self, obs):
+        """Check if y-angle of torso is within the healthy range."""
+        y_angle = self.obs_helper.get_from_obs(obs, "y_pos").item()
+        min_angle, max_angle = self._healthy_angle_range
+        return min_angle < y_angle < max_angle
+
     def is_absorbing(self, obs):
-        """Return True if the agent is unhealthy and terminate_when_unhealthy is True."""
         return self._terminate_when_unhealthy and not self._is_healthy(obs)

     def _is_healthy(self, obs):
-        """Check if the agent is healthy."""
-        is_finite = self._is_finite(obs)
         is_within_z_range = self._is_within_z_range(obs)
         is_within_angle_range = self._is_within_angle_range(obs)
-        return is_finite and is_within_z_range and is_within_angle_range
-
-    def _get_x_vel(self):
-        x_pos = self._x_pos
-        next_x_pos = self._next_x_pos
-        return (next_x_pos - x_pos) / self.dt
-
-    def _get_forward_reward(self):
-        forward_reward = self._get_x_vel()
-        return self._forward_reward_weight * forward_reward
+        return is_within_z_range and is_within_angle_range

     def _get_healthy_reward(self, obs):
         """Return the healthy reward if the agent is healthy, else 0."""
         return (
-            self._healthy_reward
-            if self._is_healthy(obs) or self._terminate_when_unhealthy
-            else 0
-        )
+            self._is_healthy(obs) or self._terminate_when_unhealthy
+        ) * self._healthy_reward
+
+    def _get_forward_reward(self):
+        forward_reward = self._read_data("torso_vel")[3]
+        return self._forward_reward_weight * forward_reward

     def _get_ctrl_cost(self, action):
         """Return the control cost."""
@@ -145,57 +148,25 @@ def reward(self, obs, action, next_obs, absorbing):
         reward = healthy_reward + forward_reward - ctrl_cost
         return reward

-    def _is_finite(self, obs):
-        """Check if the observation is finite."""
-        return np.isfinite(obs).all()
-
-    def _is_within_z_range(self, obs):
-        """Check if Z position of torso is within the healthy range."""
-        z_position = self.obs_helper.get_from_obs(obs, "z_pos").item()
-        min_z, max_z = self._healthy_z_range
-        return min_z < z_position < max_z
-
-    def _is_within_angle_range(self, obs):
-        """Check if y-angle of torso is within the healthy range."""
-        y_angle = self.obs_helper.get_from_obs(obs, "y_pos").item()
-        min_angle, max_angle = self._healthy_angle_range
-        return min_angle < y_angle < max_angle
-
-    def _get_error(self):
-        x_vel = self._get_x_vel()
-        return np.abs(self.target_vel - x_vel)
+    def _generate_noise(self):
+        self._data.qpos[:] = self._data.qpos + np.random.uniform(
+            -self._reset_noise_scale, self._reset_noise_scale, self._model.nq
+        )
+        self._data.qvel[:] = self._data.qvel + np.random.uniform(
+            -self._reset_noise_scale, self._reset_noise_scale, self._model.nv
+        )

     def setup(self, obs):
         super().setup(obs)

-        self._data.qpos[:] = (
-            self._data.qpos
-            + np.random.uniform(
-                -self._reset_noise_scale, self._reset_noise_scale, self._model.nq
-            )
-        ).copy()
-        self._data.qvel[:] = (
-            self._data.qvel
-            + np.random.uniform(
-                -self._reset_noise_scale, self._reset_noise_scale, self._model.nv
-            )
-        ).copy()
+        self._generate_noise()

         mujoco.mj_forward(self._model, self._data)  # type: ignore

-    def _create_info_dictionary(self, obs):
+    def _create_info_dictionary(self, obs, action):
         info = {
             "healthy_reward": self._get_healthy_reward(obs),
             "forward_reward": self._get_forward_reward(),
         }
-        # if action is not None:
-        #     info["ctrl_cost"] = self._get_ctrl_cost(action, ctrl_cost_weight=1)
+        info["ctrl_cost"] = self._get_ctrl_cost(action)
         return info
-
-    def _step_init(self, obs, action):
-        super()._step_init(obs, action)
-        self._x_pos = self._read_data("x_pos").item()
-
-    def _step_finalize(self):
-        super()._step_finalize()
-        self._next_x_pos = self._read_data("x_pos").item()
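Editor's note: HalfCheetah, Hopper and Walker2d now share the same _generate_noise helper, which perturbs qpos and qvel with i.i.d. uniform noise at reset (the old HalfCheetah code used Gaussian noise for qvel). A numpy-only sketch of that operation; the sizes and scale are made-up stand-ins for model.nq, model.nv and reset_noise_scale:

import numpy as np

reset_noise_scale = 5e-3
nq, nv = 9, 9                      # stand-ins for model.nq / model.nv

qpos = np.zeros(nq)                # stand-in for data.qpos
qvel = np.zeros(nv)                # stand-in for data.qvel

qpos = qpos + np.random.uniform(-reset_noise_scale, reset_noise_scale, nq)
qvel = qvel + np.random.uniform(-reset_noise_scale, reset_noise_scale, nv)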