From dea712f8af1ab56188e0661f0bc78e42dc3fbc90 Mon Sep 17 00:00:00 2001 From: Noah Farr Date: Sat, 28 Dec 2024 23:58:09 +0100 Subject: [PATCH] Finish locomotion envs --- examples/mujoco_ant_ppo.py | 121 ++++++++++++ examples/mujoco_ant_sac.py | 159 ---------------- examples/mujoco_half_cheetah_ppo.py | 121 ++++++++++++ examples/mujoco_half_cheetah_sac.py | 159 ---------------- examples/mujoco_hopper_ppo.py | 121 ++++++++++++ examples/mujoco_hopper_sac.py | 159 ---------------- examples/mujoco_walker_2d_ppo.py | 121 ++++++++++++ examples/mujoco_walker_2d_sac.py | 159 ---------------- mushroom_rl/environments/mujoco.py | 177 +++++++++++++----- .../mujoco_envs/air_hockey/double.py | 122 ++++++++---- .../mujoco_envs/air_hockey/single.py | 82 +++++--- mushroom_rl/environments/mujoco_envs/ant.py | 63 +++---- .../environments/mujoco_envs/half_cheetah.py | 40 ++-- .../environments/mujoco_envs/hopper.py | 112 +++++------ .../environments/mujoco_envs/walker_2d.py | 91 +++------ 15 files changed, 878 insertions(+), 929 deletions(-) create mode 100644 examples/mujoco_ant_ppo.py delete mode 100644 examples/mujoco_ant_sac.py create mode 100644 examples/mujoco_half_cheetah_ppo.py delete mode 100644 examples/mujoco_half_cheetah_sac.py create mode 100644 examples/mujoco_hopper_ppo.py delete mode 100644 examples/mujoco_hopper_sac.py create mode 100644 examples/mujoco_walker_2d_ppo.py delete mode 100644 examples/mujoco_walker_2d_sac.py diff --git a/examples/mujoco_ant_ppo.py b/examples/mujoco_ant_ppo.py new file mode 100644 index 00000000..362ecaf5 --- /dev/null +++ b/examples/mujoco_ant_ppo.py @@ -0,0 +1,121 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim + +from mushroom_rl.algorithms.actor_critic import PPO +from mushroom_rl.core import Core, Logger +from mushroom_rl.environments.mujoco_envs.ant import Ant +from mushroom_rl.policy import GaussianTorchPolicy + +from tqdm import trange + + +class Network(nn.Module): + def __init__(self, input_shape, output_shape, n_features, **kwargs): + super(Network, self).__init__() + + n_input = input_shape[-1] + n_output = output_shape[0] + + self._h1 = nn.Linear(n_input, n_features) + self._h2 = nn.Linear(n_features, n_features) + self._h3 = nn.Linear(n_features, n_output) + + nn.init.xavier_uniform_( + self._h1.weight, gain=nn.init.calculate_gain("relu") / 10 + ) + nn.init.xavier_uniform_( + self._h2.weight, gain=nn.init.calculate_gain("relu") / 10 + ) + nn.init.xavier_uniform_( + self._h3.weight, gain=nn.init.calculate_gain("linear") / 10 + ) + + def forward(self, state, **kwargs): + features1 = F.relu(self._h1(torch.squeeze(state, 1).float())) + features2 = F.relu(self._h2(features1)) + a = self._h3(features2) + + return a + + +def experiment(n_epochs, n_steps, n_episodes_test, seed=0): + + np.random.seed(seed) + torch.manual_seed(seed) + + logger = Logger(PPO.__name__, results_dir=None) + logger.strong_line() + logger.info("Experiment Algorithm: " + PPO.__name__) + + mdp = Ant() + + actor_lr = 3e-4 + critic_lr = 3e-4 + n_features = 32 + batch_size = 64 + n_epochs_policy = 10 + eps = 0.2 + lam = 0.95 + std_0 = 1.0 + n_steps_per_fit = 2000 + + critic_params = dict( + network=Network, + optimizer={"class": optim.Adam, "params": {"lr": critic_lr}}, + loss=F.mse_loss, + n_features=n_features, + batch_size=batch_size, + input_shape=mdp.info.observation_space.shape, + output_shape=(1,), + ) + + alg_params = dict( + actor_optimizer={"class": optim.Adam, "params": {"lr": actor_lr}}, + 
n_epochs_policy=n_epochs_policy, + batch_size=batch_size, + eps_ppo=eps, + lam=lam, + critic_params=critic_params, + ) + + policy_params = dict(std_0=std_0, n_features=n_features) + + policy = GaussianTorchPolicy( + Network, + mdp.info.observation_space.shape, + mdp.info.action_space.shape, + **policy_params, + ) + + agent = PPO(mdp.info, policy, **alg_params) + + core = Core(agent, mdp) + + dataset = core.evaluate(n_episodes=n_episodes_test, render=False) + + J = np.mean(dataset.discounted_return) + R = np.mean(dataset.undiscounted_return) + E = agent.policy.entropy() + + logger.epoch_info(0, J=J, R=R, entropy=E) + + for it in trange(n_epochs, leave=False): + core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit) + dataset = core.evaluate(n_episodes=n_episodes_test, render=False) + + J = np.mean(dataset.discounted_return) + R = np.mean(dataset.undiscounted_return) + E = agent.policy.entropy() + + logger.epoch_info(it + 1, J=J, R=R, entropy=E) + + logger.info("Press a button to visualize") + input() + core.evaluate(n_episodes=5, render=True) + + +if __name__ == "__main__": + experiment(n_epochs=50, n_steps=30000, n_episodes_test=10) diff --git a/examples/mujoco_ant_sac.py b/examples/mujoco_ant_sac.py deleted file mode 100644 index e633add2..00000000 --- a/examples/mujoco_ant_sac.py +++ /dev/null @@ -1,159 +0,0 @@ -import numpy as np - -import torch -import torch.nn as nn -import torch.optim as optim -import torch.nn.functional as F - -from mushroom_rl.algorithms.actor_critic import SAC -from mushroom_rl.core import Core, Logger -from mushroom_rl.environments.mujoco_envs.ant import Ant - -from tqdm import trange - - -class CriticNetwork(nn.Module): - def __init__(self, input_shape, output_shape, n_features, **kwargs): - super().__init__() - - n_input = input_shape[-1] - n_output = output_shape[0] - - self._h1 = nn.Linear(n_input, n_features) - self._h2 = nn.Linear(n_features, n_features) - self._h3 = nn.Linear(n_features, n_output) - - nn.init.xavier_uniform_(self._h1.weight, gain=nn.init.calculate_gain("relu")) - nn.init.xavier_uniform_(self._h2.weight, gain=nn.init.calculate_gain("relu")) - nn.init.xavier_uniform_(self._h3.weight, gain=nn.init.calculate_gain("linear")) - - def forward(self, state, action): - state_action = torch.cat((state.float(), action.float()), dim=1) - features1 = F.relu(self._h1(state_action)) - features2 = F.relu(self._h2(features1)) - q = self._h3(features2) - - return torch.squeeze(q) - - -class ActorNetwork(nn.Module): - def __init__(self, input_shape, output_shape, n_features, **kwargs): - super(ActorNetwork, self).__init__() - - n_input = input_shape[-1] - n_output = output_shape[0] - - self._h1 = nn.Linear(n_input, n_features) - self._h2 = nn.Linear(n_features, n_features) - self._h3 = nn.Linear(n_features, n_output) - - nn.init.xavier_uniform_(self._h1.weight, gain=nn.init.calculate_gain("relu")) - nn.init.xavier_uniform_(self._h2.weight, gain=nn.init.calculate_gain("relu")) - nn.init.xavier_uniform_(self._h3.weight, gain=nn.init.calculate_gain("linear")) - - def forward(self, state): - features1 = F.relu(self._h1(torch.squeeze(state, 1).float())) - features2 = F.relu(self._h2(features1)) - a = self._h3(features2) - - return a - - -def experiment(alg, n_epochs, n_steps, n_episodes_test): - np.random.seed() - - logger = Logger(alg.__name__, results_dir=None) - logger.strong_line() - logger.info("Experiment Algorithm: " + alg.__name__) - - # MDP - mdp = Ant() - - # Settings - initial_replay_size = 5_000 - max_replay_size = 500_000 - batch_size = 256 - 
n_features = 256 - warmup_transitions = 10_000 - tau = 5e-3 - lr_alpha = 3e-4 - - # Approximator - actor_input_shape = mdp.info.observation_space.shape - actor_mu_params = dict( - network=ActorNetwork, - n_features=n_features, - input_shape=actor_input_shape, - output_shape=mdp.info.action_space.shape, - ) - actor_sigma_params = dict( - network=ActorNetwork, - n_features=n_features, - input_shape=actor_input_shape, - output_shape=mdp.info.action_space.shape, - ) - - actor_optimizer = {"class": optim.Adam, "params": {"lr": 1e-4}} - - critic_input_shape = (actor_input_shape[0] + mdp.info.action_space.shape[0],) - critic_params = dict( - network=CriticNetwork, - optimizer={"class": optim.Adam, "params": {"lr": 3e-4}}, - loss=F.mse_loss, - n_features=n_features, - input_shape=critic_input_shape, - output_shape=(1,), - ) - - # Agent - agent = alg( - mdp.info, - actor_mu_params, - actor_sigma_params, - actor_optimizer, - critic_params, - batch_size, - initial_replay_size, - max_replay_size, - warmup_transitions, - tau, - lr_alpha, - critic_fit_params=None, - ) - - # Algorithm - core = Core(agent, mdp) - - # RUN - dataset = core.evaluate(n_episodes=n_episodes_test, render=False) - - J = np.mean(dataset.discounted_return) - R = np.mean(dataset.undiscounted_return) - E = agent.policy.entropy(dataset.state) - - logger.epoch_info(0, J=J, R=R, entropy=E) - - core.learn( - n_steps=initial_replay_size, n_steps_per_fit=initial_replay_size, quiet=True - ) - - for n in trange(n_epochs, leave=False): - core.learn(n_steps=n_steps, n_steps_per_fit=1, quiet=True) - dataset = core.evaluate(n_episodes=n_episodes_test, render=False, quiet=True) - - J = np.mean(dataset.discounted_return) - R = np.mean(dataset.undiscounted_return) - E = agent.policy.entropy(dataset.state) - - logger.epoch_info(n + 1, J=J, R=R, entropy=E) - - logger.info("Press a button to visualize") - input() - core.evaluate(n_episodes=5, render=True) - - -if __name__ == "__main__": - algs = [SAC] - - for alg in algs: - experiment(alg=alg, n_epochs=50, n_steps=30000, n_episodes_test=10) diff --git a/examples/mujoco_half_cheetah_ppo.py b/examples/mujoco_half_cheetah_ppo.py new file mode 100644 index 00000000..643e09d3 --- /dev/null +++ b/examples/mujoco_half_cheetah_ppo.py @@ -0,0 +1,121 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim + +from mushroom_rl.algorithms.actor_critic import PPO +from mushroom_rl.core import Core, Logger +from mushroom_rl.environments.mujoco_envs.half_cheetah import HalfCheetah +from mushroom_rl.policy import GaussianTorchPolicy + +from tqdm import trange + + +class Network(nn.Module): + def __init__(self, input_shape, output_shape, n_features, **kwargs): + super(Network, self).__init__() + + n_input = input_shape[-1] + n_output = output_shape[0] + + self._h1 = nn.Linear(n_input, n_features) + self._h2 = nn.Linear(n_features, n_features) + self._h3 = nn.Linear(n_features, n_output) + + nn.init.xavier_uniform_( + self._h1.weight, gain=nn.init.calculate_gain("relu") / 10 + ) + nn.init.xavier_uniform_( + self._h2.weight, gain=nn.init.calculate_gain("relu") / 10 + ) + nn.init.xavier_uniform_( + self._h3.weight, gain=nn.init.calculate_gain("linear") / 10 + ) + + def forward(self, state, **kwargs): + features1 = F.relu(self._h1(torch.squeeze(state, 1).float())) + features2 = F.relu(self._h2(features1)) + a = self._h3(features2) + + return a + + +def experiment(n_epochs, n_steps, n_episodes_test, seed=0): + + np.random.seed(seed) + torch.manual_seed(seed) 
+ + logger = Logger(PPO.__name__, results_dir=None) + logger.strong_line() + logger.info("Experiment Algorithm: " + PPO.__name__) + + mdp = HalfCheetah() + + actor_lr = 3e-4 + critic_lr = 3e-4 + n_features = 32 + batch_size = 64 + n_epochs_policy = 10 + eps = 0.2 + lam = 0.95 + std_0 = 1.0 + n_steps_per_fit = 2000 + + critic_params = dict( + network=Network, + optimizer={"class": optim.Adam, "params": {"lr": critic_lr}}, + loss=F.mse_loss, + n_features=n_features, + batch_size=batch_size, + input_shape=mdp.info.observation_space.shape, + output_shape=(1,), + ) + + alg_params = dict( + actor_optimizer={"class": optim.Adam, "params": {"lr": actor_lr}}, + n_epochs_policy=n_epochs_policy, + batch_size=batch_size, + eps_ppo=eps, + lam=lam, + critic_params=critic_params, + ) + + policy_params = dict(std_0=std_0, n_features=n_features) + + policy = GaussianTorchPolicy( + Network, + mdp.info.observation_space.shape, + mdp.info.action_space.shape, + **policy_params, + ) + + agent = PPO(mdp.info, policy, **alg_params) + + core = Core(agent, mdp) + + dataset = core.evaluate(n_episodes=n_episodes_test, render=False) + + J = np.mean(dataset.discounted_return) + R = np.mean(dataset.undiscounted_return) + E = agent.policy.entropy() + + logger.epoch_info(0, J=J, R=R, entropy=E) + + for it in trange(n_epochs, leave=False): + core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit) + dataset = core.evaluate(n_episodes=n_episodes_test, render=False) + + J = np.mean(dataset.discounted_return) + R = np.mean(dataset.undiscounted_return) + E = agent.policy.entropy() + + logger.epoch_info(it + 1, J=J, R=R, entropy=E) + + logger.info("Press a button to visualize") + input() + core.evaluate(n_episodes=5, render=True) + + +if __name__ == "__main__": + experiment(n_epochs=50, n_steps=30000, n_episodes_test=10) diff --git a/examples/mujoco_half_cheetah_sac.py b/examples/mujoco_half_cheetah_sac.py deleted file mode 100644 index d3b56226..00000000 --- a/examples/mujoco_half_cheetah_sac.py +++ /dev/null @@ -1,159 +0,0 @@ -import numpy as np - -import torch -import torch.nn as nn -import torch.optim as optim -import torch.nn.functional as F - -from mushroom_rl.algorithms.actor_critic import SAC -from mushroom_rl.core import Core, Logger -from mushroom_rl.environments.mujoco_envs.half_cheetah import HalfCheetah - -from tqdm import trange - - -class CriticNetwork(nn.Module): - def __init__(self, input_shape, output_shape, n_features, **kwargs): - super().__init__() - - n_input = input_shape[-1] - n_output = output_shape[0] - - self._h1 = nn.Linear(n_input, n_features) - self._h2 = nn.Linear(n_features, n_features) - self._h3 = nn.Linear(n_features, n_output) - - nn.init.xavier_uniform_(self._h1.weight, gain=nn.init.calculate_gain("relu")) - nn.init.xavier_uniform_(self._h2.weight, gain=nn.init.calculate_gain("relu")) - nn.init.xavier_uniform_(self._h3.weight, gain=nn.init.calculate_gain("linear")) - - def forward(self, state, action): - state_action = torch.cat((state.float(), action.float()), dim=1) - features1 = F.relu(self._h1(state_action)) - features2 = F.relu(self._h2(features1)) - q = self._h3(features2) - - return torch.squeeze(q) - - -class ActorNetwork(nn.Module): - def __init__(self, input_shape, output_shape, n_features, **kwargs): - super(ActorNetwork, self).__init__() - - n_input = input_shape[-1] - n_output = output_shape[0] - - self._h1 = nn.Linear(n_input, n_features) - self._h2 = nn.Linear(n_features, n_features) - self._h3 = nn.Linear(n_features, n_output) - - nn.init.xavier_uniform_(self._h1.weight, 
gain=nn.init.calculate_gain("relu")) - nn.init.xavier_uniform_(self._h2.weight, gain=nn.init.calculate_gain("relu")) - nn.init.xavier_uniform_(self._h3.weight, gain=nn.init.calculate_gain("linear")) - - def forward(self, state): - features1 = F.relu(self._h1(torch.squeeze(state, 1).float())) - features2 = F.relu(self._h2(features1)) - a = self._h3(features2) - - return a - - -def experiment(alg, n_epochs, n_steps, n_episodes_test): - np.random.seed() - - logger = Logger(alg.__name__, results_dir=None) - logger.strong_line() - logger.info("Experiment Algorithm: " + alg.__name__) - - # MDP - mdp = HalfCheetah() - - # Settings - initial_replay_size = 5_000 - max_replay_size = 500_000 - batch_size = 256 - n_features = 256 - warmup_transitions = 10_000 - tau = 5e-3 - lr_alpha = 3e-4 - - # Approximator - actor_input_shape = mdp.info.observation_space.shape - actor_mu_params = dict( - network=ActorNetwork, - n_features=n_features, - input_shape=actor_input_shape, - output_shape=mdp.info.action_space.shape, - ) - actor_sigma_params = dict( - network=ActorNetwork, - n_features=n_features, - input_shape=actor_input_shape, - output_shape=mdp.info.action_space.shape, - ) - - actor_optimizer = {"class": optim.Adam, "params": {"lr": 1e-4}} - - critic_input_shape = (actor_input_shape[0] + mdp.info.action_space.shape[0],) - critic_params = dict( - network=CriticNetwork, - optimizer={"class": optim.Adam, "params": {"lr": 3e-4}}, - loss=F.mse_loss, - n_features=n_features, - input_shape=critic_input_shape, - output_shape=(1,), - ) - - # Agent - agent = alg( - mdp.info, - actor_mu_params, - actor_sigma_params, - actor_optimizer, - critic_params, - batch_size, - initial_replay_size, - max_replay_size, - warmup_transitions, - tau, - lr_alpha, - critic_fit_params=None, - ) - - # Algorithm - core = Core(agent, mdp) - - # RUN - dataset = core.evaluate(n_episodes=n_episodes_test, render=False) - - J = np.mean(dataset.discounted_return) - R = np.mean(dataset.undiscounted_return) - E = agent.policy.entropy(dataset.state) - - logger.epoch_info(0, J=J, R=R, entropy=E) - - core.learn( - n_steps=initial_replay_size, n_steps_per_fit=initial_replay_size, quiet=True - ) - - for n in trange(n_epochs, leave=False): - core.learn(n_steps=n_steps, n_steps_per_fit=1, quiet=True) - dataset = core.evaluate(n_episodes=n_episodes_test, render=False, quiet=True) - - J = np.mean(dataset.discounted_return) - R = np.mean(dataset.undiscounted_return) - E = agent.policy.entropy(dataset.state) - - logger.epoch_info(n + 1, J=J, R=R, entropy=E) - - logger.info("Press a button to visualize") - input() - core.evaluate(n_episodes=5, render=True) - - -if __name__ == "__main__": - algs = [SAC] - - for alg in algs: - experiment(alg=alg, n_epochs=50, n_steps=30_000, n_episodes_test=10) diff --git a/examples/mujoco_hopper_ppo.py b/examples/mujoco_hopper_ppo.py new file mode 100644 index 00000000..cabcf636 --- /dev/null +++ b/examples/mujoco_hopper_ppo.py @@ -0,0 +1,121 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim + +from mushroom_rl.algorithms.actor_critic import PPO +from mushroom_rl.core import Core, Logger +from mushroom_rl.environments.mujoco_envs.hopper import Hopper +from mushroom_rl.policy import GaussianTorchPolicy + +from tqdm import trange + + +class Network(nn.Module): + def __init__(self, input_shape, output_shape, n_features, **kwargs): + super(Network, self).__init__() + + n_input = input_shape[-1] + n_output = output_shape[0] + + self._h1 = nn.Linear(n_input, 
n_features) + self._h2 = nn.Linear(n_features, n_features) + self._h3 = nn.Linear(n_features, n_output) + + nn.init.xavier_uniform_( + self._h1.weight, gain=nn.init.calculate_gain("relu") / 10 + ) + nn.init.xavier_uniform_( + self._h2.weight, gain=nn.init.calculate_gain("relu") / 10 + ) + nn.init.xavier_uniform_( + self._h3.weight, gain=nn.init.calculate_gain("linear") / 10 + ) + + def forward(self, state, **kwargs): + features1 = F.relu(self._h1(torch.squeeze(state, 1).float())) + features2 = F.relu(self._h2(features1)) + a = self._h3(features2) + + return a + + +def experiment(n_epochs, n_steps, n_episodes_test, seed=0): + + np.random.seed(seed) + torch.manual_seed(seed) + + logger = Logger(PPO.__name__, results_dir=None) + logger.strong_line() + logger.info("Experiment Algorithm: " + PPO.__name__) + + mdp = Hopper() + + actor_lr = 3e-4 + critic_lr = 3e-4 + n_features = 32 + batch_size = 64 + n_epochs_policy = 10 + eps = 0.2 + lam = 0.95 + std_0 = 1.0 + n_steps_per_fit = 2000 + + critic_params = dict( + network=Network, + optimizer={"class": optim.Adam, "params": {"lr": critic_lr}}, + loss=F.mse_loss, + n_features=n_features, + batch_size=batch_size, + input_shape=mdp.info.observation_space.shape, + output_shape=(1,), + ) + + alg_params = dict( + actor_optimizer={"class": optim.Adam, "params": {"lr": actor_lr}}, + n_epochs_policy=n_epochs_policy, + batch_size=batch_size, + eps_ppo=eps, + lam=lam, + critic_params=critic_params, + ) + + policy_params = dict(std_0=std_0, n_features=n_features) + + policy = GaussianTorchPolicy( + Network, + mdp.info.observation_space.shape, + mdp.info.action_space.shape, + **policy_params, + ) + + agent = PPO(mdp.info, policy, **alg_params) + + core = Core(agent, mdp) + + dataset = core.evaluate(n_episodes=n_episodes_test, render=False) + + J = np.mean(dataset.discounted_return) + R = np.mean(dataset.undiscounted_return) + E = agent.policy.entropy() + + logger.epoch_info(0, J=J, R=R, entropy=E) + + for it in trange(n_epochs, leave=False): + core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit) + dataset = core.evaluate(n_episodes=n_episodes_test, render=False) + + J = np.mean(dataset.discounted_return) + R = np.mean(dataset.undiscounted_return) + E = agent.policy.entropy() + + logger.epoch_info(it + 1, J=J, R=R, entropy=E) + + logger.info("Press a button to visualize") + input() + core.evaluate(n_episodes=5, render=True) + + +if __name__ == "__main__": + experiment(n_epochs=50, n_steps=30000, n_episodes_test=10) diff --git a/examples/mujoco_hopper_sac.py b/examples/mujoco_hopper_sac.py deleted file mode 100644 index 6cf3a995..00000000 --- a/examples/mujoco_hopper_sac.py +++ /dev/null @@ -1,159 +0,0 @@ -import numpy as np - -import torch -import torch.nn as nn -import torch.optim as optim -import torch.nn.functional as F - -from mushroom_rl.algorithms.actor_critic import SAC -from mushroom_rl.core import Core, Logger -from mushroom_rl.environments.mujoco_envs.hopper import Hopper - -from tqdm import trange - - -class CriticNetwork(nn.Module): - def __init__(self, input_shape, output_shape, n_features, **kwargs): - super().__init__() - - n_input = input_shape[-1] - n_output = output_shape[0] - - self._h1 = nn.Linear(n_input, n_features) - self._h2 = nn.Linear(n_features, n_features) - self._h3 = nn.Linear(n_features, n_output) - - nn.init.xavier_uniform_(self._h1.weight, gain=nn.init.calculate_gain("relu")) - nn.init.xavier_uniform_(self._h2.weight, gain=nn.init.calculate_gain("relu")) - nn.init.xavier_uniform_(self._h3.weight, 
gain=nn.init.calculate_gain("linear")) - - def forward(self, state, action): - state_action = torch.cat((state.float(), action.float()), dim=1) - features1 = F.relu(self._h1(state_action)) - features2 = F.relu(self._h2(features1)) - q = self._h3(features2) - - return torch.squeeze(q) - - -class ActorNetwork(nn.Module): - def __init__(self, input_shape, output_shape, n_features, **kwargs): - super(ActorNetwork, self).__init__() - - n_input = input_shape[-1] - n_output = output_shape[0] - - self._h1 = nn.Linear(n_input, n_features) - self._h2 = nn.Linear(n_features, n_features) - self._h3 = nn.Linear(n_features, n_output) - - nn.init.xavier_uniform_(self._h1.weight, gain=nn.init.calculate_gain("relu")) - nn.init.xavier_uniform_(self._h2.weight, gain=nn.init.calculate_gain("relu")) - nn.init.xavier_uniform_(self._h3.weight, gain=nn.init.calculate_gain("linear")) - - def forward(self, state): - features1 = F.relu(self._h1(torch.squeeze(state, 1).float())) - features2 = F.relu(self._h2(features1)) - a = self._h3(features2) - - return a - - -def experiment(alg, n_epochs, n_steps, n_episodes_test): - np.random.seed() - - logger = Logger(alg.__name__, results_dir=None) - logger.strong_line() - logger.info("Experiment Algorithm: " + alg.__name__) - - # MDP - mdp = Hopper() - - # Settings - initial_replay_size = 5_000 - max_replay_size = 500_000 - batch_size = 256 - n_features = 256 - warmup_transitions = 10_000 - tau = 5e-3 - lr_alpha = 3e-4 - - # Approximator - actor_input_shape = mdp.info.observation_space.shape - actor_mu_params = dict( - network=ActorNetwork, - n_features=n_features, - input_shape=actor_input_shape, - output_shape=mdp.info.action_space.shape, - ) - actor_sigma_params = dict( - network=ActorNetwork, - n_features=n_features, - input_shape=actor_input_shape, - output_shape=mdp.info.action_space.shape, - ) - - actor_optimizer = {"class": optim.Adam, "params": {"lr": 1e-4}} - - critic_input_shape = (actor_input_shape[0] + mdp.info.action_space.shape[0],) - critic_params = dict( - network=CriticNetwork, - optimizer={"class": optim.Adam, "params": {"lr": 3e-4}}, - loss=F.mse_loss, - n_features=n_features, - input_shape=critic_input_shape, - output_shape=(1,), - ) - - # Agent - agent = alg( - mdp.info, - actor_mu_params, - actor_sigma_params, - actor_optimizer, - critic_params, - batch_size, - initial_replay_size, - max_replay_size, - warmup_transitions, - tau, - lr_alpha, - critic_fit_params=None, - ) - - # Algorithm - core = Core(agent, mdp) - - # RUN - dataset = core.evaluate(n_episodes=n_episodes_test, render=False) - - J = np.mean(dataset.discounted_return) - R = np.mean(dataset.undiscounted_return) - E = agent.policy.entropy(dataset.state) - - logger.epoch_info(0, J=J, R=R, entropy=E) - - core.learn( - n_steps=initial_replay_size, n_steps_per_fit=initial_replay_size, quiet=True - ) - - for n in trange(n_epochs, leave=False): - core.learn(n_steps=n_steps, n_steps_per_fit=1, quiet=True) - dataset = core.evaluate(n_episodes=n_episodes_test, render=False, quiet=True) - - J = np.mean(dataset.discounted_return) - R = np.mean(dataset.undiscounted_return) - E = agent.policy.entropy(dataset.state) - - logger.epoch_info(n + 1, J=J, R=R, entropy=E) - - logger.info("Press a button to visualize") - input() - core.evaluate(n_episodes=5, render=True) - - -if __name__ == "__main__": - algs = [SAC] - - for alg in algs: - experiment(alg=alg, n_epochs=50, n_steps=30000, n_episodes_test=10) diff --git a/examples/mujoco_walker_2d_ppo.py b/examples/mujoco_walker_2d_ppo.py new file mode 100644 index 
00000000..bfcb2eca --- /dev/null +++ b/examples/mujoco_walker_2d_ppo.py @@ -0,0 +1,121 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim + +from mushroom_rl.algorithms.actor_critic import PPO +from mushroom_rl.core import Core, Logger +from mushroom_rl.environments.mujoco_envs.walker_2d import Walker2D +from mushroom_rl.policy import GaussianTorchPolicy + +from tqdm import trange + + +class Network(nn.Module): + def __init__(self, input_shape, output_shape, n_features, **kwargs): + super(Network, self).__init__() + + n_input = input_shape[-1] + n_output = output_shape[0] + + self._h1 = nn.Linear(n_input, n_features) + self._h2 = nn.Linear(n_features, n_features) + self._h3 = nn.Linear(n_features, n_output) + + nn.init.xavier_uniform_( + self._h1.weight, gain=nn.init.calculate_gain("relu") / 10 + ) + nn.init.xavier_uniform_( + self._h2.weight, gain=nn.init.calculate_gain("relu") / 10 + ) + nn.init.xavier_uniform_( + self._h3.weight, gain=nn.init.calculate_gain("linear") / 10 + ) + + def forward(self, state, **kwargs): + features1 = F.relu(self._h1(torch.squeeze(state, 1).float())) + features2 = F.relu(self._h2(features1)) + a = self._h3(features2) + + return a + + +def experiment(n_epochs, n_steps, n_episodes_test, seed=0): + + np.random.seed(seed) + torch.manual_seed(seed) + + logger = Logger(PPO.__name__, results_dir=None) + logger.strong_line() + logger.info("Experiment Algorithm: " + PPO.__name__) + + mdp = Walker2D() + + actor_lr = 3e-4 + critic_lr = 3e-4 + n_features = 32 + batch_size = 64 + n_epochs_policy = 10 + eps = 0.2 + lam = 0.95 + std_0 = 1.0 + n_steps_per_fit = 2000 + + critic_params = dict( + network=Network, + optimizer={"class": optim.Adam, "params": {"lr": critic_lr}}, + loss=F.mse_loss, + n_features=n_features, + batch_size=batch_size, + input_shape=mdp.info.observation_space.shape, + output_shape=(1,), + ) + + alg_params = dict( + actor_optimizer={"class": optim.Adam, "params": {"lr": actor_lr}}, + n_epochs_policy=n_epochs_policy, + batch_size=batch_size, + eps_ppo=eps, + lam=lam, + critic_params=critic_params, + ) + + policy_params = dict(std_0=std_0, n_features=n_features) + + policy = GaussianTorchPolicy( + Network, + mdp.info.observation_space.shape, + mdp.info.action_space.shape, + **policy_params, + ) + + agent = PPO(mdp.info, policy, **alg_params) + + core = Core(agent, mdp) + + dataset = core.evaluate(n_episodes=n_episodes_test, render=False) + + J = np.mean(dataset.discounted_return) + R = np.mean(dataset.undiscounted_return) + E = agent.policy.entropy() + + logger.epoch_info(0, J=J, R=R, entropy=E) + + for it in trange(n_epochs, leave=False): + core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit) + dataset = core.evaluate(n_episodes=n_episodes_test, render=False) + + J = np.mean(dataset.discounted_return) + R = np.mean(dataset.undiscounted_return) + E = agent.policy.entropy() + + logger.epoch_info(it + 1, J=J, R=R, entropy=E) + + logger.info("Press a button to visualize") + input() + core.evaluate(n_episodes=5, render=True) + + +if __name__ == "__main__": + experiment(n_epochs=50, n_steps=30000, n_episodes_test=10) diff --git a/examples/mujoco_walker_2d_sac.py b/examples/mujoco_walker_2d_sac.py deleted file mode 100644 index 793a0d72..00000000 --- a/examples/mujoco_walker_2d_sac.py +++ /dev/null @@ -1,159 +0,0 @@ -import numpy as np - -import torch -import torch.nn as nn -import torch.optim as optim -import torch.nn.functional as F - -from mushroom_rl.algorithms.actor_critic import 
SAC -from mushroom_rl.core import Core, Logger -from mushroom_rl.environments.mujoco_envs.walker_2d import Walker2D - -from tqdm import trange - - -class CriticNetwork(nn.Module): - def __init__(self, input_shape, output_shape, n_features, **kwargs): - super().__init__() - - n_input = input_shape[-1] - n_output = output_shape[0] - - self._h1 = nn.Linear(n_input, n_features) - self._h2 = nn.Linear(n_features, n_features) - self._h3 = nn.Linear(n_features, n_output) - - nn.init.xavier_uniform_(self._h1.weight, gain=nn.init.calculate_gain("relu")) - nn.init.xavier_uniform_(self._h2.weight, gain=nn.init.calculate_gain("relu")) - nn.init.xavier_uniform_(self._h3.weight, gain=nn.init.calculate_gain("linear")) - - def forward(self, state, action): - state_action = torch.cat((state.float(), action.float()), dim=1) - features1 = F.relu(self._h1(state_action)) - features2 = F.relu(self._h2(features1)) - q = self._h3(features2) - - return torch.squeeze(q) - - -class ActorNetwork(nn.Module): - def __init__(self, input_shape, output_shape, n_features, **kwargs): - super(ActorNetwork, self).__init__() - - n_input = input_shape[-1] - n_output = output_shape[0] - - self._h1 = nn.Linear(n_input, n_features) - self._h2 = nn.Linear(n_features, n_features) - self._h3 = nn.Linear(n_features, n_output) - - nn.init.xavier_uniform_(self._h1.weight, gain=nn.init.calculate_gain("relu")) - nn.init.xavier_uniform_(self._h2.weight, gain=nn.init.calculate_gain("relu")) - nn.init.xavier_uniform_(self._h3.weight, gain=nn.init.calculate_gain("linear")) - - def forward(self, state): - features1 = F.relu(self._h1(torch.squeeze(state, 1).float())) - features2 = F.relu(self._h2(features1)) - a = self._h3(features2) - - return a - - -def experiment(alg, n_epochs, n_steps, n_episodes_test): - np.random.seed() - - logger = Logger(alg.__name__, results_dir=None) - logger.strong_line() - logger.info("Experiment Algorithm: " + alg.__name__) - - # MDP - mdp = Walker2D() - - # Settings - initial_replay_size = 5_000 - max_replay_size = 500_000 - batch_size = 256 - n_features = 256 - warmup_transitions = 10_000 - tau = 5e-3 - lr_alpha = 3e-4 - - # Approximator - actor_input_shape = mdp.info.observation_space.shape - actor_mu_params = dict( - network=ActorNetwork, - n_features=n_features, - input_shape=actor_input_shape, - output_shape=mdp.info.action_space.shape, - ) - actor_sigma_params = dict( - network=ActorNetwork, - n_features=n_features, - input_shape=actor_input_shape, - output_shape=mdp.info.action_space.shape, - ) - - actor_optimizer = {"class": optim.Adam, "params": {"lr": 1e-4}} - - critic_input_shape = (actor_input_shape[0] + mdp.info.action_space.shape[0],) - critic_params = dict( - network=CriticNetwork, - optimizer={"class": optim.Adam, "params": {"lr": 3e-4}}, - loss=F.mse_loss, - n_features=n_features, - input_shape=critic_input_shape, - output_shape=(1,), - ) - - # Agent - agent = alg( - mdp.info, - actor_mu_params, - actor_sigma_params, - actor_optimizer, - critic_params, - batch_size, - initial_replay_size, - max_replay_size, - warmup_transitions, - tau, - lr_alpha, - critic_fit_params=None, - ) - - # Algorithm - core = Core(agent, mdp) - - # RUN - dataset = core.evaluate(n_episodes=n_episodes_test, render=False) - - J = np.mean(dataset.discounted_return) - R = np.mean(dataset.undiscounted_return) - E = agent.policy.entropy(dataset.state) - - logger.epoch_info(0, J=J, R=R, entropy=E) - - core.learn( - n_steps=initial_replay_size, n_steps_per_fit=initial_replay_size, quiet=True - ) - - for n in trange(n_epochs, 
leave=False): - core.learn(n_steps=n_steps, n_steps_per_fit=1, quiet=True) - dataset = core.evaluate(n_episodes=n_episodes_test, render=False, quiet=True) - - J = np.mean(dataset.discounted_return) - R = np.mean(dataset.undiscounted_return) - E = agent.policy.entropy(dataset.state) - - logger.epoch_info(n + 1, J=J, R=R, entropy=E) - - logger.info("Press a button to visualize") - input() - core.evaluate(n_episodes=5, render=True) - - -if __name__ == "__main__": - algs = [SAC] - - for alg in algs: - experiment(alg=alg, n_epochs=50, n_steps=30000, n_episodes_test=10) diff --git a/mushroom_rl/environments/mujoco.py b/mushroom_rl/environments/mujoco.py index e76bec38..a75adf2c 100644 --- a/mushroom_rl/environments/mujoco.py +++ b/mushroom_rl/environments/mujoco.py @@ -11,9 +11,21 @@ class MuJoCo(Environment): Class to create a Mushroom environment using the MuJoCo simulator. """ - def __init__(self, xml_file, actuation_spec, observation_spec, gamma, horizon, timestep=None, n_substeps=1, - n_intermediate_steps=1, additional_data_spec=None, collision_groups=None, max_joint_vel=None, - **viewer_params): + def __init__( + self, + xml_file, + actuation_spec, + observation_spec, + gamma, + horizon, + timestep=None, + n_substeps=1, + n_intermediate_steps=1, + additional_data_spec=None, + collision_groups=None, + max_joint_vel=None, + **viewer_params, + ): """ Constructor. @@ -66,13 +78,17 @@ def __init__(self, xml_file, actuation_spec, observation_spec, gamma, horizon, t # Read the actuation spec and build the mapping between actions and ids # as well as their limits - self._action_indices = self.get_action_indices(self._model, self._data, actuation_spec) + self._action_indices = self.get_action_indices( + self._model, self._data, actuation_spec + ) action_space = self.get_action_space(self._action_indices, self._model) # Read the observation spec to build a mapping at every step. It is # ensured that the values appear in the order they are specified. - self.obs_helper = ObservationHelper(observation_spec, self._model, self._data, max_joint_velocity=max_joint_vel) + self.obs_helper = ObservationHelper( + observation_spec, self._model, self._data, max_joint_velocity=max_joint_vel + ) observation_space = Box(*self.obs_helper.get_obs_limits()) @@ -89,8 +105,12 @@ def __init__(self, xml_file, actuation_spec, observation_spec, gamma, horizon, t for name, geom_names in collision_groups: col_group = list() for geom_name in geom_names: - mj_id = mujoco.mj_name2id(self._model, mujoco.mjtObj.mjOBJ_GEOM, geom_name) - assert mj_id != -1, f"geom \"{geom_name}\" not found! Can't be used for collision-checking." + mj_id = mujoco.mj_name2id( + self._model, mujoco.mjtObj.mjOBJ_GEOM, geom_name + ) + assert ( + mj_id != -1 + ), f'geom "{geom_name}" not found! Can\'t be used for collision-checking.' 
col_group.append(mj_id) self.collision_groups[name] = set(col_group) @@ -119,7 +139,9 @@ def reset(self, obs=None): mujoco.mj_resetData(self._model, self._data) self.setup(obs) - self._obs = self._create_observation(self.obs_helper._build_obs(self._model, self._data)) + self._obs = self._create_observation( + self.obs_helper._build_obs(self._model, self._data) + ) return self._modify_observation(self._obs), {} def step(self, action): @@ -144,16 +166,20 @@ def step(self, action): self._simulation_post_step() if self._recompute_action_per_step: - cur_obs = self._create_observation(self.obs_helper._build_obs(self._model, self._data)) + cur_obs = self._create_observation( + self.obs_helper._build_obs(self._model, self._data) + ) if not self._recompute_action_per_step: - cur_obs = self._create_observation(self.obs_helper._build_obs(self._model, self._data)) + cur_obs = self._create_observation( + self.obs_helper._build_obs(self._model, self._data) + ) self._step_finalize() absorbing = self.is_absorbing(cur_obs) reward = self.reward(self._obs, action, cur_obs, absorbing) - info = self._create_info_dictionary(cur_obs) + info = self._create_info_dictionary(cur_obs, action) self._obs = cur_obs @@ -161,7 +187,9 @@ def step(self, action): def render(self, record=False): if self._viewer is None: - self._viewer = MujocoViewer(self._model, self.dt, record=record, **self._viewer_params) + self._viewer = MujocoViewer( + self._model, self.dt, record=record, **self._viewer_params + ) return self._viewer.render(self._data, record) @@ -199,12 +227,13 @@ def _create_observation(self, obs): """ return obs - def _create_info_dictionary(self, obs): + def _create_info_dictionary(self, obs, action): """ This method can be overridden to create a custom info dictionary. Args: obs (np.ndarray): the generated observation + action (np.ndarray): the action taken Returns: The information dictionary. @@ -305,7 +334,9 @@ def _read_data(self, name): """ data_id, otype = self.additional_data[name] - return np.array(self.obs_helper.get_state(self._model, self._data, data_id, otype)) + return np.array( + self.obs_helper.get_state(self._model, self._data, data_id, otype) + ) def _write_data(self, name, value): """ @@ -324,7 +355,9 @@ def _write_data(self, name, value): elif otype == ObservationType.JOINT_VEL: self._data.joint(data_id).qvel = value else: - data_buffer = self.obs_helper.get_state(self._model, self._data, data_id, otype) + data_buffer = self.obs_helper.get_state( + self._model, self._data, data_id, otype + ) data_buffer[:] = value def _check_collision(self, group1, group2): @@ -381,11 +414,14 @@ def _get_collision_force(self, group1, group2): for con_i in range(0, self._data.ncon): con = self._data.contact[con_i] - if (con.geom1 in ids1 and con.geom2 in ids2 or - con.geom1 in ids2 and con.geom2 in ids1): + if ( + con.geom1 in ids1 + and con.geom2 in ids2 + or con.geom1 in ids2 + and con.geom2 in ids1 + ): - mujoco.mj_contactForce(self._model, self._data, - con_i, c_array) + mujoco.mj_contactForce(self._model, self._data, con_i, c_array) return c_array return c_array @@ -478,7 +514,7 @@ def get_action_space(action_indices, model): Returns: A bounding box for the action space. - """ + """ low = [] high = [] for index in action_indices: @@ -500,14 +536,14 @@ def user_warning_raise_exception(warning): warning: Mujoco warning. 
""" - if 'Pre-allocated constraint buffer is full' in warning: - raise RuntimeError(warning + 'Increase njmax in mujoco XML') - elif 'Pre-allocated contact buffer is full' in warning: - raise RuntimeError(warning + 'Increase njconmax in mujoco XML') - elif 'Unknown warning type' in warning: - raise RuntimeError(warning + 'Check for NaN in simulation.') + if "Pre-allocated constraint buffer is full" in warning: + raise RuntimeError(warning + "Increase njmax in mujoco XML") + elif "Pre-allocated contact buffer is full" in warning: + raise RuntimeError(warning + "Increase njconmax in mujoco XML") + elif "Unknown warning type" in warning: + raise RuntimeError(warning + "Check for NaN in simulation.") else: - raise RuntimeError('Got MuJoCo Warning: ' + warning) + raise RuntimeError("Got MuJoCo Warning: " + warning) @staticmethod def load_model(xml_file): @@ -523,8 +559,9 @@ def load_model(xml_file): """ if type(xml_file) == mjcf.element.RootElement: # load from xml handle - model = mujoco.MjModel.from_xml_string(xml=xml_file.to_xml_string(), - assets=xml_file.get_assets()) + model = mujoco.MjModel.from_xml_string( + xml=xml_file.to_xml_string(), assets=xml_file.get_assets() + ) elif type(xml_file) == str: # load from path model = mujoco.MjModel.from_xml_path(xml_file) @@ -542,9 +579,22 @@ class MultiMuJoCo(MuJoCo): """ - def __init__(self, xml_files, actuation_spec, observation_spec, gamma, horizon, timestep=None, - n_substeps=1, n_intermediate_steps=1, additional_data_spec=None, collision_groups=None, - max_joint_vel=None, random_env_reset=True, **viewer_params): + def __init__( + self, + xml_files, + actuation_spec, + observation_spec, + gamma, + horizon, + timestep=None, + n_substeps=1, + n_intermediate_steps=1, + additional_data_spec=None, + collision_groups=None, + max_joint_vel=None, + random_env_reset=True, + **viewer_params, + ): """ Constructor. @@ -603,7 +653,9 @@ def __init__(self, xml_files, actuation_spec, observation_spec, gamma, horizon, # Read the actuation spec and build the mapping between actions and ids # as well as their limits - self._action_indices = self.get_action_indices(self._model, self._data, actuation_spec) + self._action_indices = self.get_action_indices( + self._model, self._data, actuation_spec + ) action_space = self.get_action_space(self._action_indices, self._model) @@ -611,17 +663,27 @@ def __init__(self, xml_files, actuation_spec, observation_spec, gamma, horizon, for m, d in zip(self._models, self._datas): action_ind = self.get_action_indices(m, d, actuation_spec) action_sp = self.get_action_space(action_ind, m) - if not np.array_equal(action_ind, self._action_indices) or \ - not np.array_equal(action_space.low, action_sp.low) or\ - not np.array_equal(action_space.high, action_sp.high): - raise ValueError("The provided environments differ in the their action spaces. " - "This is not allowed.") + if ( + not np.array_equal(action_ind, self._action_indices) + or not np.array_equal(action_space.low, action_sp.low) + or not np.array_equal(action_space.high, action_sp.high) + ): + raise ValueError( + "The provided environments differ in the their action spaces. " + "This is not allowed." + ) # Read the observation spec to build a mapping at every step. It is # ensured that the values appear in the order they are specified. 
- self.obs_helpers = [ObservationHelper(observation_spec, self._model, self._data, - max_joint_velocity=max_joint_vel) - for m, d in zip(self._models, self._datas)] + self.obs_helpers = [ + ObservationHelper( + observation_spec, + self._model, + self._data, + max_joint_velocity=max_joint_vel, + ) + for m, d in zip(self._models, self._datas) + ] self.obs_helper = self.obs_helpers[self._current_model_idx] observation_space = Box(*self.obs_helper.get_obs_limits()) @@ -629,9 +691,13 @@ def __init__(self, xml_files, actuation_spec, observation_spec, gamma, horizon, # multi envs with different obs limits are now allowed, do sanity check for oh in self.obs_helpers: low, high = self.obs_helper.get_obs_limits() - if not np.array_equal(low, observation_space.low) or not np.array_equal(high, observation_space.high): - raise ValueError("The provided environments differ in the their observation limits. " - "This is not allowed.") + if not np.array_equal(low, observation_space.low) or not np.array_equal( + high, observation_space.high + ): + raise ValueError( + "The provided environments differ in the their observation limits. " + "This is not allowed." + ) # Pre-process the additional data to allow easier writing and reading # to and from arrays in MuJoCo @@ -646,8 +712,12 @@ def __init__(self, xml_files, actuation_spec, observation_spec, gamma, horizon, for name, geom_names in collision_groups: col_group = list() for geom_name in geom_names: - mj_id = mujoco.mj_name2id(self._model, mujoco.mjtObj.mjOBJ_GEOM, geom_name) - assert mj_id != -1, f"geom \"{geom_name}\" not found! Can't be used for collision-checking." + mj_id = mujoco.mj_name2id( + self._model, mujoco.mjtObj.mjOBJ_GEOM, geom_name + ) + assert ( + mj_id != -1 + ), f'geom "{geom_name}" not found! Can\'t be used for collision-checking.' 
col_group.append(mj_id) self.collision_groups[name] = set(col_group) @@ -676,8 +746,11 @@ def reset(self, obs=None): if self._random_env_reset: self._current_model_idx = np.random.randint(0, len(self._models)) else: - self._current_model_idx = self._current_model_idx + 1 \ - if self._current_model_idx < len(self._models) - 1 else 0 + self._current_model_idx = ( + self._current_model_idx + 1 + if self._current_model_idx < len(self._models) - 1 + else 0 + ) self._model = self._models[self._current_model_idx] self._data = self._datas[self._current_model_idx] @@ -687,7 +760,9 @@ def reset(self, obs=None): if self._viewer is not None and self.more_than_one_env: self._viewer.load_new_model(self._model) - self._obs = self._create_observation(self.obs_helper._build_obs(self._model, self._data)) + self._obs = self._create_observation( + self.obs_helper._build_obs(self._model, self._data) + ) return self._modify_observation(self._obs) @property @@ -708,11 +783,11 @@ def _get_env_id_map(current_model_idx, n_models): """ n_models = np.maximum(n_models, 2) - bits_needed = 1+int(np.log((n_models-1))/np.log(2)) + bits_needed = 1 + int(np.log((n_models - 1)) / np.log(2)) id_mask = np.zeros(bits_needed) bin_rep = np.binary_repr(current_model_idx)[::-1] for i, b in enumerate(bin_rep): - idx = bits_needed - 1 - i # reverse idx + idx = bits_needed - 1 - i # reverse idx if int(b): id_mask[idx] = 1.0 else: diff --git a/mushroom_rl/environments/mujoco_envs/air_hockey/double.py b/mushroom_rl/environments/mujoco_envs/air_hockey/double.py index 27308b08..a82f4c7e 100644 --- a/mushroom_rl/environments/mujoco_envs/air_hockey/double.py +++ b/mushroom_rl/environments/mujoco_envs/air_hockey/double.py @@ -5,21 +5,38 @@ from mushroom_rl.environments.mujoco_envs.air_hockey.base import AirHockeyBase from mushroom_rl.utils.mujoco import forward_kinematics + class AirHockeyDouble(AirHockeyBase): """ Base class for two agents air hockey tasks. """ - def __init__(self, gamma=0.99, horizon=120, env_noise=False, obs_noise=False, timestep=1 / 240., - n_intermediate_steps=1, **viewer_params): + def __init__( + self, + gamma=0.99, + horizon=120, + env_noise=False, + obs_noise=False, + timestep=1 / 240.0, + n_intermediate_steps=1, + **viewer_params, + ): """ Constructor. 
""" self.init_state = np.array([-0.9273, 0.9273, np.pi / 2]) - super().__init__(gamma=gamma, horizon=horizon, env_noise=env_noise, n_agents=2, obs_noise=obs_noise, - timestep=timestep, n_intermediate_steps=n_intermediate_steps, **viewer_params) + super().__init__( + gamma=gamma, + horizon=horizon, + env_noise=env_noise, + n_agents=2, + obs_noise=obs_noise, + timestep=timestep, + n_intermediate_steps=n_intermediate_steps, + **viewer_params, + ) # Remove z position and quaternion from puck pos self.obs_helper.remove_obs("puck_pos", 2) @@ -68,14 +85,26 @@ def get_ee(self, robot=1): def _modify_observation(self, obs): new_obs = obs.copy() - self._puck_2d_in_robot_frame(self.obs_helper.get_from_obs(new_obs, "puck_pos"), self.agents[0]["frame"]) - - self._puck_2d_in_robot_frame(self.obs_helper.get_from_obs(new_obs, "puck_vel"), self.agents[0]["frame"], type='vel') - - self._puck_2d_in_robot_frame(self.obs_helper.get_from_obs(new_obs, "robot_2/puck_pos"), self.agents[1]["frame"]) - - self._puck_2d_in_robot_frame(self.obs_helper.get_from_obs(new_obs, "robot_2/puck_vel"), self.agents[1]["frame"], - type='vel') + self._puck_2d_in_robot_frame( + self.obs_helper.get_from_obs(new_obs, "puck_pos"), self.agents[0]["frame"] + ) + + self._puck_2d_in_robot_frame( + self.obs_helper.get_from_obs(new_obs, "puck_vel"), + self.agents[0]["frame"], + type="vel", + ) + + self._puck_2d_in_robot_frame( + self.obs_helper.get_from_obs(new_obs, "robot_2/puck_pos"), + self.agents[1]["frame"], + ) + + self._puck_2d_in_robot_frame( + self.obs_helper.get_from_obs(new_obs, "robot_2/puck_vel"), + self.agents[1]["frame"], + type="vel", + ) if self.obs_noise: noise = np.random.randn(2) * 0.001 @@ -93,10 +122,14 @@ def setup(self, obs): self.has_bounce = False for i in range(3): - self._data.joint("planar_robot_1/joint_" + str(i+1)).qpos = self.init_state[i] + self._data.joint("planar_robot_1/joint_" + str(i + 1)).qpos = ( + self.init_state[i] + ) for i in range(3): - self._data.joint("planar_robot_2/joint_" + str(i+1)).qpos = self.init_state[i] + self._data.joint("planar_robot_2/joint_" + str(i + 1)).qpos = ( + self.init_state[i] + ) super().setup(obs) mujoco.mj_fwdPosition(self._model, self._data) @@ -115,15 +148,17 @@ def _create_observation(self, state): obs = super(AirHockeyDouble, self)._create_observation(state) return np.append(obs, [self.robot_1_hit, self.robot_2_hit, self.has_bounce]) - def _create_info_dictionary(self, obs): - constraints = {"agent-1": {}, "agent-2":{}} + def _create_info_dictionary(self, obs, action): + constraints = {"agent-1": {}, "agent-2": {}} for i, key in enumerate(constraints.keys()): - q_pos = self.obs_helper.get_joint_pos_from_obs(obs)[i * 3: (i+1) * 3] - q_vel = self.obs_helper.get_joint_vel_from_obs(obs)[i * 3: (i+1) * 3] + q_pos = self.obs_helper.get_joint_pos_from_obs(obs)[i * 3 : (i + 1) * 3] + q_vel = self.obs_helper.get_joint_vel_from_obs(obs)[i * 3 : (i + 1) * 3] - x_pos, _ = forward_kinematics(self.robot_model, self.robot_data, q_pos, "planar_robot_1/body_ee") + x_pos, _ = forward_kinematics( + self.robot_model, self.robot_data, q_pos, "planar_robot_1/body_ee" + ) # Translate to table space ee_pos = x_pos + self.agents[0]["frame"][:3, 3] @@ -131,33 +166,48 @@ def _create_info_dictionary(self, obs): # ee_constraint: force the ee to stay within the bounds of the table # 1 Dimension on x direction: x > x_lb # 2 Dimension on y direction: y > y_lb, y < y_ub - x_lb = - (self.env_spec['table']['length'] / 2 + self.env_spec['mallet']['radius']) - y_lb = - (self.env_spec['table']['width'] / 
2 - self.env_spec['mallet']['radius']) - y_ub = (self.env_spec['table']['width'] / 2 - self.env_spec['mallet']['radius']) - - constraints[key]["ee_constraints"] = np.array([-ee_pos[0] + x_lb, - -ee_pos[1] + y_lb, ee_pos[1] - y_ub]) + x_lb = -( + self.env_spec["table"]["length"] / 2 + self.env_spec["mallet"]["radius"] + ) + y_lb = -( + self.env_spec["table"]["width"] / 2 - self.env_spec["mallet"]["radius"] + ) + y_ub = ( + self.env_spec["table"]["width"] / 2 - self.env_spec["mallet"]["radius"] + ) + + constraints[key]["ee_constraints"] = np.array( + [-ee_pos[0] + x_lb, -ee_pos[1] + y_lb, ee_pos[1] - y_ub] + ) # joint_pos_constraint: stay within the robots joint position limits constraints[key]["joint_pos_constraints"] = np.zeros(6) - constraints[key]["joint_pos_constraints"][:3] = q_vel - self.obs_helper.get_joint_pos_limits()[1][i * 3: (i+1) * 3] - constraints[key]["joint_pos_constraints"][3:] = self.obs_helper.get_joint_pos_limits()[0][i * 3: (i+1) * 3] - q_vel + constraints[key]["joint_pos_constraints"][:3] = ( + q_vel - self.obs_helper.get_joint_pos_limits()[1][i * 3 : (i + 1) * 3] + ) + constraints[key]["joint_pos_constraints"][3:] = ( + self.obs_helper.get_joint_pos_limits()[0][i * 3 : (i + 1) * 3] - q_vel + ) # joint_vel_constraint: stay within the robots joint velocity limits constraints[key]["joint_vel_constraints"] = np.zeros(6) - constraints[key]["joint_vel_constraints"][:3] = q_vel - self.obs_helper.get_joint_vel_limits()[1][i * 3: (i+1) * 3] - constraints[key]["joint_vel_constraints"][3:] = self.obs_helper.get_joint_vel_limits()[0][i * 3: (i+1) * 3] - q_vel + constraints[key]["joint_vel_constraints"][:3] = ( + q_vel - self.obs_helper.get_joint_vel_limits()[1][i * 3 : (i + 1) * 3] + ) + constraints[key]["joint_vel_constraints"][3:] = ( + self.obs_helper.get_joint_vel_limits()[0][i * 3 : (i + 1) * 3] - q_vel + ) return constraints -if __name__ == '__main__': +if __name__ == "__main__": env = AirHockeyDouble(env_noise=False, obs_noise=False, n_intermediate_steps=4) env.reset() - R = 0. - J = 0. - gamma = 1. + R = 0.0 + J = 0.0 + gamma = 1.0 steps = 0 while True: action = np.random.randn(6) * 5 @@ -169,8 +219,8 @@ def _create_info_dictionary(self, obs): steps += 1 if done or steps > env.info.horizon: print("J: ", J, " R: ", R) - R = 0. - J = 0. - gamma = 1. + R = 0.0 + J = 0.0 + gamma = 1.0 steps = 0 env.reset() diff --git a/mushroom_rl/environments/mujoco_envs/air_hockey/single.py b/mushroom_rl/environments/mujoco_envs/air_hockey/single.py index 5d138a93..2f5c77e5 100644 --- a/mushroom_rl/environments/mujoco_envs/air_hockey/single.py +++ b/mushroom_rl/environments/mujoco_envs/air_hockey/single.py @@ -11,16 +11,32 @@ class AirHockeySingle(AirHockeyBase): Base class for single agent air hockey tasks. """ - def __init__(self, gamma=0.99, horizon=120, env_noise=False, obs_noise=False, timestep=1 / 240., - n_intermediate_steps=1, **viewer_params): + def __init__( + self, + gamma=0.99, + horizon=120, + env_noise=False, + obs_noise=False, + timestep=1 / 240.0, + n_intermediate_steps=1, + **viewer_params, + ): """ Constructor. 
""" self.init_state = np.array([-0.9273, 0.9273, np.pi / 2]) - super().__init__(gamma=gamma, horizon=horizon, env_noise=env_noise, n_agents=1, obs_noise=obs_noise, - timestep=timestep, n_intermediate_steps=n_intermediate_steps, **viewer_params) + super().__init__( + gamma=gamma, + horizon=horizon, + env_noise=env_noise, + n_agents=1, + obs_noise=obs_noise, + timestep=timestep, + n_intermediate_steps=n_intermediate_steps, + **viewer_params, + ) # Remove z position and quaternion from puck pos self.obs_helper.remove_obs("puck_pos", 2) @@ -73,12 +89,20 @@ def get_ee(self): def _modify_observation(self, obs): new_obs = obs.copy() - self._puck_2d_in_robot_frame(self.obs_helper.get_from_obs(new_obs, "puck_pos"), self.agents[0]["frame"]) + self._puck_2d_in_robot_frame( + self.obs_helper.get_from_obs(new_obs, "puck_pos"), self.agents[0]["frame"] + ) - self._puck_2d_in_robot_frame(self.obs_helper.get_from_obs(new_obs, "puck_vel"), self.agents[0]["frame"], type='vel') + self._puck_2d_in_robot_frame( + self.obs_helper.get_from_obs(new_obs, "puck_vel"), + self.agents[0]["frame"], + type="vel", + ) if self.obs_noise: - self.obs_helper.get_from_obs(new_obs, "puck_pos")[:] += np.random.randn(2) * 0.001 + self.obs_helper.get_from_obs(new_obs, "puck_pos")[:] += ( + np.random.randn(2) * 0.001 + ) return new_obs @@ -87,7 +111,9 @@ def setup(self, obs): self.has_bounce = False for i in range(3): - self._data.joint("planar_robot_1/joint_" + str(i+1)).qpos = self.init_state[i] + self._data.joint("planar_robot_1/joint_" + str(i + 1)).qpos = ( + self.init_state[i] + ) super().setup(obs) mujoco.mj_fwdPosition(self._model, self._data) @@ -103,12 +129,14 @@ def _create_observation(self, state): obs = super(AirHockeySingle, self)._create_observation(state) return np.append(obs, [self.has_hit, self.has_bounce]) - def _create_info_dictionary(self, obs): + def _create_info_dictionary(self, obs, action): constraints = {} q_pos = self.obs_helper.get_joint_pos_from_obs(obs) q_vel = self.obs_helper.get_joint_vel_from_obs(obs) - x_pos, _ = forward_kinematics(self.robot_model, self.robot_data, q_pos, "planar_robot_1/body_ee") + x_pos, _ = forward_kinematics( + self.robot_model, self.robot_data, q_pos, "planar_robot_1/body_ee" + ) # Translate to table space ee_pos = x_pos + self.agents[0]["frame"][:3, 3] @@ -116,22 +144,34 @@ def _create_info_dictionary(self, obs): # ee_constraint: force the ee to stay within the bounds of the table # 1 Dimension on x direction: x > x_lb # 2 Dimension on y direction: y > y_lb, y < y_ub - x_lb = - (self.env_spec['table']['length'] / 2 + self.env_spec['mallet']['radius']) - y_lb = - (self.env_spec['table']['width'] / 2 - self.env_spec['mallet']['radius']) - y_ub = (self.env_spec['table']['width'] / 2 - self.env_spec['mallet']['radius']) - - constraints["ee_constraints"] = np.array([-ee_pos[0] + x_lb, - -ee_pos[1] + y_lb, ee_pos[1] - y_ub]) + x_lb = -( + self.env_spec["table"]["length"] / 2 + self.env_spec["mallet"]["radius"] + ) + y_lb = -( + self.env_spec["table"]["width"] / 2 - self.env_spec["mallet"]["radius"] + ) + y_ub = self.env_spec["table"]["width"] / 2 - self.env_spec["mallet"]["radius"] + + constraints["ee_constraints"] = np.array( + [-ee_pos[0] + x_lb, -ee_pos[1] + y_lb, ee_pos[1] - y_ub] + ) # joint_pos_constraint: stay within the robots joint position limits constraints["joint_pos_constraints"] = np.zeros(6) - constraints["joint_pos_constraints"][:3] = q_vel - self.obs_helper.get_joint_pos_limits()[1] - constraints["joint_pos_constraints"][3:] = 
self.obs_helper.get_joint_pos_limits()[0] - q_vel + constraints["joint_pos_constraints"][:3] = ( + q_vel - self.obs_helper.get_joint_pos_limits()[1] + ) + constraints["joint_pos_constraints"][3:] = ( + self.obs_helper.get_joint_pos_limits()[0] - q_vel + ) # joint_vel_constraint: stay within the robots joint velocity limits constraints["joint_vel_constraints"] = np.zeros(6) - constraints["joint_vel_constraints"][:3] = q_vel - self.obs_helper.get_joint_vel_limits()[1] - constraints["joint_vel_constraints"][3:] = self.obs_helper.get_joint_vel_limits()[0] - q_vel + constraints["joint_vel_constraints"][:3] = ( + q_vel - self.obs_helper.get_joint_vel_limits()[1] + ) + constraints["joint_vel_constraints"][3:] = ( + self.obs_helper.get_joint_vel_limits()[0] - q_vel + ) return constraints - diff --git a/mushroom_rl/environments/mujoco_envs/ant.py b/mushroom_rl/environments/mujoco_envs/ant.py index 3f5c3fe0..bb55342e 100644 --- a/mushroom_rl/environments/mujoco_envs/ant.py +++ b/mushroom_rl/environments/mujoco_envs/ant.py @@ -1,5 +1,5 @@ from pathlib import Path - +from typing import Tuple import numpy as np from mushroom_rl.environments.mujoco import MuJoCo, ObservationType @@ -11,23 +11,24 @@ class Ant(MuJoCo): """ The Ant MuJoCo environment as presented in: "High-Dimensional Continuous Control Using Generalized Advantage Estimation". John Schulman et. al.. 2015. + and implemented in Gymnasium """ def __init__( self, - gamma=0.99, - horizon=1000, - forward_reward_weight=1.0, - ctrl_cost_weight=0.5, - contact_cost_weight=5e-4, - healthy_reward=1.0, - terminate_when_unhealthy=True, - healthy_z_range=(0.2, 1.0), - contact_force_range=(-1.0, 1.0), - reset_noise_scale=0.1, - n_substeps=5, - exclude_current_positions_from_observation=True, - use_contact_forces=False, + gamma: float = 0.99, + horizon: int = 1000, + forward_reward_weight: float = 1.0, + ctrl_cost_weight: float = 0.5, + contact_cost_weight: float = 5e-4, + healthy_reward: float = 1.0, + terminate_when_unhealthy: bool = True, + healthy_z_range: Tuple[float, float] = (0.2, 1.0), + contact_force_range: Tuple[float, float] = (-1.0, 1.0), + reset_noise_scale: float = 0.1, + n_substeps: int = 5, + exclude_current_positions_from_observation: bool = True, + use_contact_forces: bool = False, **viewer_params, ): """ @@ -72,10 +73,8 @@ def __init__( ] additional_data_spec = [ - # Include torso vel for reward calculation ("torso_pos", "torso", ObservationType.BODY_POS), ("torso_vel", "torso", ObservationType.BODY_VEL_WORLD), - ("root_pose", "root", ObservationType.JOINT_POS), ] collision_groups = [ @@ -129,28 +128,26 @@ def _is_finite(self): states = self.get_states() return np.isfinite(states).all() - def _is_within_z_range(self, z_pos): - z_pos = self._read_data("root_pose")[2] + def _is_within_z_range(self): + z_pos = self._read_data("torso_pos")[2] min_z, max_z = self._healthy_z_range return min_z <= z_pos <= max_z - def _is_healthy(self, obs): - is_healthy = self._is_finite() and self._is_within_z_range(obs) + def _is_healthy(self): + is_healthy = self._is_finite() and self._is_within_z_range() return is_healthy def is_absorbing(self, obs): - absorbing = self._terminate_when_unhealthy and not self._is_healthy(obs) + absorbing = self._terminate_when_unhealthy and not self._is_healthy() return absorbing def _get_healthy_reward(self, obs): return ( - self._healthy_reward - if self._is_healthy(obs) or self._terminate_when_unhealthy - else 0.0 - ) + self._terminate_when_unhealthy and self._is_healthy() + ) * self._healthy_reward def 
diff --git a/mushroom_rl/environments/mujoco_envs/half_cheetah.py b/mushroom_rl/environments/mujoco_envs/half_cheetah.py
index 3a9c3937..9fa02cea 100644
--- a/mushroom_rl/environments/mujoco_envs/half_cheetah.py
+++ b/mushroom_rl/environments/mujoco_envs/half_cheetah.py
@@ -55,6 +55,7 @@ def __init__(
         additional_data_spec = [
             ("x_pos", "rootx", ObservationType.JOINT_POS),
+            ("torso_vel", "torso", ObservationType.BODY_VEL_WORLD),
         ]

         self._forward_reward_weight = forward_reward_weight
@@ -92,11 +93,8 @@ def _create_observation(self, obs):
     def is_absorbing(self, obs):
         return False

-    def _get_x_vel(self):
-        return (self._next_x_pos - self._x_pos) / self.dt
-
     def _get_forward_reward(self):
-        forward_reward = self._get_x_vel()
+        forward_reward = self._read_data("torso_vel")[3]
         return self._forward_reward_weight * forward_reward

     def _get_ctrl_cost(self, action):
@@ -109,34 +107,24 @@ def reward(self, obs, action, next_obs, absorbing):
         reward = forward_reward - ctrl_cost
         return reward

+    def _generate_noise(self):
+        self._data.qpos[:] = self._data.qpos + np.random.uniform(
+            -self._reset_noise_scale, self._reset_noise_scale, self._model.nq
+        )
+        self._data.qvel[:] = self._data.qvel + np.random.uniform(
+            -self._reset_noise_scale, self._reset_noise_scale, self._model.nv
+        )
+
     def setup(self, obs):
         super().setup(obs)
-        self._data.qpos[:] = (
-            self._data.qpos
-            + np.random.uniform(
-                -self._reset_noise_scale, self._reset_noise_scale, self._model.nq
-            )
-        ).copy()
-        self._data.qvel[:] = (
-            self._data.qvel
-            + self._reset_noise_scale * np.random.standard_normal(self._model.nv).copy()
-        ).copy()
+
+        self._generate_noise()

         mujoco.mj_forward(self._model, self._data)  # type: ignore

-    def _create_info_dictionary(self, obs):
+    def _create_info_dictionary(self, obs, action):
         info = {
-            "x_vel": self._get_x_vel(),
             "forward_reward": self._get_forward_reward(),
         }
-        # if action is not None:
-        #     info["ctrl_cost"] = self._get_ctrl_cost(action, ctrl_cost_weight=1)
+        info["ctrl_cost"] = self._get_ctrl_cost(action)
         return info
-
-    def _step_init(self, obs, action):
-        super()._step_init(obs, action)
-        self._x_pos = self._read_data("x_pos").item()
-
-    def _step_finalize(self):
-        super()._step_finalize()
-        self._next_x_pos = self._read_data("x_pos").item()
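Editor's note: all of the locomotion environments in this patch now read the forward velocity from the "torso_vel" entry instead of finite-differencing "x_pos" across a step. Assuming ObservationType.BODY_VEL_WORLD follows MuJoCo's 6D body-velocity layout (three angular components followed by three linear components), index 3 is the linear velocity along x, which is why _get_forward_reward indexes element 3. A self-contained sketch of that indexing with a made-up sample reading:

import numpy as np

# Assumed layout: [wx, wy, wz, vx, vy, vz]
torso_vel = np.array([0.0, 0.1, 0.0, 1.3, 0.02, -0.01])

angular, linear = torso_vel[:3], torso_vel[3:]
forward_velocity = torso_vel[3]            # == linear[0], the value used in the reward
forward_reward = 1.0 * forward_velocity    # forward_reward_weight defaults to 1.0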
diff --git a/mushroom_rl/environments/mujoco_envs/hopper.py b/mushroom_rl/environments/mujoco_envs/hopper.py
index 491b6332..a079a0e6 100644
--- a/mushroom_rl/environments/mujoco_envs/hopper.py
+++ b/mushroom_rl/environments/mujoco_envs/hopper.py
@@ -53,6 +53,7 @@ def __init__(
         additional_data_spec = [
             ("x_pos", "rootx", ObservationType.JOINT_POS),
+            ("torso_vel", "torso", ObservationType.BODY_VEL_WORLD),
         ]

         self._forward_reward_weight = forward_reward_weight
@@ -87,32 +88,51 @@ def _modify_mdp_info(self, mdp_info):
     def _create_observation(self, obs):
         obs = super()._create_observation(obs)
+        # Clip the qvels
         obs[5:] = np.clip(obs[5:], -10, 10)
         if not self._exclude_current_positions_from_observation:
             x_pos = self._read_data("x_pos")
             obs = np.concatenate([obs, x_pos])
         return obs

+    def _is_within_state_range(self) -> bool:
+        """Check if state variables are within the healthy range."""
+        state = self.get_states()
+        min_state, max_state = self._healthy_state_range
+        return np.all(np.logical_and(min_state < state, state < max_state))
+
+    def _is_within_z_range(self, obs: np.ndarray) -> bool:
+        """Check if Z position of torso is within the healthy range."""
+        z_pos = self.obs_helper.get_from_obs(obs, "z_pos").item()
+        min_z, max_z = self._healthy_z_range
+        return min_z < z_pos < max_z
+
+    def _is_within_angle_range(self, obs):
+        """Check if Y angle of torso is within the healthy range."""
+        y_angle = self.obs_helper.get_from_obs(obs, "y_pos").item()
+        min_angle, max_angle = self._healthy_angle_range
+        return min_angle < y_angle < max_angle
+
+    def _is_healthy(self, obs: np.ndarray) -> bool:
+        """Check if the agent is healthy."""
+        is_within_state_range = self._is_within_state_range()
+        is_within_z_range = self._is_within_z_range(obs)
+        is_within_angle_range = self._is_within_angle_range(obs)
+        return is_within_state_range and is_within_z_range and is_within_angle_range
+
     def is_absorbing(self, obs):
         """Return True if the agent is unhealthy and terminate_when_unhealthy is True."""
         return self._terminate_when_unhealthy and not self._is_healthy(obs)

-    def _get_x_vel(self):
-        x_pos = self._x_pos
-        next_x_pos = self._next_x_pos
-        return (next_x_pos - x_pos) / self.dt
-
-    def _get_forward_reward(self):
-        forward_reward = self._get_x_vel()
-        return self._forward_reward_weight * forward_reward
-
     def _get_healthy_reward(self, obs):
         """Return the healthy reward if the agent is healthy, else 0."""
         return (
-            self._healthy_reward
-            if self._is_healthy(obs) or self._terminate_when_unhealthy
-            else 0
-        )
+            self._is_healthy(obs) or self._terminate_when_unhealthy
+        ) * self._healthy_reward
+
+    def _get_forward_reward(self):
+        forward_reward = self._read_data("torso_vel")[3]
+        return self._forward_reward_weight * forward_reward

     def _get_ctrl_cost(self, action):
         """Return the control cost."""
@@ -126,69 +146,29 @@ def reward(self, obs, action, next_obs, absorbing):
         reward = healthy_reward + forward_reward - ctrl_cost
         return reward

-    def _is_healthy(self, obs: np.ndarray) -> bool:
-        """Check if the agent is healthy."""
-        is_within_state_range = self._is_within_state_range(obs)
-        is_within_z_range = self._is_within_z_range(obs)
-        is_within_angle_range = self._is_within_angle_range(obs)
-        return is_within_state_range and is_within_z_range and is_within_angle_range
-
-    def _is_within_state_range(self, obs: np.ndarray) -> bool:
-        """Check if state variables are within the healthy range."""
-        idx = 1 if self._exclude_current_positions_from_observation else 2
-        state_values = obs[idx:]
-        min_state, max_state = self._healthy_state_range
-        return all(min_state < value.item() < max_state for value in state_values)
-
-    def _is_within_z_range(self, obs: np.ndarray) -> bool:
-        """Check if Z position of torso is within the healthy range."""
-        z_pos = self.obs_helper.get_from_obs(obs, "z_pos").item()
-        min_z, max_z = self._healthy_z_range
-        return min_z < z_pos < max_z
-
-    def _is_within_angle_range(self, obs):
-        """Check if Y angle of torso is within the healthy range."""
-        y_angle = self.obs_helper.get_from_obs(obs, "y_pos").item()
-        min_angle, max_angle = self._healthy_angle_range
-        return min_angle < y_angle < max_angle
-
-    def _get_error(
-        self,
-    ):
-        x_vel = self._get_x_vel()
-        return np.abs(x_vel - self.target_vel)
+    def _generate_noise(self):
+        self._data.qpos[:] = self._data.qpos + np.random.uniform(
+            -self._reset_noise_scale, self._reset_noise_scale, self._model.nq
+        )
+        self._data.qvel[:] = self._data.qvel + np.random.uniform(
+            -self._reset_noise_scale, self._reset_noise_scale, self._model.nv
+        )

     def setup(self, obs):
         super().setup(obs)

-        self._data.qpos[:] = (
-            self._data.qpos
-            + np.random.uniform(
-                -self._reset_noise_scale, self._reset_noise_scale, self._model.nq
-            )
-        ).copy()
-        self._data.qvel[:] = (
-            self._data.qvel
-            + np.random.uniform(
-                -self._reset_noise_scale, self._reset_noise_scale, self._model.nv
-            )
-        ).copy()
+        self._generate_noise()

         mujoco.mj_forward(self._model, self._data)  # type: ignore

-    def _create_info_dictionary(self, obs):
+    def _create_info_dictionary(self, obs, action):
         info = {
             "healthy_reward": self._get_healthy_reward(obs),
             "forward_reward": self._get_forward_reward(),
         }
-        # if action is not None:
-        #     info["ctrl_cost"] = self._get_ctrl_cost(action, ctrl_cost_weight=1)
+        info["ctrl_cost"] = self._get_ctrl_cost(action)
         return info

-    def _step_init(self, obs, action):
-        super()._step_init(obs, action)
-        self._x_pos = self._read_data("x_pos").item()
-
-    def _step_finalize(self):
-        super()._step_finalize()
-        self._next_x_pos = self._read_data("x_pos").item()
+    def get_states(self):
+        """Return the position and velocity joint states of the model"""
+        return np.concatenate([self._data.qpos.flat, self._data.qvel.flat])
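Editor's note: the Hopper health check now compares the full [qpos, qvel] vector from get_states against healthy_state_range in a single vectorized comparison, replacing the removed per-element loop over the observation. A numpy-only illustration with made-up values (the range shown is the Gymnasium Hopper default):

import numpy as np

healthy_state_range = (-100.0, 100.0)
state = np.concatenate([np.array([0.0, 1.25, -0.003, 0.02, 0.01, 0.4]),   # stand-in qpos
                        np.array([0.1, -0.2, 0.0, 0.05, 0.01, 0.0])])     # stand-in qvel

min_state, max_state = healthy_state_range
is_within_state_range = np.all(np.logical_and(min_state < state, state < max_state))
print(is_within_state_range)   # True for these values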
diff --git a/mushroom_rl/environments/mujoco_envs/walker_2d.py b/mushroom_rl/environments/mujoco_envs/walker_2d.py
index 204147b1..0025355d 100644
--- a/mushroom_rl/environments/mujoco_envs/walker_2d.py
+++ b/mushroom_rl/environments/mujoco_envs/walker_2d.py
@@ -25,7 +25,6 @@ def __init__(
         reset_noise_scale=5e-3,
         exclude_current_positions_from_observation=True,
         n_substeps=4,
-        normalize_reward=False,
         **viewer_params,
     ):
         """
@@ -66,6 +65,7 @@ def __init__(
         additional_data_spec = [
             ("x_pos", "rootx", ObservationType.JOINT_POS),
+            ("torso_vel", "torso", ObservationType.BODY_VEL_WORLD),
         ]

         self._forward_reward_weight = forward_reward_weight
@@ -99,39 +99,42 @@ def _modify_mdp_info(self, mdp_info):
     def _create_observation(self, obs):
         obs = super()._create_observation(obs)
+        # Clip qvels
         obs[9:] = np.clip(obs[9:], -10, 10)
         if not self._exclude_current_positions_from_observation:
             x_pos = self._read_data("x_pos")
             obs = np.concatenate([obs, x_pos])
         return obs

+    def _is_within_z_range(self, obs):
+        """Check if Z position of torso is within the healthy range."""
+        z_position = self.obs_helper.get_from_obs(obs, "z_pos").item()
+        min_z, max_z = self._healthy_z_range
+        return min_z < z_position < max_z
+
+    def _is_within_angle_range(self, obs):
+        """Check if y-angle of torso is within the healthy range."""
+        y_angle = self.obs_helper.get_from_obs(obs, "y_pos").item()
+        min_angle, max_angle = self._healthy_angle_range
+        return min_angle < y_angle < max_angle
+
     def is_absorbing(self, obs):
-        """Return True if the agent is unhealthy and terminate_when_unhealthy is True."""
         return self._terminate_when_unhealthy and not self._is_healthy(obs)

     def _is_healthy(self, obs):
-        """Check if the agent is healthy."""
-        is_finite = self._is_finite(obs)
         is_within_z_range = self._is_within_z_range(obs)
         is_within_angle_range = self._is_within_angle_range(obs)
-        return is_finite and is_within_z_range and is_within_angle_range
-
-    def _get_x_vel(self):
-        x_pos = self._x_pos
-        next_x_pos = self._next_x_pos
-        return (next_x_pos - x_pos) / self.dt
-
-    def _get_forward_reward(self):
-        forward_reward = self._get_x_vel()
-        return self._forward_reward_weight * forward_reward
+        return is_within_z_range and is_within_angle_range

     def _get_healthy_reward(self, obs):
         """Return the healthy reward if the agent is healthy, else 0."""
         return (
-            self._healthy_reward
-            if self._is_healthy(obs) or self._terminate_when_unhealthy
-            else 0
-        )
+            self._is_healthy(obs) or self._terminate_when_unhealthy
+        ) * self._healthy_reward
+
+    def _get_forward_reward(self):
+        forward_reward = self._read_data("torso_vel")[3]
+        return self._forward_reward_weight * forward_reward

     def _get_ctrl_cost(self, action):
         """Return the control cost."""
@@ -145,57 +148,25 @@ def reward(self, obs, action, next_obs, absorbing):
         reward = healthy_reward + forward_reward - ctrl_cost
         return reward

-    def _is_finite(self, obs):
-        """Check if the observation is finite."""
-        return np.isfinite(obs).all()
-
-    def _is_within_z_range(self, obs):
-        """Check if Z position of torso is within the healthy range."""
-        z_position = self.obs_helper.get_from_obs(obs, "z_pos").item()
-        min_z, max_z = self._healthy_z_range
-        return min_z < z_position < max_z
-
-    def _is_within_angle_range(self, obs):
-        """Check if y-angle of torso is within the healthy range."""
-        y_angle = self.obs_helper.get_from_obs(obs, "y_pos").item()
-        min_angle, max_angle = self._healthy_angle_range
-        return min_angle < y_angle < max_angle
-
-    def _get_error(self):
-        x_vel = self._get_x_vel()
-        return np.abs(self.target_vel - x_vel)
+    def _generate_noise(self):
+        self._data.qpos[:] = self._data.qpos + np.random.uniform(
+            -self._reset_noise_scale, self._reset_noise_scale, self._model.nq
+        )
+        self._data.qvel[:] = self._data.qvel + np.random.uniform(
+            -self._reset_noise_scale, self._reset_noise_scale, self._model.nv
+        )

     def setup(self, obs):
         super().setup(obs)

-        self._data.qpos[:] = (
-            self._data.qpos
-            + np.random.uniform(
-                -self._reset_noise_scale, self._reset_noise_scale, self._model.nq
-            )
-        ).copy()
-        self._data.qvel[:] = (
-            self._data.qvel
-            + np.random.uniform(
-                -self._reset_noise_scale, self._reset_noise_scale, self._model.nv
-            )
-        ).copy()
+        self._generate_noise()

         mujoco.mj_forward(self._model, self._data)  # type: ignore

-    def _create_info_dictionary(self, obs):
+    def _create_info_dictionary(self, obs, action):
         info = {
             "healthy_reward": self._get_healthy_reward(obs),
             "forward_reward": self._get_forward_reward(),
         }
-        # if action is not None:
-        #     info["ctrl_cost"] = self._get_ctrl_cost(action, ctrl_cost_weight=1)
+        info["ctrl_cost"] = self._get_ctrl_cost(action)
         return info
-
-    def _step_init(self, obs, action):
-        super()._step_init(obs, action)
-        self._x_pos = self._read_data("x_pos").item()
-
-    def _step_finalize(self):
-        super()._step_finalize()
-        self._next_x_pos = self._read_data("x_pos").item()
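Editor's note: HalfCheetah, Hopper and Walker2d now share the same _generate_noise helper, which perturbs qpos and qvel with i.i.d. uniform noise at reset (the old HalfCheetah code used Gaussian noise for qvel). A numpy-only sketch of that operation; the sizes and scale are made-up stand-ins for model.nq, model.nv and reset_noise_scale:

import numpy as np

reset_noise_scale = 5e-3
nq, nv = 9, 9                      # stand-ins for model.nq / model.nv

qpos = np.zeros(nq)                # stand-in for data.qpos
qvel = np.zeros(nv)                # stand-in for data.qvel

qpos = qpos + np.random.uniform(-reset_noise_scale, reset_noise_scale, nq)
qvel = qvel + np.random.uniform(-reset_noise_scale, reset_noise_scale, nv)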