Commit: Merge branch 'dev'

boris-il-forte committed Oct 26, 2023
2 parents fc51ddd + ce77935 commit 71e6de4
Showing 51 changed files with 1,459 additions and 709 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,9 +1,11 @@
*.DS_store
build/
dist/
examples/mushroom_rl_recordings/
examples/habitat/Replica-Dataset
examples/habitat/data
mushroom_rl.egg-info/
mushroom_rl_recordings/
.idea/
*.pyc
*.pyd
2 changes: 2 additions & 0 deletions Makefile
@@ -13,3 +13,5 @@ upload:
clean:
rm -rf dist
rm -rf build

.NOTPARALLEL:
1 change: 0 additions & 1 deletion TODO.txt
@@ -17,7 +17,6 @@ Approximator:
* add neural network generator

For Mushroom 2.0:
* Record method in environment and record option in the core
* Simplify Regressor interface: drop GenericRegressor, remove facade pattern
* vectorize basis functions and simplify interface, simplify facade pattern
* remove custom save for plotting, use Serializable
15 changes: 10 additions & 5 deletions docs/source/tutorials/code/room_env.py
@@ -7,7 +7,7 @@


class RoomToyEnv(Environment):
def __init__(self, size=5., goal=[2.5, 2.5], goal_radius=0.6):
def __init__(self, size=5., goal=(2.5, 2.5), goal_radius=0.6):

# Save important environment information
self._size = size
@@ -23,7 +23,7 @@ def __init__(self, size=5., goal=[2.5, 2.5], goal_radius=0.6):
observation_space = Box(0, size, shape)

# Create the MDPInfo structure, needed by the environment interface
mdp_info = MDPInfo(observation_space, action_space, gamma=0.99, horizon=100)
mdp_info = MDPInfo(observation_space, action_space, gamma=0.99, horizon=100, dt=0.1)

super().__init__(mdp_info)

@@ -86,15 +86,20 @@ def step(self, action):
# Return all the information + empty dictionary (used to pass additional information)
return self._state, reward, absorbing, {}

def render(self):
def render(self, record=False):
# Draw a red circle for the agent
self._viewer.circle(self._state, 0.1, color=(255, 0, 0))

# Draw a green circle for the goal
self._viewer.circle(self._goal, self._goal_radius, color=(0, 255, 0))

# Display the image for 0.1 seconds
self._viewer.display(0.1)
# Get the image if the record flag is set to true
frame = self._viewer.get_frame() if record else None

# Display the image for the control time (0.1 seconds)
self._viewer.display(self.info.dt)

return frame


# Register the class
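The render method above now returns a frame when called with record=True, and the display time is taken from the dt stored in MDPInfo. As a rough illustration (not part of the commit), the frames could be collected manually along these lines, assuming the environment has been registered under the name 'RoomToyEnv' as in the tutorial code and that random actions are acceptable for the example:

    # Illustrative sketch: gather frames through the new record flag of render().
    import numpy as np
    from mushroom_rl.core import Environment

    env = Environment.make('RoomToyEnv')
    env.reset()

    frames = []
    for _ in range(50):
        # Sample a random action within the environment's action bounds
        action = np.random.uniform(env.info.action_space.low, env.info.action_space.high)
        state, reward, absorbing, _ = env.step(action)
        frames.append(env.render(record=True))  # the frame is returned only when record=True
        if absorbing:
            env.reset()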
10 changes: 5 additions & 5 deletions docs/source/tutorials/tutorials.5_environments.rst
@@ -159,14 +159,14 @@ visualization tool for 2D Reinforcement Learning algorithms. The viewer class ha
simply draw two circles representing the agent and the goal area:

.. literalinclude:: code/room_env.py
:lines: 89-97
:lines: 89-102

For more information about the viewer, refer to the class documentation.

To conclude our environment, it's also possible to register it as specified in the previous section of this tutorial:

.. literalinclude:: code/room_env.py
:lines: 100-101
:lines: 105-106


Learning in the toy environment
@@ -179,17 +179,17 @@ We first import all necessary classes and utilities, then we construct the envir
reproducibility).

.. literalinclude:: code/room_env.py
:lines: 103-116
:lines: 108-121

We now proceed to create the agent policy, which is a linear policy using tile features, similar
to the one used in the Mountain Car experiment from R. Sutton's book.

.. literalinclude:: code/room_env.py
:lines: 118-139
:lines: 123-144

Finally, using the ``Core`` class, we set up an RL experiment. We first evaluate the initial policy for three episodes on the
environment. Then we learn the task using the algorithm built above for 20000 steps.
In the end, we evaluate the learned policy for three more episodes.

.. literalinclude:: code/room_env.py
:lines: 141-
:lines: 146-
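The experiment described in the paragraph above amounts to a handful of Core calls. The following sketch mirrors that description using the standard MushroomRL Core interface; the agent and env objects are assumed to be built as in the tutorial script, and fitting once per step (n_steps_per_fit=1) is an assumption, not something stated in this diff:

    # Sketch of the evaluate / learn / evaluate loop described above.
    from mushroom_rl.core import Core

    core = Core(agent, env)                         # agent and env from the tutorial script
    core.evaluate(n_episodes=3, render=True)        # evaluate the initial policy
    core.learn(n_steps=20000, n_steps_per_fit=1)    # learn the task for 20000 steps
    core.evaluate(n_episodes=3, render=True)        # evaluate the learned policy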
2 changes: 1 addition & 1 deletion mushroom_rl/__init__.py
@@ -1 +1 @@
__version__ = '1.9.2'
__version__ = '1.10.0'
151 changes: 58 additions & 93 deletions mushroom_rl/algorithms/actor_critic/deep_actor_critic/sac.py
@@ -17,27 +17,20 @@

class SACPolicy(Policy):
"""
Class used to implement the policy used by the Soft Actor-Critic
algorithm. The policy is a Gaussian policy squashed by a tanh.
This class implements the compute_action_and_log_prob and the
compute_action_and_log_prob_t methods, that are fundamental for
the internals calculations of the SAC algorithm.
Class used to implement the policy used by the Soft Actor-Critic algorithm.
The policy is a Gaussian policy squashed by a tanh. This class implements the compute_action_and_log_prob and the
compute_action_and_log_prob_t methods, which are fundamental for the internal calculations of the SAC algorithm.
"""
def __init__(self, mu_approximator, sigma_approximator, min_a, max_a,
log_std_min, log_std_max):
def __init__(self, mu_approximator, sigma_approximator, min_a, max_a, log_std_min, log_std_max):
"""
Constructor.
Args:
mu_approximator (Regressor): a regressor computing mean in given a
state;
sigma_approximator (Regressor): a regressor computing the variance
in given a state;
min_a (np.ndarray): a vector specifying the minimum action value
for each component;
max_a (np.ndarray): a vector specifying the maximum action value
for each component.
mu_approximator (Regressor): a regressor computing the mean given a state;
sigma_approximator (Regressor): a regressor computing the variance given a state;
min_a (np.ndarray): a vector specifying the minimum action value for each component;
max_a (np.ndarray): a vector specifying the maximum action value for each component.
log_std_min ([float, Parameter]): min value for the policy log std;
log_std_max ([float, Parameter]): max value for the policy log std.
@@ -78,8 +71,7 @@ def draw_action(self, state):

def compute_action_and_log_prob(self, state):
"""
Function that samples actions using the reparametrization trick and
the log probability for such actions.
Function that samples actions using the reparametrization trick and computes the log probability of such actions.
Args:
state (np.ndarray): the state in which the action is sampled.
Expand All @@ -93,17 +85,15 @@ def compute_action_and_log_prob(self, state):

def compute_action_and_log_prob_t(self, state, compute_log_prob=True):
"""
Function that samples actions using the reparametrization trick and,
optionally, the log probability for such actions.
Function that samples actions using the reparametrization trick and, optionally, computes the log probability of such
actions.
Args:
state (np.ndarray): the state in which the action is sampled;
compute_log_prob (bool, True): whether to compute the log
probability or not.
compute_log_prob (bool, True): whether to compute the log probability or not.
Returns:
The actions sampled and, optionally, the log probability as torch
tensors.
The actions sampled and, optionally, the log probability as torch tensors.
"""
dist = self.distribution(state)
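For context on the reparametrization trick these docstrings refer to, the snippet below is a standalone sketch (not the library code) of the usual squashed-Gaussian sampling with the tanh change-of-variables correction; it assumes dist is a diagonal torch.distributions.Normal and min_a/max_a are action-bound tensors:

    import torch

    def sample_squashed(dist, min_a, max_a, eps=1e-6):
        # Reparametrized sample: gradients flow through the sampling noise
        a_raw = dist.rsample()
        a_squashed = torch.tanh(a_raw)
        # Log-probability with the tanh change-of-variables correction
        log_prob = dist.log_prob(a_raw).sum(dim=-1)
        log_prob -= torch.log(1. - a_squashed.pow(2) + eps).sum(dim=-1)
        # Rescale the squashed action from (-1, 1) to [min_a, max_a]
        action = a_squashed * (max_a - min_a) / 2. + (max_a + min_a) / 2.
        return action, log_prob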
@@ -123,8 +113,7 @@ def distribution(self, state):
Compute the policy distribution in the given states.
Args:
state (np.ndarray): the set of states where the distribution is
computed.
state (np.ndarray): the set of states where the distribution is computed.
Returns:
The torch distribution for the provided states.
@@ -147,19 +136,15 @@ def entropy(self, state=None):
The value of the entropy of the policy.
"""

return torch.mean(self.distribution(state).entropy()).detach().cpu().numpy().item()

def reset(self):
pass
_, log_pi = self.compute_action_and_log_prob(state)
return -log_pi.mean()

def set_weights(self, weights):
"""
Setter.
Args:
weights (np.ndarray): the vector of the new weights to be used by
the policy.
weights (np.ndarray): the vector of the new weights to be used by the policy.
"""
mu_weights = weights[:self._mu_approximator.weights_size]
@@ -190,8 +175,7 @@ def use_cuda(self):

def parameters(self):
"""
Returns the trainable policy parameters, as expected by torch
optimizers.
Returns the trainable policy parameters, as expected by torch optimizers.
Returns:
List of parameters to be optimized.
@@ -208,38 +192,30 @@ class SAC(DeepAC):
Haarnoja T. et al., 2019.
"""
def __init__(self, mdp_info, actor_mu_params, actor_sigma_params,
actor_optimizer, critic_params, batch_size,
initial_replay_size, max_replay_size, warmup_transitions, tau,
lr_alpha, log_std_min=-20, log_std_max=2, target_entropy=None,
critic_fit_params=None):
def __init__(self, mdp_info, actor_mu_params, actor_sigma_params, actor_optimizer, critic_params, batch_size,
initial_replay_size, max_replay_size, warmup_transitions, tau, lr_alpha, use_log_alpha_loss=False,
log_std_min=-20, log_std_max=2, target_entropy=None, critic_fit_params=None):
"""
Constructor.
Args:
actor_mu_params (dict): parameters of the actor mean approximator
to build;
actor_sigma_params (dict): parameters of the actor sigm
approximator to build;
actor_optimizer (dict): parameters to specify the actor
optimizer algorithm;
critic_params (dict): parameters of the critic approximator to
build;
actor_mu_params (dict): parameters of the actor mean approximator to build;
actor_sigma_params (dict): parameters of the actor sigma approximator to build;
actor_optimizer (dict): parameters to specify the actor optimizer algorithm;
critic_params (dict): parameters of the critic approximator to build;
batch_size ((int, Parameter)): the number of samples in a batch;
initial_replay_size (int): the number of samples to collect before
starting the learning;
max_replay_size (int): the maximum number of samples in the replay
memory;
warmup_transitions ([int, Parameter]): number of samples to accumulate in the
replay memory to start the policy fitting;
initial_replay_size (int): the number of samples to collect before starting the learning;
max_replay_size (int): the maximum number of samples in the replay memory;
warmup_transitions ([int, Parameter]): number of samples to accumulate in the replay memory to start the
policy fitting;
tau ([float, Parameter]): value of coefficient for soft updates;
lr_alpha ([float, Parameter]): Learning rate for the entropy coefficient;
use_log_alpha_loss (bool, False): whether to use the original implementation loss or the one from the
paper;
log_std_min ([float, Parameter]): Min value for the policy log std;
log_std_max ([float, Parameter]): Max value for the policy log std;
target_entropy (float, None): target entropy for the policy, if
None a default value is computed ;
critic_fit_params (dict, None): parameters of the fitting algorithm
of the critic approximator.
target_entropy (float, None): target entropy for the policy, if None a default value is computed;
critic_fit_params (dict, None): parameters of the fitting algorithm of the critic approximator.
"""
self._critic_fit_params = dict() if critic_fit_params is None else critic_fit_params
@@ -248,6 +224,8 @@ def __init__(self, mdp_info, actor_mu_params, actor_sigma_params,
self._warmup_transitions = to_parameter(warmup_transitions)
self._tau = to_parameter(tau)

self._use_log_alpha_loss = use_log_alpha_loss

if target_entropy is None:
self._target_entropy = -np.prod(mdp_info.action_space.shape).astype(np.float32)
else:
@@ -261,25 +239,16 @@ def __init__(self, mdp_info, actor_mu_params, actor_sigma_params,
critic_params['n_models'] = 2

target_critic_params = deepcopy(critic_params)
self._critic_approximator = Regressor(TorchApproximator,
**critic_params)
self._target_critic_approximator = Regressor(TorchApproximator,
**target_critic_params)

actor_mu_approximator = Regressor(TorchApproximator,
**actor_mu_params)
actor_sigma_approximator = Regressor(TorchApproximator,
**actor_sigma_params)

policy = SACPolicy(actor_mu_approximator,
actor_sigma_approximator,
mdp_info.action_space.low,
mdp_info.action_space.high,
log_std_min,
log_std_max)

self._init_target(self._critic_approximator,
self._target_critic_approximator)
self._critic_approximator = Regressor(TorchApproximator, **critic_params)
self._target_critic_approximator = Regressor(TorchApproximator, **target_critic_params)

actor_mu_approximator = Regressor(TorchApproximator, **actor_mu_params)
actor_sigma_approximator = Regressor(TorchApproximator, **actor_sigma_params)

policy = SACPolicy(actor_mu_approximator, actor_sigma_approximator, mdp_info.action_space.low,
mdp_info.action_space.high, log_std_min, log_std_max)

self._init_target(self._critic_approximator, self._target_critic_approximator)

self._log_alpha = torch.tensor(0., dtype=torch.float32)

@@ -302,6 +271,7 @@ def __init__(self, mdp_info, actor_mu_params, actor_sigma_params,
_replay_memory='mushroom',
_critic_approximator='mushroom',
_target_critic_approximator='mushroom',
_use_log_alpha_loss='primitive',
_log_alpha='torch',
_alpha_optim='torch'
)
@@ -311,8 +281,7 @@ def fit(self, dataset, **info):
def fit(self, dataset, **info):
self._replay_memory.add(dataset)
if self._replay_memory.initialized:
state, action, reward, next_state, absorbing, _ = \
self._replay_memory.get(self._batch_size())
state, action, reward, next_state, absorbing, _ = self._replay_memory.get(self._batch_size())

if self._replay_memory.size > self._warmup_transitions():
action_new, log_prob = self.policy.compute_action_and_log_prob_t(state)
@@ -323,39 +292,35 @@ def fit(self, dataset, **info):
q_next = self._next_q(next_state, absorbing)
q = reward + self.mdp_info.gamma * q_next

self._critic_approximator.fit(state, action, q,
**self._critic_fit_params)
self._critic_approximator.fit(state, action, q, **self._critic_fit_params)

self._update_target(self._critic_approximator,
self._target_critic_approximator)
self._update_target(self._critic_approximator, self._target_critic_approximator)

def _loss(self, state, action_new, log_prob):
q_0 = self._critic_approximator(state, action_new,
output_tensor=True, idx=0)
q_1 = self._critic_approximator(state, action_new,
output_tensor=True, idx=1)
q_0 = self._critic_approximator(state, action_new, output_tensor=True, idx=0)
q_1 = self._critic_approximator(state, action_new, output_tensor=True, idx=1)

q = torch.min(q_0, q_1)

return (self._alpha * log_prob - q).mean()

def _update_alpha(self, log_prob):
alpha_loss = - (self._log_alpha * (log_prob + self._target_entropy)).mean()
if self._use_log_alpha_loss:
alpha_loss = - (self._log_alpha * (log_prob + self._target_entropy)).mean()
else:
alpha_loss = - (self._alpha * (log_prob + self._target_entropy)).mean()
self._alpha_optim.zero_grad()
alpha_loss.backward()
self._alpha_optim.step()

def _next_q(self, next_state, absorbing):
"""
Args:
next_state (np.ndarray): the states where next action has to be
evaluated;
absorbing (np.ndarray): the absorbing flag for the states in
``next_state``.
next_state (np.ndarray): the states where the next action has to be evaluated;
absorbing (np.ndarray): the absorbing flag for the states in ``next_state``.
Returns:
Action-values returned by the critic for ``next_state`` and the
action returned by the actor.
Action-values returned by the critic for ``next_state`` and the action returned by the actor.
"""
a, log_prob_next = self.policy.compute_action_and_log_prob(next_state)
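Beyond the re-wrapping, two behavioural changes in this file are worth noting: SACPolicy.entropy() now returns a Monte-Carlo estimate based on sampled log-probabilities instead of the closed-form Gaussian entropy, and the new use_log_alpha_loss constructor flag (default False) chooses how the entropy-coefficient loss is written. The standalone sketch below (not the library code) spells out the two alpha-loss variants and the entropy estimate; the log_prob tensor is a placeholder for the log-probabilities of actions sampled by the policy:

    import torch

    log_alpha = torch.zeros(1, requires_grad=True)
    target_entropy = -4.0            # e.g. -dim(action_space), as in the default choice
    log_prob = -torch.rand(256)      # placeholder log-probabilities of sampled actions

    # use_log_alpha_loss=True: the loss is written in terms of log(alpha),
    # as in the original authors' implementation.
    alpha_loss_log = -(log_alpha * (log_prob + target_entropy)).mean()

    # use_log_alpha_loss=False (the default here): the loss is written in terms of
    # alpha itself, conventionally parametrized as exp(log_alpha), matching the paper.
    alpha_loss = -(log_alpha.exp() * (log_prob + target_entropy)).mean()

    # Monte-Carlo entropy estimate now used by SACPolicy.entropy():
    # the negative mean log-probability of sampled actions.
    entropy_estimate = -log_prob.mean()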