Merge branch 'dev2.0' of github.com:MushroomRL/mushroom-rl into dev2.0
boris-il-forte committed Oct 24, 2023
2 parents 8587546 + 9fb0b3a commit a379ecd
Showing 52 changed files with 319 additions and 336 deletions.
@@ -14,8 +14,7 @@ class COPDAC_Q(Agent):
     Silver D. et al.. 2014.
     """
-    def __init__(self, mdp_info, policy, mu, alpha_theta, alpha_omega, alpha_v,
-                 value_function_features=None, policy_features=None):
+    def __init__(self, mdp_info, policy, mu, alpha_theta, alpha_omega, alpha_v, value_function_features=None):
         """
         Constructor.
@@ -27,7 +26,6 @@ def __init__(self, mdp_info, policy, mu, alpha_theta, alpha_omega, alpha_v,
             alpha_v ([float, Parameter]): learning rate for the value function;
             value_function_features (Features, None): features used by the value
                 function approximator;
-            policy_features (Features, None): features used by the policy.
         """
         self._mu = mu
@@ -59,19 +57,18 @@ def __init__(self, mdp_info, policy, mu, alpha_theta, alpha_omega, alpha_v,
             _A='mushroom'
         )

-        super().__init__(mdp_info, policy, policy_features)
+        super().__init__(mdp_info, policy)

-    def fit(self, dataset, **info):
+    def fit(self, dataset):
         for step in dataset:
             s, a, r, ss, absorbing, _ = step

-            s_phi = self.phi(s) if self.phi is not None else s
             s_psi = self._psi(s) if self._psi is not None else s
             ss_psi = self._psi(ss) if self._psi is not None else ss

             q_next = self._V(ss_psi).item() if not absorbing else 0

-            grad_mu_s = np.atleast_2d(self._mu.diff(s_phi))
+            grad_mu_s = np.atleast_2d(self._mu.diff(s))
             omega = self._A.get_weights()

             delta = r + self.mdp_info.gamma * q_next - self._Q(s, a)
@@ -96,8 +93,7 @@ def _Q(self, state, action):
                               action)).item()

     def _nu(self, state, action):
-        state_phi = self.phi(state) if self.phi is not None else state
-        grad_mu = np.atleast_2d(self._mu.diff(state_phi))
-        delta = action - self._mu(state_phi)
+        grad_mu = np.atleast_2d(self._mu.diff(state))
+        delta = action - self._mu(state)

         return delta.dot(grad_mu)
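
For orientation, the quantities this hunk manipulates can be restated in plain NumPy. The sketch below is illustrative only: the function name and the callables `mu`, `mu_diff`, `q_fn` are placeholders, not MushroomRL API. It simply mirrors the TD error and the compatible-feature term of _nu(state, action), which after this change is computed on the raw state instead of phi(state).

import numpy as np

def copdac_q_terms(s, a, r, q_next, gamma, mu, mu_diff, q_fn):
    # mu(s): deterministic action; mu_diff(s): Jacobian of mu w.r.t. its weights;
    # q_fn(s, a): scalar critic estimate. All three are placeholder callables.
    grad_mu_s = np.atleast_2d(mu_diff(s))       # gradient taken on the raw state, as in the new code
    delta = r + gamma * q_next - q_fn(s, a)     # TD error used by the critic/actor updates above
    nu = (a - mu(s)).dot(grad_mu_s)             # compatible features, cf. _nu(state, action)
    return delta, grad_mu_s, nu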
@@ -14,18 +14,15 @@ class StochasticAC(Agent):
     Degris T. et al.. 2012.
     """
-    def __init__(self, mdp_info, policy, alpha_theta, alpha_v, lambda_par=.9,
-                 value_function_features=None, policy_features=None):
+    def __init__(self, mdp_info, policy, alpha_theta, alpha_v, lambda_par=.9, value_function_features=None):
         """
         Constructor.
         Args:
             alpha_theta ([float, Parameter]): learning rate for policy update;
             alpha_v ([float, Parameter]): learning rate for the value function;
             lambda_par ([float, Parameter], .9): trace decay parameter;
-            value_function_features (Features, None): features used by the
-                value function approximator;
-            policy_features (Features, None): features used by the policy.
+            value_function_features (Features, None): features used by the value function approximator.
         """
         self._psi = value_function_features
@@ -35,15 +32,14 @@ def __init__(self, mdp_info, policy, alpha_theta, alpha_v, lambda_par=.9,

         self._lambda = to_parameter(lambda_par)

-        super().__init__(mdp_info, policy, policy_features)
+        super().__init__(mdp_info, policy)

         if self._psi is not None:
             input_shape = (self._psi.size,)
         else:
             input_shape = mdp_info.observation_space.shape

-        self._V = Regressor(LinearApproximator, input_shape=input_shape,
-                            output_shape=(1,))
+        self._V = Regressor(LinearApproximator, input_shape=input_shape, output_shape=(1,))

         self._e_v = np.zeros(self._V.weights_size)
         self._e_theta = np.zeros(self.policy.weights_size)
@@ -64,17 +60,16 @@ def episode_start(self, episode_info):

         return super().episode_start(episode_info)

-    def fit(self, dataset, **info):
+    def fit(self, dataset):
         for step in dataset:
             s, a, r, ss, absorbing, _ = step

-            s_phi = self.phi(s) if self.phi is not None else s
             s_psi = self._psi(s) if self._psi is not None else s
             ss_psi = self._psi(ss) if self._psi is not None else ss

             v_next = self._V(ss_psi) if not absorbing else 0

-            delta = self._compute_td_n_traces(a, r, v_next, s_psi, s_phi)
+            delta = self._compute_td_n_traces(s, a, r, v_next, s_psi)

             # Update value function
             delta_v = self._alpha_v(s, a) * delta * self._e_v
@@ -86,14 +81,13 @@ def fit(self, dataset, **info):
             theta_new = self.policy.get_weights() + delta_theta
             self.policy.set_weights(theta_new)

-    def _compute_td_n_traces(self, a, r, v_next, s_psi, s_phi):
+    def _compute_td_n_traces(self, s, a, r, v_next, s_psi):
         # Compute TD error
         delta = r + self.mdp_info.gamma * v_next - self._V(s_psi)

         # Update traces
         self._e_v = self.mdp_info.gamma * self._lambda() * self._e_v + s_psi
-        self._e_theta = self.mdp_info.gamma * self._lambda() * \
-            self._e_theta + self.policy.diff_log(s_phi, a)
+        self._e_theta = self.mdp_info.gamma * self._lambda() * self._e_theta + self.policy.diff_log(s, a)

         return delta

@@ -105,31 +99,28 @@ class StochasticAC_AVG(StochasticAC):
     Degris T. et al.. 2012.
     """
-    def __init__(self, mdp_info, policy, alpha_theta, alpha_v, alpha_r,
-                 lambda_par=.9, value_function_features=None,
-                 policy_features=None):
+    def __init__(self, mdp_info, policy, alpha_theta, alpha_v, alpha_r, lambda_par=.9, value_function_features=None):
         """
         Constructor.
         Args:
             alpha_r (Parameter): learning rate for the reward trace.
         """
-        super().__init__(mdp_info, policy, alpha_theta, alpha_v, lambda_par,
-                         value_function_features, policy_features)
+        super().__init__(mdp_info, policy, alpha_theta, alpha_v, lambda_par, value_function_features)

         self._alpha_r = to_parameter(alpha_r)
         self._r_bar = 0

         self._add_save_attr(_alpha_r='mushroom', _r_bar='primitive')

-    def _compute_td_n_traces(self, a, r, v_next, s_psi, s_phi):
+    def _compute_td_n_traces(self, s, a, r, v_next, s_psi):
         # Compute TD error
         delta = r - self._r_bar + v_next - self._V(s_psi)

         # Update traces
         self._r_bar += self._alpha_r() * delta
         self._e_v = self._lambda() * self._e_v + s_psi
-        self._e_theta = self._lambda() * self._e_theta + self.policy.diff_log(s_phi, a)
+        self._e_theta = self._lambda() * self._e_theta + self.policy.diff_log(s, a)

         return delta
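
The `_compute_td_n_traces` signature change only swaps `s_phi` for the raw state `s`; the trace arithmetic itself is unchanged. For reference, here is a standalone NumPy restatement of that TD(lambda) bookkeeping in the discounted setting (all names are placeholders, not library API):

import numpy as np

def td_lambda_traces(e_v, e_theta, s_psi, diff_log_s_a, r, v_s, v_next, gamma, lam):
    # Mirrors StochasticAC._compute_td_n_traces for the discounted case.
    delta = r + gamma * v_next - v_s                  # TD error on the value-feature representation
    e_v = gamma * lam * e_v + s_psi                   # critic eligibility trace
    e_theta = gamma * lam * e_theta + diff_log_s_a    # actor trace, driven by diff_log(s, a)
    return delta, e_v, e_theta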
@@ -56,7 +56,7 @@ def __init__(self, mdp_info, policy, actor_optimizer, critic_params,

         super().__init__(mdp_info, policy, actor_optimizer, policy.parameters())

-    def fit(self, dataset, **info):
+    def fit(self, dataset):
         state, action, reward, next_state, absorbing, _ = dataset.parse(to='torch')

         v, adv = compute_advantage_montecarlo(self._V, state, next_state,
@@ -97,7 +97,7 @@ def __init__(self, mdp_info, policy_class, policy_params,

         super().__init__(mdp_info, policy, actor_optimizer, policy_parameters)

-    def fit(self, dataset, **info):
+    def fit(self, dataset):
         self._replay_memory.add(dataset)
         if self._replay_memory.initialized:
             state, action, reward, next_state, absorbing, _ =\
@@ -42,7 +42,7 @@ def __init__(self, mdp_info, policy, actor_optimizer, parameters):

         super().__init__(mdp_info, policy)

-    def fit(self, dataset, **info):
+    def fit(self, dataset):
         """
         Fit step.
mushroom_rl/algorithms/actor_critic/deep_actor_critic/ppo.py (4 changes: 2 additions & 2 deletions)
@@ -68,9 +68,9 @@ def __init__(self, mdp_info, policy, actor_optimizer, critic_params,
             _iter='primitive'
         )

-        super().__init__(mdp_info, policy, None)
+        super().__init__(mdp_info, policy)

-    def fit(self, dataset, **info):
+    def fit(self, dataset):
         state, action, reward, next_state, absorbing, last = dataset.parse(to='torch')

         v_target, adv = compute_gae(self._V, state, next_state, reward, absorbing, last,
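
PPO (above) and TRPO (further below) both hand the parsed tensors to `compute_gae` right after this point. For readers unfamiliar with that helper, the sketch below is a generic NumPy version of generalized advantage estimation following the standard recursion; it is not the library's implementation, only an assumption about what a compatible routine looks like.

import numpy as np

def gae(v, v_next, reward, absorbing, last, gamma, lam):
    # v, v_next: value estimates for s_t and s_{t+1}; reward: r_t;
    # absorbing: 1 if s_{t+1} is terminal; last: 1 if t closes an episode/rollout.
    adv = np.zeros_like(v)
    gae_t = 0.0
    for t in reversed(range(len(v))):
        delta = reward[t] + gamma * v_next[t] * (1 - absorbing[t]) - v[t]
        gae_t = delta + gamma * lam * gae_t * (1 - last[t])
        adv[t] = gae_t
    return v + adv, adv   # (v_target, advantage), matching the call site above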
@@ -78,7 +78,7 @@ def divide_state_to_env_hidden_batch(self, states):
         assert len(states.shape) > 1, "This function only divides batches of states."
         return states[:, 0:self._dim_env_state], states[:, self._dim_env_state:]

-    def fit(self, dataset, **info):
+    def fit(self, dataset):
         obs, act, r, obs_next, absorbing, last = dataset.parse(to='torch')
         policy_state, policy_next_state = dataset.parse_policy_state(to='torch')
         obs_seq, policy_state_seq, act_seq, obs_next_seq, policy_next_state_seq, lengths = \
@@ -280,7 +280,7 @@ def __init__(self, mdp_info, actor_mu_params, actor_sigma_params, actor_optimize

         super().__init__(mdp_info, policy, actor_optimizer, policy_parameters)

-    def fit(self, dataset, **info):
+    def fit(self, dataset):
         self._replay_memory.add(dataset)
         if self._replay_memory.initialized:
             state, action, reward, next_state, absorbing, _ = self._replay_memory.get(self._batch_size())
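
The off-policy fit methods above share one pattern: push the incoming dataset into a replay memory, then sample a minibatch once enough transitions have been stored. The minimal stand-in below illustrates that add/initialized/get protocol; it is not MushroomRL's ReplayMemory, and its constructor arguments and step layout are assumptions.

import random
from collections import deque

class MinimalReplayMemory:
    # Illustrative ring buffer with the same interface shape as the code above.
    def __init__(self, initial_size, max_size):
        self._initial_size = initial_size
        self._buffer = deque(maxlen=max_size)

    def add(self, dataset):
        # dataset is assumed to iterate over (s, a, r, ss, absorbing, last) steps
        self._buffer.extend(dataset)

    @property
    def initialized(self):
        return len(self._buffer) >= self._initial_size

    def get(self, batch_size):
        batch = random.sample(list(self._buffer), batch_size)
        return tuple(map(list, zip(*batch)))   # transpose to (states, actions, rewards, ...)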
mushroom_rl/algorithms/actor_critic/deep_actor_critic/trpo.py (4 changes: 2 additions & 2 deletions)
@@ -79,9 +79,9 @@ def __init__(self, mdp_info, policy, critic_params, ent_coeff=0., max_kl=.001, l
             _iter='primitive'
         )

-        super().__init__(mdp_info, policy, None)
+        super().__init__(mdp_info, policy)

-    def fit(self, dataset, **info):
+    def fit(self, dataset):
         state, action, reward, next_state, absorbing, last = dataset.parse(to='torch')

         v_target, adv = compute_gae(self._V, state, next_state, reward, absorbing, last,
@@ -10,7 +10,7 @@ class BlackBoxOptimization(Agent):
     do not rely on stochastic and differentiable policies.
     """
-    def __init__(self, mdp_info, distribution, policy, features=None):
+    def __init__(self, mdp_info, distribution, policy):
         """
         Constructor.
@@ -20,32 +20,25 @@ def __init__(self, mdp_info, distribution, policy, features=None):
         """
         self.distribution = distribution
-        self._theta_list = list()

-        self._add_save_attr(distribution='mushroom', _theta_list='pickle')
+        self._add_save_attr(distribution='mushroom')

-        super().__init__(mdp_info, policy, features)
+        super().__init__(mdp_info, policy, is_episodic=True)

     def episode_start(self, episode_info):
         theta = self.distribution.sample()
-        self._theta_list.append(theta)
         self.policy.set_weights(theta)

-        return super().episode_start(episode_info)
+        policy_state, _ = super().episode_start(episode_info)
+
+        return policy_state, theta

-    def fit(self, dataset, **info):
-        Jep = dataset.compute_J(self.mdp_info.gamma)
-
-        Jep = np.array(Jep)
-        theta = np.array(self._theta_list)
+    def fit(self, dataset):
+        Jep = np.array(dataset.discounted_return)
+        theta = np.array(dataset.theta_list)

         self._update(Jep, theta)

-        self._theta_list = list()
-
-    def stop(self):
-        self._theta_list = list()
-
     def _update(self, Jep, theta):
         """
         Function that implements the update routine of distribution parameters.
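
Concrete subclasses implement `_update(Jep, theta)`, which now receives the per-episode discounted returns and the parameter vectors sampled in `episode_start` straight from the dataset instead of an internal `_theta_list`. As a toy illustration of the hook's contract (not one of the algorithms touched by this commit), an exponentially weighted mean over the sampled parameters could look like this, using only NumPy so nothing is assumed about the Distribution API:

import numpy as np

def weighted_mean_update(Jep, theta, beta=1.0):
    # Jep: (n_episodes,) discounted returns, as provided by dataset.discounted_return
    # theta: (n_episodes, n_params) sampled parameters, as provided by dataset.theta_list
    w = np.exp(beta * (Jep - np.max(Jep)))   # RWR-style exponential weighting, numerically stabilized
    w /= np.sum(w)
    return w @ theta                         # return-weighted mean parameter vector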
@@ -9,7 +9,7 @@ class ConstrainedREPS(BlackBoxOptimization):
     Episodic Relative Entropy Policy Search algorithm with constrained policy update.
     """
-    def __init__(self, mdp_info, distribution, policy, eps, kappa, features=None):
+    def __init__(self, mdp_info, distribution, policy, eps, kappa):
         """
         Constructor.
@@ -28,7 +28,7 @@ def __init__(self, mdp_info, distribution, policy, eps, kappa, features=None):
         self._add_save_attr(_eps='mushroom')
         self._add_save_attr(_kappa='mushroom')

-        super().__init__(mdp_info, distribution, policy, features)
+        super().__init__(mdp_info, distribution, policy)

     def _update(self, Jep, theta):
         eta_start = np.ones(1)
@@ -17,7 +17,7 @@ class MORE(BlackBoxOptimization):
     Peters, Jan R and Lau, Nuno and Pualo Reis, Luis and Neumann, Gerhard. 2015.
     """
-    def __init__(self, mdp_info, distribution, policy, eps, h0=-75, kappa=0.99, features=None):
+    def __init__(self, mdp_info, distribution, policy, eps, h0=-75, kappa=0.99):
         """
         Constructor.
@@ -53,7 +53,7 @@ def __init__(self, mdp_info, distribution, policy, eps, h0=-75, kappa=0.99, feat
         self._add_save_attr(h0='primitive')
         self._add_save_attr(kappa='primitive')

-        super().__init__(mdp_info, distribution, policy, features)
+        super().__init__(mdp_info, distribution, policy)

     def _update(self, Jep, theta):
@@ -10,8 +10,7 @@ class PGPE(BlackBoxOptimization):
     Peters J.. 2013.
     """
-    def __init__(self, mdp_info, distribution, policy, optimizer,
-                 features=None):
+    def __init__(self, mdp_info, distribution, policy, optimizer):
         """
         Constructor.
@@ -23,7 +22,7 @@ def __init__(self, mdp_info, distribution, policy, optimizer,

         self._add_save_attr(optimizer='mushroom')

-        super().__init__(mdp_info, distribution, policy, features)
+        super().__init__(mdp_info, distribution, policy)

     def _update(self, Jep, theta):
         baseline_num_list = list()
@@ -13,7 +13,7 @@ class REPS(BlackBoxOptimization):
     Peters J.. 2013.
     """
-    def __init__(self, mdp_info, distribution, policy, eps, features=None):
+    def __init__(self, mdp_info, distribution, policy, eps):
         """
         Constructor.
@@ -27,7 +27,7 @@ def __init__(self, mdp_info, distribution, policy, eps, features=None):

         self._add_save_attr(_eps='mushroom')

-        super().__init__(mdp_info, distribution, policy, features)
+        super().__init__(mdp_info, distribution, policy)

     def _update(self, Jep, theta):
         eta_start = np.ones(1)
@@ -11,7 +11,7 @@ class RWR(BlackBoxOptimization):
     Peters J.. 2013.
     """
-    def __init__(self, mdp_info, distribution, policy, beta, features=None):
+    def __init__(self, mdp_info, distribution, policy, beta):
         """
         Constructor.
@@ -24,7 +24,7 @@ def __init__(self, mdp_info, distribution, policy, beta, features=None):

         self._add_save_attr(_beta='mushroom')

-        super().__init__(mdp_info, distribution, policy, features)
+        super().__init__(mdp_info, distribution, policy)

     def _update(self, Jep, theta):
         Jep -= np.max(Jep)
mushroom_rl/algorithms/policy_search/policy_gradient/enac.py (5 changes: 2 additions & 3 deletions)
@@ -10,16 +10,15 @@ class eNAC(PolicyGradient):
     Peters J. 2013.
     """
-    def __init__(self, mdp_info, policy, optimizer, features=None,
-                 critic_features=None):
+    def __init__(self, mdp_info, policy, optimizer, critic_features=None):
         """
         Constructor.
         Args:
             critic_features (Features, None): features used by the critic.
         """
-        super().__init__(mdp_info, policy, optimizer, features)
+        super().__init__(mdp_info, policy, optimizer)
         self.phi_c = critic_features

         self.sum_grad_log = None