Merge branch 'dev2.0' of github.com:MushroomRL/mushroom-rl into dev2.0
boris-il-forte committed Oct 24, 2023
2 parents 8587546 + 9fb0b3a commit a379ecd
Showing 52 changed files with 319 additions and 336 deletions.
@@ -14,8 +14,7 @@ class COPDAC_Q(Agent):
     Silver D. et al.. 2014.
     """
-    def __init__(self, mdp_info, policy, mu, alpha_theta, alpha_omega, alpha_v,
-                 value_function_features=None, policy_features=None):
+    def __init__(self, mdp_info, policy, mu, alpha_theta, alpha_omega, alpha_v, value_function_features=None):
         """
         Constructor.
@@ -27,7 +26,6 @@ def __init__(self, mdp_info, policy, mu, alpha_theta, alpha_omega, alpha_v,
             alpha_v ([float, Parameter]): learning rate for the value function;
             value_function_features (Features, None): features used by the value
                 function approximator;
-            policy_features (Features, None): features used by the policy.
         """
         self._mu = mu
@@ -59,19 +57,18 @@ def __init__(self, mdp_info, policy, mu, alpha_theta, alpha_omega, alpha_v,
             _A='mushroom'
         )

-        super().__init__(mdp_info, policy, policy_features)
+        super().__init__(mdp_info, policy)

-    def fit(self, dataset, **info):
+    def fit(self, dataset):
         for step in dataset:
             s, a, r, ss, absorbing, _ = step

-            s_phi = self.phi(s) if self.phi is not None else s
             s_psi = self._psi(s) if self._psi is not None else s
             ss_psi = self._psi(ss) if self._psi is not None else ss

             q_next = self._V(ss_psi).item() if not absorbing else 0

-            grad_mu_s = np.atleast_2d(self._mu.diff(s_phi))
+            grad_mu_s = np.atleast_2d(self._mu.diff(s))
             omega = self._A.get_weights()

             delta = r + self.mdp_info.gamma * q_next - self._Q(s, a)
@@ -96,8 +93,7 @@ def _Q(self, state, action):
                               action)).item()

     def _nu(self, state, action):
-        state_phi = self.phi(state) if self.phi is not None else state
-        grad_mu = np.atleast_2d(self._mu.diff(state_phi))
-        delta = action - self._mu(state_phi)
+        grad_mu = np.atleast_2d(self._mu.diff(state))
+        delta = action - self._mu(state)

         return delta.dot(grad_mu)
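
For orientation, the quantities this hunk manipulates can be restated in plain NumPy. The sketch below is illustrative only: the function name and the callables `mu`, `mu_diff`, `q_fn` are placeholders, not MushroomRL API. It simply mirrors the TD error and the compatible-feature term of _nu(state, action), which after this change is computed on the raw state instead of phi(state).

import numpy as np

def copdac_q_terms(s, a, r, q_next, gamma, mu, mu_diff, q_fn):
    # mu(s): deterministic action; mu_diff(s): Jacobian of mu w.r.t. its weights;
    # q_fn(s, a): scalar critic estimate. All three are placeholder callables.
    grad_mu_s = np.atleast_2d(mu_diff(s))       # gradient taken on the raw state, as in the new code
    delta = r + gamma * q_next - q_fn(s, a)     # TD error used by the critic/actor updates above
    nu = (a - mu(s)).dot(grad_mu_s)             # compatible features, cf. _nu(state, action)
    return delta, grad_mu_s, nu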
@@ -14,18 +14,15 @@ class StochasticAC(Agent):
     Degris T. et al.. 2012.
     """
-    def __init__(self, mdp_info, policy, alpha_theta, alpha_v, lambda_par=.9,
-                 value_function_features=None, policy_features=None):
+    def __init__(self, mdp_info, policy, alpha_theta, alpha_v, lambda_par=.9, value_function_features=None):
         """
         Constructor.
         Args:
             alpha_theta ([float, Parameter]): learning rate for policy update;
             alpha_v ([float, Parameter]): learning rate for the value function;
             lambda_par ([float, Parameter], .9): trace decay parameter;
-            value_function_features (Features, None): features used by the
-                value function approximator;
-            policy_features (Features, None): features used by the policy.
+            value_function_features (Features, None): features used by the value function approximator.
         """
         self._psi = value_function_features
@@ -35,15 +32,14 @@ def __init__(self, mdp_info, policy, alpha_theta, alpha_v, lambda_par=.9,

         self._lambda = to_parameter(lambda_par)

-        super().__init__(mdp_info, policy, policy_features)
+        super().__init__(mdp_info, policy)

         if self._psi is not None:
             input_shape = (self._psi.size,)
         else:
             input_shape = mdp_info.observation_space.shape

-        self._V = Regressor(LinearApproximator, input_shape=input_shape,
-                            output_shape=(1,))
+        self._V = Regressor(LinearApproximator, input_shape=input_shape, output_shape=(1,))

         self._e_v = np.zeros(self._V.weights_size)
         self._e_theta = np.zeros(self.policy.weights_size)
@@ -64,17 +60,16 @@ def episode_start(self, episode_info):

         return super().episode_start(episode_info)

-    def fit(self, dataset, **info):
+    def fit(self, dataset):
         for step in dataset:
             s, a, r, ss, absorbing, _ = step

-            s_phi = self.phi(s) if self.phi is not None else s
             s_psi = self._psi(s) if self._psi is not None else s
             ss_psi = self._psi(ss) if self._psi is not None else ss

             v_next = self._V(ss_psi) if not absorbing else 0

-            delta = self._compute_td_n_traces(a, r, v_next, s_psi, s_phi)
+            delta = self._compute_td_n_traces(s, a, r, v_next, s_psi)

             # Update value function
             delta_v = self._alpha_v(s, a) * delta * self._e_v
@@ -86,14 +81,13 @@ def fit(self, dataset, **info):
             theta_new = self.policy.get_weights() + delta_theta
             self.policy.set_weights(theta_new)

-    def _compute_td_n_traces(self, a, r, v_next, s_psi, s_phi):
+    def _compute_td_n_traces(self, s, a, r, v_next, s_psi):
         # Compute TD error
         delta = r + self.mdp_info.gamma * v_next - self._V(s_psi)

         # Update traces
         self._e_v = self.mdp_info.gamma * self._lambda() * self._e_v + s_psi
-        self._e_theta = self.mdp_info.gamma * self._lambda() * \
-            self._e_theta + self.policy.diff_log(s_phi, a)
+        self._e_theta = self.mdp_info.gamma * self._lambda() * self._e_theta + self.policy.diff_log(s, a)

         return delta

@@ -105,31 +99,28 @@ class StochasticAC_AVG(StochasticAC):
     Degris T. et al.. 2012.
     """
-    def __init__(self, mdp_info, policy, alpha_theta, alpha_v, alpha_r,
-                 lambda_par=.9, value_function_features=None,
-                 policy_features=None):
+    def __init__(self, mdp_info, policy, alpha_theta, alpha_v, alpha_r, lambda_par=.9, value_function_features=None):
         """
         Constructor.
         Args:
             alpha_r (Parameter): learning rate for the reward trace.
         """
-        super().__init__(mdp_info, policy, alpha_theta, alpha_v, lambda_par,
-                         value_function_features, policy_features)
+        super().__init__(mdp_info, policy, alpha_theta, alpha_v, lambda_par, value_function_features)

         self._alpha_r = to_parameter(alpha_r)
         self._r_bar = 0

         self._add_save_attr(_alpha_r='mushroom', _r_bar='primitive')

-    def _compute_td_n_traces(self, a, r, v_next, s_psi, s_phi):
+    def _compute_td_n_traces(self, s, a, r, v_next, s_psi):
         # Compute TD error
         delta = r - self._r_bar + v_next - self._V(s_psi)

         # Update traces
         self._r_bar += self._alpha_r() * delta
         self._e_v = self._lambda() * self._e_v + s_psi
-        self._e_theta = self._lambda() * self._e_theta + self.policy.diff_log(s_phi, a)
+        self._e_theta = self._lambda() * self._e_theta + self.policy.diff_log(s, a)

         return delta
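
The `_compute_td_n_traces` signature change only swaps `s_phi` for the raw state `s`; the trace arithmetic itself is unchanged. For reference, here is a standalone NumPy restatement of that TD(lambda) bookkeeping in the discounted setting (all names are placeholders, not library API):

import numpy as np

def td_lambda_traces(e_v, e_theta, s_psi, diff_log_s_a, r, v_s, v_next, gamma, lam):
    # Mirrors StochasticAC._compute_td_n_traces for the discounted case.
    delta = r + gamma * v_next - v_s                  # TD error on the value-feature representation
    e_v = gamma * lam * e_v + s_psi                   # critic eligibility trace
    e_theta = gamma * lam * e_theta + diff_log_s_a    # actor trace, driven by diff_log(s, a)
    return delta, e_v, e_theta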
@@ -56,7 +56,7 @@ def __init__(self, mdp_info, policy, actor_optimizer, critic_params,

         super().__init__(mdp_info, policy, actor_optimizer, policy.parameters())

-    def fit(self, dataset, **info):
+    def fit(self, dataset):
         state, action, reward, next_state, absorbing, _ = dataset.parse(to='torch')

         v, adv = compute_advantage_montecarlo(self._V, state, next_state,
@@ -97,7 +97,7 @@ def __init__(self, mdp_info, policy_class, policy_params,

         super().__init__(mdp_info, policy, actor_optimizer, policy_parameters)

-    def fit(self, dataset, **info):
+    def fit(self, dataset):
         self._replay_memory.add(dataset)
         if self._replay_memory.initialized:
             state, action, reward, next_state, absorbing, _ =\
@@ -42,7 +42,7 @@ def __init__(self, mdp_info, policy, actor_optimizer, parameters):

         super().__init__(mdp_info, policy)

-    def fit(self, dataset, **info):
+    def fit(self, dataset):
         """
         Fit step.
mushroom_rl/algorithms/actor_critic/deep_actor_critic/ppo.py (4 changes: 2 additions & 2 deletions)
@@ -68,9 +68,9 @@ def __init__(self, mdp_info, policy, actor_optimizer, critic_params,
             _iter='primitive'
         )

-        super().__init__(mdp_info, policy, None)
+        super().__init__(mdp_info, policy)

-    def fit(self, dataset, **info):
+    def fit(self, dataset):
         state, action, reward, next_state, absorbing, last = dataset.parse(to='torch')

         v_target, adv = compute_gae(self._V, state, next_state, reward, absorbing, last,
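
PPO (above) and TRPO (further below) both hand the parsed tensors to `compute_gae` right after this point. For readers unfamiliar with that helper, the sketch below is a generic NumPy version of generalized advantage estimation following the standard recursion; it is not the library's implementation, only an assumption about what a compatible routine looks like.

import numpy as np

def gae(v, v_next, reward, absorbing, last, gamma, lam):
    # v, v_next: value estimates for s_t and s_{t+1}; reward: r_t;
    # absorbing: 1 if s_{t+1} is terminal; last: 1 if t closes an episode/rollout.
    adv = np.zeros_like(v)
    gae_t = 0.0
    for t in reversed(range(len(v))):
        delta = reward[t] + gamma * v_next[t] * (1 - absorbing[t]) - v[t]
        gae_t = delta + gamma * lam * gae_t * (1 - last[t])
        adv[t] = gae_t
    return v + adv, adv   # (v_target, advantage), matching the call site above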
@@ -78,7 +78,7 @@ def divide_state_to_env_hidden_batch(self, states):
         assert len(states.shape) > 1, "This function only divides batches of states."
         return states[:, 0:self._dim_env_state], states[:, self._dim_env_state:]

-    def fit(self, dataset, **info):
+    def fit(self, dataset):
         obs, act, r, obs_next, absorbing, last = dataset.parse(to='torch')
         policy_state, policy_next_state = dataset.parse_policy_state(to='torch')
         obs_seq, policy_state_seq, act_seq, obs_next_seq, policy_next_state_seq, lengths = \
@@ -280,7 +280,7 @@ def __init__(self, mdp_info, actor_mu_params, actor_sigma_params, actor_optimize

         super().__init__(mdp_info, policy, actor_optimizer, policy_parameters)

-    def fit(self, dataset, **info):
+    def fit(self, dataset):
         self._replay_memory.add(dataset)
         if self._replay_memory.initialized:
             state, action, reward, next_state, absorbing, _ = self._replay_memory.get(self._batch_size())
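
The off-policy fit methods above share one pattern: push the incoming dataset into a replay memory, then sample a minibatch once enough transitions have been stored. The minimal stand-in below illustrates that add/initialized/get protocol; it is not MushroomRL's ReplayMemory, and its constructor arguments and step layout are assumptions.

import random
from collections import deque

class MinimalReplayMemory:
    # Illustrative ring buffer with the same interface shape as the code above.
    def __init__(self, initial_size, max_size):
        self._initial_size = initial_size
        self._buffer = deque(maxlen=max_size)

    def add(self, dataset):
        # dataset is assumed to iterate over (s, a, r, ss, absorbing, last) steps
        self._buffer.extend(dataset)

    @property
    def initialized(self):
        return len(self._buffer) >= self._initial_size

    def get(self, batch_size):
        batch = random.sample(list(self._buffer), batch_size)
        return tuple(map(list, zip(*batch)))   # transpose to (states, actions, rewards, ...)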
mushroom_rl/algorithms/actor_critic/deep_actor_critic/trpo.py (4 changes: 2 additions & 2 deletions)
@@ -79,9 +79,9 @@ def __init__(self, mdp_info, policy, critic_params, ent_coeff=0., max_kl=.001, l
             _iter='primitive'
         )

-        super().__init__(mdp_info, policy, None)
+        super().__init__(mdp_info, policy)

-    def fit(self, dataset, **info):
+    def fit(self, dataset):
         state, action, reward, next_state, absorbing, last = dataset.parse(to='torch')

         v_target, adv = compute_gae(self._V, state, next_state, reward, absorbing, last,
@@ -10,7 +10,7 @@ class BlackBoxOptimization(Agent):
     do not rely on stochastic and differentiable policies.
     """
-    def __init__(self, mdp_info, distribution, policy, features=None):
+    def __init__(self, mdp_info, distribution, policy):
         """
         Constructor.
@@ -20,32 +20,25 @@ def __init__(self, mdp_info, distribution, policy, features=None):
         """
         self.distribution = distribution
-        self._theta_list = list()

-        self._add_save_attr(distribution='mushroom', _theta_list='pickle')
+        self._add_save_attr(distribution='mushroom')

-        super().__init__(mdp_info, policy, features)
+        super().__init__(mdp_info, policy, is_episodic=True)

     def episode_start(self, episode_info):
         theta = self.distribution.sample()
-        self._theta_list.append(theta)
         self.policy.set_weights(theta)

-        return super().episode_start(episode_info)
+        policy_state, _ = super().episode_start(episode_info)
+
+        return policy_state, theta

-    def fit(self, dataset, **info):
-        Jep = dataset.compute_J(self.mdp_info.gamma)
-
-        Jep = np.array(Jep)
-        theta = np.array(self._theta_list)
+    def fit(self, dataset):
+        Jep = np.array(dataset.discounted_return)
+        theta = np.array(dataset.theta_list)

         self._update(Jep, theta)

-        self._theta_list = list()
-
-    def stop(self):
-        self._theta_list = list()
-
     def _update(self, Jep, theta):
         """
         Function that implements the update routine of distribution parameters.
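
Concrete subclasses implement `_update(Jep, theta)`, which now receives the per-episode discounted returns and the parameter vectors sampled in `episode_start` straight from the dataset instead of an internal `_theta_list`. As a toy illustration of the hook's contract (not one of the algorithms touched by this commit), an exponentially weighted mean over the sampled parameters could look like this, using only NumPy so nothing is assumed about the Distribution API:

import numpy as np

def weighted_mean_update(Jep, theta, beta=1.0):
    # Jep: (n_episodes,) discounted returns, as provided by dataset.discounted_return
    # theta: (n_episodes, n_params) sampled parameters, as provided by dataset.theta_list
    w = np.exp(beta * (Jep - np.max(Jep)))   # RWR-style exponential weighting, numerically stabilized
    w /= np.sum(w)
    return w @ theta                         # return-weighted mean parameter vector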
@@ -9,7 +9,7 @@ class ConstrainedREPS(BlackBoxOptimization):
     Episodic Relative Entropy Policy Search algorithm with constrained policy update.
     """
-    def __init__(self, mdp_info, distribution, policy, eps, kappa, features=None):
+    def __init__(self, mdp_info, distribution, policy, eps, kappa):
         """
         Constructor.
@@ -28,7 +28,7 @@ def __init__(self, mdp_info, distribution, policy, eps, kappa, features=None):
         self._add_save_attr(_eps='mushroom')
         self._add_save_attr(_kappa='mushroom')

-        super().__init__(mdp_info, distribution, policy, features)
+        super().__init__(mdp_info, distribution, policy)

     def _update(self, Jep, theta):
         eta_start = np.ones(1)
@@ -17,7 +17,7 @@ class MORE(BlackBoxOptimization):
     Peters, Jan R and Lau, Nuno and Pualo Reis, Luis and Neumann, Gerhard. 2015.
     """
-    def __init__(self, mdp_info, distribution, policy, eps, h0=-75, kappa=0.99, features=None):
+    def __init__(self, mdp_info, distribution, policy, eps, h0=-75, kappa=0.99):
         """
         Constructor.
@@ -53,7 +53,7 @@ def __init__(self, mdp_info, distribution, policy, eps, h0=-75, kappa=0.99, feat
         self._add_save_attr(h0='primitive')
         self._add_save_attr(kappa='primitive')

-        super().__init__(mdp_info, distribution, policy, features)
+        super().__init__(mdp_info, distribution, policy)

     def _update(self, Jep, theta):
@@ -10,8 +10,7 @@ class PGPE(BlackBoxOptimization):
     Peters J.. 2013.
     """
-    def __init__(self, mdp_info, distribution, policy, optimizer,
-                 features=None):
+    def __init__(self, mdp_info, distribution, policy, optimizer):
         """
         Constructor.
@@ -23,7 +22,7 @@ def __init__(self, mdp_info, distribution, policy, optimizer,

         self._add_save_attr(optimizer='mushroom')

-        super().__init__(mdp_info, distribution, policy, features)
+        super().__init__(mdp_info, distribution, policy)

     def _update(self, Jep, theta):
         baseline_num_list = list()
@@ -13,7 +13,7 @@ class REPS(BlackBoxOptimization):
     Peters J.. 2013.
     """
-    def __init__(self, mdp_info, distribution, policy, eps, features=None):
+    def __init__(self, mdp_info, distribution, policy, eps):
         """
         Constructor.
@@ -27,7 +27,7 @@ def __init__(self, mdp_info, distribution, policy, eps, features=None):

         self._add_save_attr(_eps='mushroom')

-        super().__init__(mdp_info, distribution, policy, features)
+        super().__init__(mdp_info, distribution, policy)

     def _update(self, Jep, theta):
         eta_start = np.ones(1)
@@ -11,7 +11,7 @@ class RWR(BlackBoxOptimization):
     Peters J.. 2013.
     """
-    def __init__(self, mdp_info, distribution, policy, beta, features=None):
+    def __init__(self, mdp_info, distribution, policy, beta):
         """
         Constructor.
@@ -24,7 +24,7 @@ def __init__(self, mdp_info, distribution, policy, beta, features=None):

         self._add_save_attr(_beta='mushroom')

-        super().__init__(mdp_info, distribution, policy, features)
+        super().__init__(mdp_info, distribution, policy)

     def _update(self, Jep, theta):
         Jep -= np.max(Jep)
mushroom_rl/algorithms/policy_search/policy_gradient/enac.py (5 changes: 2 additions & 3 deletions)
@@ -10,16 +10,15 @@ class eNAC(PolicyGradient):
     Peters J. 2013.
     """
-    def __init__(self, mdp_info, policy, optimizer, features=None,
-                 critic_features=None):
+    def __init__(self, mdp_info, policy, optimizer, critic_features=None):
         """
         Constructor.
         Args:
             critic_features (Features, None): features used by the critic.
         """
-        super().__init__(mdp_info, policy, optimizer, features)
+        super().__init__(mdp_info, policy, optimizer)
         self.phi_c = critic_features

         self.sum_grad_log = None