From 584aa86b5c5daad4a3312e6b45c4787ae1121783 Mon Sep 17 00:00:00 2001 From: boris-il-forte Date: Thu, 7 Dec 2023 11:35:49 +0100 Subject: [PATCH] Renamed Exponential Parameter in Decay Parameter --- .../double_chain_q_learning/double_chain.py | 4 +- examples/grid_world_td.py | 6 +- mushroom_rl/environments/mujoco.py | 133 ++++++++---------- .../features/tensors/random_fourier_tensor.py | 18 ++- mushroom_rl/rl_utils/__init__.py | 2 +- mushroom_rl/rl_utils/parameters.py | 61 +++++--- mushroom_rl/utils/record.py | 2 +- tests/utils/test_callbacks.py | 6 +- 8 files changed, 118 insertions(+), 114 deletions(-) diff --git a/examples/double_chain_q_learning/double_chain.py b/examples/double_chain_q_learning/double_chain.py index ec7ddcc5..c1e1010c 100644 --- a/examples/double_chain_q_learning/double_chain.py +++ b/examples/double_chain_q_learning/double_chain.py @@ -8,7 +8,7 @@ from mushroom_rl.environments import * from mushroom_rl.policy import EpsGreedy from mushroom_rl.utils.callbacks import CollectQ -from mushroom_rl.rl_utils.parameters import Parameter, ExponentialParameter +from mushroom_rl.rl_utils.parameters import Parameter, DecayParameter """ @@ -33,7 +33,7 @@ def experiment(algorithm_class, exp): pi = EpsGreedy(epsilon=epsilon) # Agent - learning_rate = ExponentialParameter(value=1., exp=exp, size=mdp.info.size) + learning_rate = DecayParameter(value=1., exp=exp, size=mdp.info.size) algorithm_params = dict(learning_rate=learning_rate) agent = algorithm_class(mdp.info, pi, **algorithm_params) diff --git a/examples/grid_world_td.py b/examples/grid_world_td.py index 1885b3af..50bca559 100644 --- a/examples/grid_world_td.py +++ b/examples/grid_world_td.py @@ -11,7 +11,7 @@ from mushroom_rl.environments import * from mushroom_rl.policy import EpsGreedy from mushroom_rl.utils.callbacks import CollectDataset, CollectMaxQ -from mushroom_rl.rl_utils.parameters import ExponentialParameter +from mushroom_rl.rl_utils.parameters import DecayParameter """ @@ -31,11 +31,11 @@ 
def experiment(algorithm_class, exp): mdp = GridWorldVanHasselt() # Policy - epsilon = ExponentialParameter(value=1, exp=.5, size=mdp.info.observation_space.size) + epsilon = DecayParameter(value=1, exp=.5, size=mdp.info.observation_space.size) pi = EpsGreedy(epsilon=epsilon) # Agent - learning_rate = ExponentialParameter(value=1, exp=exp, size=mdp.info.size) + learning_rate = DecayParameter(value=1, exp=exp, size=mdp.info.size) algorithm_params = dict(learning_rate=learning_rate) agent = algorithm_class(mdp.info, pi, **algorithm_params) diff --git a/mushroom_rl/environments/mujoco.py b/mushroom_rl/environments/mujoco.py index 0f33dd7b..25f52b67 100644 --- a/mushroom_rl/environments/mujoco.py +++ b/mushroom_rl/environments/mujoco.py @@ -18,45 +18,34 @@ def __init__(self, xml_file, actuation_spec, observation_spec, gamma, horizon, t Constructor. Args: - xml_file (str/xml handle): A string with a path to the xml or an Mujoco xml handle. - actuation_spec (list): A list specifying the names of the joints - which should be controllable by the agent. Can be left empty - when all actuators should be used; - observation_spec (list): A list containing the names of data that - should be made available to the agent as an observation and - their type (ObservationType). They are combined with a key, - which is used to access the data. An entry in the list - is given by: (key, name, type). The name can later be used - to retrieve specific observations; - gamma (float): The discounting factor of the environment; - horizon (int): The maximum horizon for the environment; - timestep (float): The timestep used by the MuJoCo - simulator. If None, the default timestep specified in the XML will be used; - n_substeps (int, 1): The number of substeps to use by the MuJoCo - simulator. 
An action given by the agent will be applied for - n_substeps before the agent receives the next observation and - can act accordingly; - n_intermediate_steps (int, 1): The number of steps between every action - taken by the agent. Similar to n_substeps but allows the user - to modify, control and access intermediate states. - additional_data_spec (list, None): A list containing the data fields of - interest, which should be read from or written to during - simulation. The entries are given as the following tuples: - (key, name, type) key is a string for later referencing in the - "read_data" and "write_data" methods. The name is the name of - the object in the XML specification and the type is the - ObservationType; - collision_groups (list, None): A list containing groups of geoms for - which collisions should be checked during simulation via - ``check_collision``. The entries are given as: - ``(key, geom_names)``, where key is a string for later - referencing in the "check_collision" method, and geom_names is - a list of geom names in the XML specification. - max_joint_vel (list, None): A list with the maximum joint velocities which are provided in the mdp_info. - The list has to define a maximum velocity for every occurrence of JOINT_VEL in the observation_spec. The - velocity will not be limited in mujoco - **viewer_params: other parameters to be passed to the viewer. - See MujocoViewer documentation for the available options. + xml_file (str/xml handle): A string with a path to the xml or an Mujoco xml handle. + actuation_spec (list): A list specifying the names of the joints which should be controllable by the + agent. Can be left empty when all actuators should be used; + observation_spec (list): A list containing the names of data that should be made available to the agent as + an observation and their type (ObservationType). They are combined with a key, which is used to access + the data. An entry in the list is given by: (key, name, type). 
The name can later be used to retrieve + specific observations; + gamma (float): The discounting factor of the environment; + horizon (int): The maximum horizon for the environment; + timestep (float): The timestep used by the MuJoCo simulator. If None, the default timestep specified in + the XML will be used; + n_substeps (int, 1): The number of substeps to use by the MuJoCo simulator. An action given by the agent + will be applied for n_substeps before the agent receives the next observation and can act accordingly; + n_intermediate_steps (int, 1): The number of steps between every action taken by the agent. Similar to + n_substeps but allows the user to modify, control and access intermediate states. + additional_data_spec (list, None): A list containing the data fields of interest, which should be read from + or written to during simulation. The entries are given as the following tuples: (key, name, type) key + is a string for later referencing in the "read_data" and "write_data" methods. The name is the name of + the object in the XML specification and the type is the ObservationType; + collision_groups (list, None): A list containing groups of geoms for which collisions should be checked + during simulation via ``check_collision``. The entries are given as: ``(key, geom_names)``, where key is + a string for later referencing in the "check_collision" method, and geom_names is a list of geom names + in the XML specification. + max_joint_vel (list, None): A list with the maximum joint velocities which are provided in the mdp_info. + The list has to define a maximum velocity for every occurrence of JOINT_VEL in the observation_spec. The + velocity will not be limited in mujoco + **viewer_params: other parameters to be passed to the viewer. + See MujocoViewer documentation for the available options. """ # Create the simulation @@ -462,9 +451,8 @@ def get_action_indices(model, data, actuation_spec): Args: model: MuJoCo model. data: MuJoCo data structure. 
- actuation_spec (list): A list specifying the names of the joints - which should be controllable by the agent. Can be left empty - when all actuators should be used; + actuation_spec (list): A list specifying the names of the joints which should be controllable by the agent. + Can be left empty when all actuators should be used; Returns: A list of actuator indices. @@ -561,45 +549,34 @@ def __init__(self, xml_files, actuation_spec, observation_spec, gamma, horizon, Constructor. Args: - xml_files (str/xml handle): A list containing strings with a path to the xml or Mujoco xml handles; - actuation_spec (list): A list specifying the names of the joints - which should be controllable by the agent. Can be left empty - when all actuators should be used; - observation_spec (list): A list containing the names of data that - should be made available to the agent as an observation and - their type (ObservationType). They are combined with a key, - which is used to access the data. An entry in the list - is given by: (key, name, type); - gamma (float): The discounting factor of the environment; - horizon (int): The maximum horizon for the environment; - timestep (float): The timestep used by the MuJoCo - simulator. If None, the default timestep specified in the XML will be used; - n_substeps (int, 1): The number of substeps to use by the MuJoCo - simulator. An action given by the agent will be applied for - n_substeps before the agent receives the next observation and - can act accordingly; - n_intermediate_steps (int, 1): The number of steps between every action - taken by the agent. Similar to n_substeps but allows the user - to modify, control and access intermediate states. - additional_data_spec (list, None): A list containing the data fields of - interest, which should be read from or written to during - simulation. The entries are given as the following tuples: - (key, name, type) key is a string for later referencing in the - "read_data" and "write_data" methods. 
The name is the name of - the object in the XML specification and the type is the - ObservationType; - collision_groups (list, None): A list containing groups of geoms for - which collisions should be checked during simulation via - ``check_collision``. The entries are given as: - ``(key, geom_names)``, where key is a string for later - referencing in the "check_collision" method, and geom_names is - a list of geom names in the XML specification. - max_joint_vel (list, None): A list with the maximum joint velocities which are provided in the mdp_info. + xml_files (str/xml handle): A list containing strings with a path to the xml or Mujoco xml handles; + actuation_spec (list): A list specifying the names of the joints which should be controllable by the + agent. Can be left empty when all actuators should be used; + observation_spec (list): A list containing the names of data that should be made available to the agent as + an observation and their type (ObservationType). They are combined with a key, which is used to access + the data. An entry in the list is given by: (key, name, type); + gamma (float): The discounting factor of the environment; + horizon (int): The maximum horizon for the environment; + timestep (float): The timestep used by the MuJoCo simulator. If None, the default timestep specified in the + XML will be used; + n_substeps (int, 1): The number of substeps to use by the MuJoCo simulator. An action given by the agent + will be applied for n_substeps before the agent receives the next observation and can act accordingly; + n_intermediate_steps (int, 1): The number of steps between every action taken by the agent. Similar to + n_substeps but allows the user to modify, control and access intermediate states. + additional_data_spec (list, None): A list containing the data fields of interest, which should be read + from or written to during simulation. 
The entries are given as the following tuples: (key, name, type) + key is a string for later referencing in the "read_data" and "write_data" methods. The name is the name + of the object in the XML specification and the type is the ObservationType; + collision_groups (list, None): A list containing groups of geoms for which collisions should be checked + during simulation via ``check_collision``. The entries are given as: ``(key, geom_names)``, where key is + a string for later referencing in the "check_collision" method, and geom_names is a list of geom names + in the XML specification. + max_joint_vel (list, None): A list with the maximum joint velocities which are provided in the mdp_info. The list has to define a maximum velocity for every occurrence of JOINT_VEL in the observation_spec. The velocity will not be limited in mujoco. random_env_reset (bool): If True, a random environment/model is chosen after each episode. If False, it is - sequentially iterated through the environment/model list. - **viewer_params: other parameters to be passed to the viewer. + sequentially iterated through the environment/model list. + **viewer_params: other parameters to be passed to the viewer. See MujocoViewer documentation for the available options. """ diff --git a/mushroom_rl/features/tensors/random_fourier_tensor.py b/mushroom_rl/features/tensors/random_fourier_tensor.py index cfd72ae5..6e911e98 100644 --- a/mushroom_rl/features/tensors/random_fourier_tensor.py +++ b/mushroom_rl/features/tensors/random_fourier_tensor.py @@ -9,30 +9,28 @@ class RandomFourierBasis(nn.Module): r""" - Class implementing Random Fourier basis functions. The value of the feature - is computed using the formula: + Class implementing Random Fourier basis functions. The value of the feature is computed using the formula: .. 
math:: \sin{\dfrac{PX}{\nu}+\varphi} - where X is the input, m is the vector of the minumum input values (for each - dimensions) , \Delta is the vector of maximum - This features have been presented in: + where X is the input, m is the vector of the minimum input values (for each dimension), :math:`\Delta` is the + vector of maximum variations of the input variables, i.e. delta = high - low; - "Towards generalization and simplicity in continuous control". Rajeswaran A. et Al.. - 2017. + These features have been presented in: + + "Towards generalization and simplicity in continuous control". Rajeswaran A. et al., 2017. """ def __init__(self, P, phi, nu): - r""" + """ Constructor. Args: P (np.ndarray): weights matrix, every weight should be drawn from a normal distribution; phi (np.ndarray): bias vector, every weight should be drawn from a uniform distribution in the interval - [-\pi, \pi); - values of the input variables, i.e. delta = high - low; + :math:`[-\pi, \pi)`; nu (float): bandwidth parameter, it should be chosen approximately as the average pairwise distances between different observation vectors. 
diff --git a/mushroom_rl/rl_utils/__init__.py b/mushroom_rl/rl_utils/__init__.py index 04bdd9dd..28cbffe2 100644 --- a/mushroom_rl/rl_utils/__init__.py +++ b/mushroom_rl/rl_utils/__init__.py @@ -1,6 +1,6 @@ from .eligibility_trace import EligibilityTrace, ReplacingTrace, AccumulatingTrace from .optimizers import Optimizer, AdamOptimizer, SGDOptimizer, AdaptiveOptimizer -from .parameters import Parameter, ExponentialParameter, LinearParameter, to_parameter +from .parameters import Parameter, DecayParameter, LinearParameter, to_parameter from .preprocessors import StandardizationPreprocessor, MinMaxPreprocessor from .replay_memory import ReplayMemory, PrioritizedReplayMemory from .running_stats import RunningStandardization, RunningAveragedWindow, RunningExpWeightedAverage diff --git a/mushroom_rl/rl_utils/parameters.py b/mushroom_rl/rl_utils/parameters.py index 36759bce..1f9d7899 100644 --- a/mushroom_rl/rl_utils/parameters.py +++ b/mushroom_rl/rl_utils/parameters.py @@ -24,13 +24,10 @@ def __init__(self, value, min_value=None, max_value=None, size=(1,)): Args: value (float): initial value of the parameter; - min_value (float, None): minimum value that the parameter can reach - when decreasing; - max_value (float, None): maximum value that the parameter can reach - when increasing; - size (tuple, (1,)): shape of the matrix of parameters; this shape - can be used to have a single parameter for each state or - state-action tuple. + min_value (float, None): minimum value that the parameter can reach when decreasing; + max_value (float, None): maximum value that the parameter can reach when increasing; + size (tuple, (1,)): shape of the matrix of parameters; this shape can be used to have a single parameter for + each state or state-action tuple. """ self._initial_value = value @@ -94,8 +91,7 @@ def update(self, *idx, **kwargs): Updates the number of visit of the parameter in the provided index. 
Args: - *idx (list): index of the parameter whose number of visits has to be - updated. + *idx (list): index of the parameter whose number of visits has to be updated. """ self._n_updates[idx] += 1 @@ -121,11 +117,28 @@ def initial_value(self): class LinearParameter(Parameter): """ - This class implements a linearly changing parameter according to the number - of times it has been used. + This class implements a linearly changing parameter according to the number of times it has been used. + The parameter changes following the formula: + + .. math:: + v_t = \textrm{clip}(v_0 + \dfrac{v_{th} - v_0}{n} t, v_{th}) + + where :math:`v_0` is the initial value of the parameter, :math:`t` is the number of updates performed, :math:`n` is + the number of steps needed to reach the threshold and :math:`v_{th}` is the upper or lower threshold for the parameter. """ def __init__(self, value, threshold_value, n, size=(1,)): + """ + Constructor. + + Args: + value (float): initial value of the parameter; + threshold_value (float, None): minimum or maximum value that the parameter can reach; + n (int): number of time steps needed to reach the threshold value; + size (tuple, (1,)): shape of the matrix of parameters; this shape can be used to have a single parameter for + each state or state-action tuple. + + """ self._coeff = (threshold_value - value) / n if self._coeff >= 0: @@ -139,14 +152,30 @@ def _compute(self, *idx, **kwargs): return self._coeff * self._n_updates[idx] + self._initial_value -class ExponentialParameter(Parameter): +class DecayParameter(Parameter): """ - This class implements a exponentially changing parameter according to the - number of times it has been used. + This class implements a decaying parameter. The decay follows the formula: + + .. math:: + v_n = \dfrac{v_0}{n^p} + + where :math:`v_0` is the initial value of the parameter, :math:`n` is the number of steps and :math:`p` is an + arbitrary exponent. 
""" - def __init__(self, value, exp=1., min_value=None, max_value=None, - size=(1,)): + def __init__(self, value, exp=1., min_value=None, max_value=None, size=(1,)): + """ + Constructor. + + Args: + value (float): initial value of the parameter; + exp (float, 1.): exponent for the step decay; + min_value (float, None): minimum value that the parameter can reach when decreasing; + max_value (float, None): maximum value that the parameter can reach when increasing; + size (tuple, (1,)): shape of the matrix of parameters; this shape can be used to have a single parameter for + each state or state-action tuple. + + """ self._exp = exp super().__init__(value, min_value, max_value, size) diff --git a/mushroom_rl/utils/record.py b/mushroom_rl/utils/record.py index aa5761ba..2ffa0001 100644 --- a/mushroom_rl/utils/record.py +++ b/mushroom_rl/utils/record.py @@ -1,4 +1,3 @@ -import os import cv2 import datetime from pathlib import Path @@ -7,6 +6,7 @@ class VideoRecorder(object): """ Simple video record that creates a video from a stream of images. 
+ """ def __init__(self, path="./mushroom_rl_recordings", tag=None, video_name=None, fps=60): diff --git a/tests/utils/test_callbacks.py b/tests/utils/test_callbacks.py index a078b98c..0bc9e712 100644 --- a/tests/utils/test_callbacks.py +++ b/tests/utils/test_callbacks.py @@ -2,7 +2,7 @@ from mushroom_rl.environments import GridWorld from mushroom_rl.algorithms.value import SARSA from mushroom_rl.policy import EpsGreedy -from mushroom_rl.rl_utils.parameters import Parameter, ExponentialParameter +from mushroom_rl.rl_utils.parameters import Parameter, DecayParameter from mushroom_rl.utils.callbacks import * import numpy as np @@ -64,8 +64,8 @@ def test_collect_parameter(): np.random.seed(42) mdp = GridWorld(3, 3, (2, 2)) - eps = ExponentialParameter(value=1, exp=.5, - size=mdp.info.observation_space.size) + eps = DecayParameter(value=1, exp=.5, + size=mdp.info.observation_space.size) pi = EpsGreedy(eps) alpha = Parameter(0.1) agent = SARSA(mdp.info, pi, alpha)