From 4ab8d2ced7ad755876b2ca9ed26936480d883865 Mon Sep 17 00:00:00 2001
From: okiwi6 <45100017+oleflb@users.noreply.github.com>
Date: Sat, 1 Feb 2025 16:40:16 +0100
Subject: [PATCH 1/2] Refactor tomato throwing mechanics and reward system in
 NaoStanding environment

---
 .../machine-learning/mujoco/model/tomato.xml  |  6 +--
 .../nao_env/src/nao_env/nao_standing.py       | 40 ++++++++++++-------
 .../packages/rewards/src/rewards/composer.py  |  2 +-
 .../packages/rewards/src/rewards/rewards.py   |  9 +++++
 .../throwing/src/throwing/__init__.py         |  8 ++++
 5 files changed, 47 insertions(+), 18 deletions(-)

diff --git a/tools/machine-learning/mujoco/model/tomato.xml b/tools/machine-learning/mujoco/model/tomato.xml
index aac8d575ca..dac1930aa9 100644
--- a/tools/machine-learning/mujoco/model/tomato.xml
+++ b/tools/machine-learning/mujoco/model/tomato.xml
@@ -1,8 +1,8 @@
-
-
+
+
 
 
 
 
 
-
+
\ No newline at end of file

diff --git a/tools/machine-learning/mujoco/packages/nao_env/src/nao_env/nao_standing.py b/tools/machine-learning/mujoco/packages/nao_env/src/nao_env/nao_standing.py
index c2d4103ac0..bb64df4534 100644
--- a/tools/machine-learning/mujoco/packages/nao_env/src/nao_env/nao_standing.py
+++ b/tools/machine-learning/mujoco/packages/nao_env/src/nao_env/nao_standing.py
@@ -6,6 +6,7 @@ from numpy.typing import NDArray
 
 from rewards import (
     ConstantReward,
+    ControlAmplitudePenalty,
     HeadOverTorsoPenalty,
     RewardComposer,
     RewardContext,
@@ -40,7 +41,7 @@
     ],
 )
 
-HEAD_SET_HEIGHT = 0.51
+HEAD_SET_HEIGHT = 0.493
 
 
 class NaoStanding(NaoBaseEnv, utils.EzPickle):
@@ -57,24 +58,38 @@ def __init__(
         )
 
         self.current_step = 0
-        self.termination_penalty = 10.0
+        self.next_throw_at = 500
+        self.expected_number_of_frames_between_throws = 120
+        self.rng = np.random.default_rng()
 
         self.reward = (
             RewardComposer()
-            .add(0.05, ConstantReward())
-            .add(-0.01, TorqueChangeRatePenalty(self.model.nu, self.dt))
-            .add(1.0, HeadOverTorsoPenalty())
+            .add(0.02, ConstantReward())
+            .add(-0.001, TorqueChangeRatePenalty(self.model.nu, self.dt))
+            .add(-0.001, ControlAmplitudePenalty())
+            .add(-0.5, HeadOverTorsoPenalty())
         )
 
         utils.EzPickle.__init__(self, **kwargs)
 
+    def _should_throw_tomato(self) -> bool:
+        allowed_to_throw = (
+            self.current_step >= self.next_throw_at
+            and self.projectile.has_ground_contact()
+        )
+        if allowed_to_throw:
+            self.next_throw_at = self.current_step + self.rng.poisson(
+                self.expected_number_of_frames_between_throws
+            )
+
+        return allowed_to_throw
+
     @override
     def step(self, action: NDArray[np.floating]) -> tuple:
         self.current_step += 1
-        if self.throw_tomatoes and self.projectile.has_ground_contact():
+        if self.throw_tomatoes and self._should_throw_tomato():
             target = self.data.site("Robot").xpos
-            alpha = self.current_step / 2500
-            time_to_reach = 0.2 * (1 - alpha) + 0.1 * alpha
+            time_to_reach = 0.15
             self.projectile.random_throw(
                 target,
                 time_to_reach=time_to_reach,
@@ -82,19 +97,14 @@ def step(self, action: NDArray[np.floating]) -> tuple:
             )
 
         self.do_simulation(action + OFFSET_QPOS, self.frame_skip)
-        head_center_z = self.data.site("head_center").xpos[2]
 
         if self.render_mode == "human":
             self.render()
 
-        terminated = head_center_z < 0.3
-
         distinct_rewards = self.reward.rewards(RewardContext(self.nao, action))
         reward = sum(distinct_rewards.values())
 
-        if terminated:
-            reward -= self.termination_penalty
-
+        terminated = False
         self.current_step += 1
         return (
             self._get_obs(),
@@ -107,6 +117,8 @@ def step(self, action: NDArray[np.floating]) -> tuple:
     @override
     def reset_model(self) -> NDArray[np.floating]:
         self.current_step = 0
+        self.next_throw_at = 500
+        self.reward.reset()
         self.set_state(
             self.init_qpos,
             self.init_qvel,

diff --git a/tools/machine-learning/mujoco/packages/rewards/src/rewards/composer.py b/tools/machine-learning/mujoco/packages/rewards/src/rewards/composer.py
index 10f95b985c..9abe56366c 100644
--- a/tools/machine-learning/mujoco/packages/rewards/src/rewards/composer.py
+++ b/tools/machine-learning/mujoco/packages/rewards/src/rewards/composer.py
@@ -16,7 +16,7 @@ def add(self, factor: float | None, reward: BaseReward) -> Self:
         return self
 
     def reward(self, context: RewardContext) -> np.floating:
-        return np.float32(sum(self._inner_rewards(context).values()))
+        return np.float32(sum(self.rewards(context).values()))
 
     def rewards(self, context: RewardContext) -> dict[str, np.floating]:
         return {

diff --git a/tools/machine-learning/mujoco/packages/rewards/src/rewards/rewards.py b/tools/machine-learning/mujoco/packages/rewards/src/rewards/rewards.py
index 29f54f04ea..9dc9d42563 100644
--- a/tools/machine-learning/mujoco/packages/rewards/src/rewards/rewards.py
+++ b/tools/machine-learning/mujoco/packages/rewards/src/rewards/rewards.py
@@ -64,6 +64,7 @@ def reward(self, context: RewardContext) -> np.floating:
 class TorqueChangeRatePenalty(BaseReward):
     def __init__(self, actuator_dimension: int, dt: float) -> None:
         self.previous_force = np.zeros(actuator_dimension)
+        self.is_initialized = False
         self.dt = dt
 
     def reward(self, context: RewardContext) -> np.floating:
@@ -78,8 +79,16 @@ def reward(self, context: RewardContext) -> np.floating:
             np.abs(previous_torque - current_torque) / self.dt
         )
         self.previous_force = np.copy(context.nao.data.actuator_force)
+
+        if not self.is_initialized:
+            self.is_initialized = True
+            return np.float32(0.0)
+
         return torque_change_rate
 
+    def reset(self) -> None:
+        self.is_initialized = False
+
 
 class XDistanceReward(BaseReward):
     def reward(self, context: RewardContext) -> np.floating:

diff --git a/tools/machine-learning/mujoco/packages/throwing/src/throwing/__init__.py b/tools/machine-learning/mujoco/packages/throwing/src/throwing/__init__.py
index 03bd728c2b..b71642a90b 100644
--- a/tools/machine-learning/mujoco/packages/throwing/src/throwing/__init__.py
+++ b/tools/machine-learning/mujoco/packages/throwing/src/throwing/__init__.py
@@ -66,6 +66,14 @@ def __init__(
         )
 
     def has_ground_contact(self) -> bool:
+        qpos_index = self.model.jnt_qposadr[
+            self.model.body_jntadr[self.throwable_index]
+        ]
+        z_height = self.data.qpos[qpos_index + 2]
+        if z_height <= -1.0:
+            # clipped through the floor
+            return True
+
         geoms = (
             self.model.body_geomadr[self.throwable_index],
             self.model.body_geomadr[self.ground_index],

From 64edfbe53d543d20df31ee7f669908b3b550bcfd Mon Sep 17 00:00:00 2001
From: okiwi6 <45100017+oleflb@users.noreply.github.com>
Date: Sat, 1 Feb 2025 17:52:31 +0100
Subject: [PATCH 2/2] add newline

---
 tools/machine-learning/mujoco/model/tomato.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/machine-learning/mujoco/model/tomato.xml b/tools/machine-learning/mujoco/model/tomato.xml
index dac1930aa9..cc7ced0b55 100644
--- a/tools/machine-learning/mujoco/model/tomato.xml
+++ b/tools/machine-learning/mujoco/model/tomato.xml
@@ -5,4 +5,4 @@
 
 
 
-
\ No newline at end of file
+