From 4ab8d2ced7ad755876b2ca9ed26936480d883865 Mon Sep 17 00:00:00 2001
From: okiwi6 <45100017+oleflb@users.noreply.github.com>
Date: Sat, 1 Feb 2025 16:40:16 +0100
Subject: [PATCH 1/2] Refactor tomato throwing mechanics and reward system in
NaoStanding environment
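
Throws are now scheduled by a Poisson process: the first tomato is held
until step 500, and after each throw the next one is scheduled an
expected 120 frames later, gated on the previous projectile having
landed. The flight time is fixed at 0.15 s instead of ramping with
episode progress. Reward weights are retuned, a ControlAmplitudePenalty
is added, and episodes no longer terminate early or incur a termination
penalty. TorqueChangeRatePenalty skips its first, spuriously large
sample after a reset, and projectiles that clip through the floor now
count as grounded.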
---
.../machine-learning/mujoco/model/tomato.xml | 6 +--
.../nao_env/src/nao_env/nao_standing.py | 40 ++++++++++++-------
.../packages/rewards/src/rewards/composer.py | 2 +-
.../packages/rewards/src/rewards/rewards.py | 9 +++++
.../throwing/src/throwing/__init__.py | 8 ++++
5 files changed, 47 insertions(+), 18 deletions(-)
diff --git a/tools/machine-learning/mujoco/model/tomato.xml b/tools/machine-learning/mujoco/model/tomato.xml
index aac8d575ca..dac1930aa9 100644
--- a/tools/machine-learning/mujoco/model/tomato.xml
+++ b/tools/machine-learning/mujoco/model/tomato.xml
@@ -1,8 +1,8 @@
-
-
+
+
-
+
\ No newline at end of file
diff --git a/tools/machine-learning/mujoco/packages/nao_env/src/nao_env/nao_standing.py b/tools/machine-learning/mujoco/packages/nao_env/src/nao_env/nao_standing.py
index c2d4103ac0..bb64df4534 100644
--- a/tools/machine-learning/mujoco/packages/nao_env/src/nao_env/nao_standing.py
+++ b/tools/machine-learning/mujoco/packages/nao_env/src/nao_env/nao_standing.py
@@ -6,6 +6,7 @@
from numpy.typing import NDArray
from rewards import (
ConstantReward,
+ ControlAmplitudePenalty,
HeadOverTorsoPenalty,
RewardComposer,
RewardContext,
@@ -40,7 +41,7 @@
],
)
-HEAD_SET_HEIGHT = 0.51
+HEAD_SET_HEIGHT = 0.493
class NaoStanding(NaoBaseEnv, utils.EzPickle):
@@ -57,24 +58,38 @@ def __init__(
)
self.current_step = 0
- self.termination_penalty = 10.0
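+        # Throw scheduling: hold the first tomato until step 500, then draw
+        # Poisson-distributed gaps (expected: 120 frames) between throws.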
+ self.next_throw_at = 500
+ self.expected_number_of_frames_between_throws = 120
+ self.rng = np.random.default_rng()
self.reward = (
RewardComposer()
- .add(0.05, ConstantReward())
- .add(-0.01, TorqueChangeRatePenalty(self.model.nu, self.dt))
- .add(1.0, HeadOverTorsoPenalty())
+ .add(0.02, ConstantReward())
+ .add(-0.001, TorqueChangeRatePenalty(self.model.nu, self.dt))
+ .add(-0.001, ControlAmplitudePenalty())
+ .add(-0.5, HeadOverTorsoPenalty())
)
utils.EzPickle.__init__(self, **kwargs)
+ def _should_throw_tomato(self) -> bool:
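+        """Return whether a tomato should be thrown on this step.
+
+        A throw is allowed once the scheduled step has been reached and the
+        previous projectile has landed; the next throw is then scheduled a
+        Poisson-distributed number of frames ahead.
+        """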
+ allowed_to_throw = (
+ self.current_step >= self.next_throw_at
+ and self.projectile.has_ground_contact()
+ )
+ if allowed_to_throw:
+ self.next_throw_at = self.current_step + self.rng.poisson(
+ self.expected_number_of_frames_between_throws
+ )
+
+ return allowed_to_throw
+
@override
def step(self, action: NDArray[np.floating]) -> tuple:
self.current_step += 1
- if self.throw_tomatoes and self.projectile.has_ground_contact():
+ if self.throw_tomatoes and self._should_throw_tomato():
target = self.data.site("Robot").xpos
- alpha = self.current_step / 2500
- time_to_reach = 0.2 * (1 - alpha) + 0.1 * alpha
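+            # Flight time is now constant; it previously ramped from 0.2 s
+            # down to 0.1 s over the first 2500 steps.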
+ time_to_reach = 0.15
self.projectile.random_throw(
target,
time_to_reach=time_to_reach,
@@ -82,19 +97,14 @@ def step(self, action: NDArray[np.floating]) -> tuple:
)
self.do_simulation(action + OFFSET_QPOS, self.frame_skip)
- head_center_z = self.data.site("head_center").xpos[2]
if self.render_mode == "human":
self.render()
- terminated = head_center_z < 0.3
-
distinct_rewards = self.reward.rewards(RewardContext(self.nao, action))
reward = sum(distinct_rewards.values())
- if terminated:
- reward -= self.termination_penalty
-
+ terminated = False
-        self.current_step += 1
return (
self._get_obs(),
@@ -107,6 +117,8 @@ def step(self, action: NDArray[np.floating]) -> tuple:
@override
def reset_model(self) -> NDArray[np.floating]:
self.current_step = 0
+ self.next_throw_at = 500
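+        # Reset stateful reward terms such as the TorqueChangeRatePenalty
+        # warm-up flag.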
+ self.reward.reset()
self.set_state(
self.init_qpos,
self.init_qvel,
diff --git a/tools/machine-learning/mujoco/packages/rewards/src/rewards/composer.py b/tools/machine-learning/mujoco/packages/rewards/src/rewards/composer.py
index 10f95b985c..9abe56366c 100644
--- a/tools/machine-learning/mujoco/packages/rewards/src/rewards/composer.py
+++ b/tools/machine-learning/mujoco/packages/rewards/src/rewards/composer.py
@@ -16,7 +16,7 @@ def add(self, factor: float | None, reward: BaseReward) -> Self:
return self
def reward(self, context: RewardContext) -> np.floating:
- return np.float32(sum(self._inner_rewards(context).values()))
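+        # Reuse the per-name dict from rewards() so both entry points agree.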
+ return np.float32(sum(self.rewards(context).values()))
def rewards(self, context: RewardContext) -> dict[str, np.floating]:
return {
diff --git a/tools/machine-learning/mujoco/packages/rewards/src/rewards/rewards.py b/tools/machine-learning/mujoco/packages/rewards/src/rewards/rewards.py
index 29f54f04ea..9dc9d42563 100644
--- a/tools/machine-learning/mujoco/packages/rewards/src/rewards/rewards.py
+++ b/tools/machine-learning/mujoco/packages/rewards/src/rewards/rewards.py
@@ -64,6 +64,7 @@ def reward(self, context: RewardContext) -> np.floating:
class TorqueChangeRatePenalty(BaseReward):
def __init__(self, actuator_dimension: int, dt: float) -> None:
self.previous_force = np.zeros(actuator_dimension)
+ self.is_initialized = False
self.dt = dt
def reward(self, context: RewardContext) -> np.floating:
@@ -78,8 +79,16 @@ def reward(self, context: RewardContext) -> np.floating:
np.abs(previous_torque - current_torque) / self.dt
)
self.previous_force = np.copy(context.nao.data.actuator_force)
+
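+        # previous_force is zero-initialized, so the first sample after a
+        # reset would report a spuriously large change rate; drop it.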
+ if not self.is_initialized:
+ self.is_initialized = True
+ return np.float32(0.0)
+
return torque_change_rate
+ def reset(self) -> None:
+ self.is_initialized = False
+
class XDistanceReward(BaseReward):
def reward(self, context: RewardContext) -> np.floating:
diff --git a/tools/machine-learning/mujoco/packages/throwing/src/throwing/__init__.py b/tools/machine-learning/mujoco/packages/throwing/src/throwing/__init__.py
index 03bd728c2b..b71642a90b 100644
--- a/tools/machine-learning/mujoco/packages/throwing/src/throwing/__init__.py
+++ b/tools/machine-learning/mujoco/packages/throwing/src/throwing/__init__.py
@@ -66,6 +66,14 @@ def __init__(
)
def has_ground_contact(self) -> bool:
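+        # A free joint's qpos layout is (x, y, z, qw, qx, qy, qz); offset 2
+        # is the body's world-frame height.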
+ qpos_index = self.model.jnt_qposadr[
+ self.model.body_jntadr[self.throwable_index]
+ ]
+ z_height = self.data.qpos[qpos_index + 2]
+ if z_height <= -1.0:
+ # clipped through the floor
+ return True
+
geoms = (
self.model.body_geomadr[self.throwable_index],
self.model.body_geomadr[self.ground_index],
From 64edfbe53d543d20df31ee7f669908b3b550bcfd Mon Sep 17 00:00:00 2001
From: okiwi6 <45100017+oleflb@users.noreply.github.com>
Date: Sat, 1 Feb 2025 17:52:31 +0100
Subject: [PATCH 2/2] Add trailing newline to tomato.xml
---
tools/machine-learning/mujoco/model/tomato.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/machine-learning/mujoco/model/tomato.xml b/tools/machine-learning/mujoco/model/tomato.xml
index dac1930aa9..cc7ced0b55 100644
--- a/tools/machine-learning/mujoco/model/tomato.xml
+++ b/tools/machine-learning/mujoco/model/tomato.xml
@@ -5,4 +5,4 @@
-</mujoco>
\ No newline at end of file
+</mujoco>