From ea0fc1f0e2fb127a7590e96baedd1f60c5593bd6 Mon Sep 17 00:00:00 2001 From: Raghu Rajan Date: Fri, 10 Jan 2025 19:18:28 +0100 Subject: [PATCH] Minor change to save compute when rendering RLToyEnv with image_representations; added close() to release pygame resources when called; fixed 6 tests in TestGymEnvWrapper, 8 in TestRLToyEnv; --- mdp_playground/envs/gym_env_wrapper.py | 4 +- mdp_playground/envs/rl_toy_env.py | 28 +++-- tests/test_gym_env_wrapper.py | 40 +++---- tests/test_mdp_playground.py | 155 +++++++++++-------------- 4 files changed, 108 insertions(+), 119 deletions(-) diff --git a/mdp_playground/envs/gym_env_wrapper.py b/mdp_playground/envs/gym_env_wrapper.py index 36d9735..dad814b 100644 --- a/mdp_playground/envs/gym_env_wrapper.py +++ b/mdp_playground/envs/gym_env_wrapper.py @@ -360,9 +360,7 @@ def step(self, action): ) probs[action] = 1 - self.transition_noise old_action = action - action = int( - self._np_random.choice(self.env.action_space.n, size=1, p=probs) - ) # random + action = self._np_random.choice(self.env.action_space.n, size=1, p=probs).item() # random if old_action != action: # print("NOISE inserted", old_action, action) self.total_noisy_transitions_episode += 1 diff --git a/mdp_playground/envs/rl_toy_env.py b/mdp_playground/envs/rl_toy_env.py index bc0f375..12ff545 100644 --- a/mdp_playground/envs/rl_toy_env.py +++ b/mdp_playground/envs/rl_toy_env.py @@ -2348,9 +2348,7 @@ def render(self,): # to only instantiate the render_space here and not in __init__ because it's only needed # if render() is called. if self.window is None: - if self.image_representations: - self.render_space = self.observation_space - else: + if not self.image_representations: if self.config["state_space_type"] == "discrete": self.render_space = ImageMultiDiscrete( self.state_space_size, @@ -2396,10 +2394,11 @@ def render(self,): if self.clock is None and self.render_mode == "human": self.clock = pygame.time.Clock() - # ##TODO There are repeated calculations here in calling get_concatenated_image - # that can be taken from storing variables in step() or reset(). if self.render_mode == "human": - rgb_array = self.render_space.get_concatenated_image(self.curr_state) + if not self.image_representations: + rgb_array = self.render_space.get_concatenated_image(self.curr_state) + elif self.image_representations: + rgb_array = self.curr_obs pygame_surface = pygame.surfarray.make_surface(rgb_array) self.window.blit(pygame_surface, pygame_surface.get_rect()) pygame.event.pump() @@ -2409,7 +2408,22 @@ def render(self,): # The following line will automatically add a delay to keep the framerate stable. self.clock.tick(self.metadata["render_fps"]) elif self.render_mode == "rgb_array": - return self.render_space.get_concatenated_image(self.curr_state) + if not self.image_representations: + return self.render_space.get_concatenated_image(self.curr_state) + elif self.image_representations: + return self.curr_obs + + def close(self): + ''' + Closes the environment and the pygame window if it was opened. + ''' + if self.window is not None and self.render_mode == "human": + import pygame + pygame.display.quit() + pygame.quit() + self.window = None + self.clock = None + def dist_of_pt_from_line(pt, ptA, ptB): """Returns shortest distance of a point from a line defined by 2 points - ptA and ptB. diff --git a/tests/test_gym_env_wrapper.py b/tests/test_gym_env_wrapper.py index a55ebff..330a2d3 100644 --- a/tests/test_gym_env_wrapper.py +++ b/tests/test_gym_env_wrapper.py @@ -35,7 +35,7 @@ def test_r_delay(self): "grayscale_obs": False, "state_space_type": "discrete", "action_space_type": "discrete", - "seed": 0, + "seed": 1, # }, # 'seed': 0, #seed } @@ -52,7 +52,7 @@ def test_r_delay(self): act = aew.action_space.sample() next_state, reward, done, trunc, info = aew.step(act) print("step, reward, done, act:", i, reward, done, act) - if i == 154 or i == 159: + if i == 124 or i == 152 or i == 171: assert reward == 44.0, ( "1-step delayed reward in step: " + str(i) @@ -73,7 +73,7 @@ def test_r_shift(self): "grayscale_obs": False, "state_space_type": "discrete", "action_space_type": "discrete", - "seed": 0, + "seed": 1, # }, # 'seed': 0, #seed } @@ -91,7 +91,7 @@ def test_r_shift(self): act = aew.action_space.sample() next_state, reward, done, trunc, info = aew.step(act) print("step, reward, done, act:", i, reward, done, act) - if i == 153 or i == 158: + if i == 123 or i == 151 or i == 170: assert reward == 45.0, ( "Shifted reward in step: " + str(i) + " should have been 45.0." ) @@ -114,7 +114,7 @@ def test_r_scale(self): "grayscale_obs": False, "state_space_type": "discrete", "action_space_type": "discrete", - "seed": 0, + "seed": 1, # }, # 'seed': 0, #seed } @@ -131,7 +131,7 @@ def test_r_scale(self): act = aew.action_space.sample() next_state, reward, done, trunc, info = aew.step(act) print("step, reward, done, act:", i, reward, done, act) - if i == 153 or i == 158: + if i == 123 or i == 151 or i == 170: assert reward == 88.0, ( "Scaled reward in step: " + str(i) + " should have been 88.0." ) @@ -236,15 +236,15 @@ def test_r_delay_p_noise_r_noise(self): print("\033[32;1;4mTEST_MULTIPLE\033[0m") config = { "delay": 1, - "reward_noise": lambda a: a.normal(0, 0.1), - "transition_noise": 0.1, + "reward_noise": lambda s, a, rng: rng.normal(0, 0.1), + "transition_noise": 0.2, # "GymEnvWrapper": { "atari_preprocessing": True, "frame_skip": 4, "grayscale_obs": False, "state_space_type": "discrete", "action_space_type": "discrete", - "seed": 0, + "seed": 1, # }, # 'seed': 0, #seed } @@ -262,24 +262,24 @@ def test_r_delay_p_noise_r_noise(self): next_state, reward, done, trunc, info = aew.step(act) print("step, reward, done, act:", i, reward, done, act) # Testing hardcoded values at these timesteps implicitly tests that there - # were 21 noisy transitions in total and noise inserted in rewards. - if i == 154: + # were noisy transitions and noise inserted in rewards. + if i == 147: np.testing.assert_allclose( reward, - 44.12183457980473, + 44.0668047426572, rtol=1e-05, err_msg="1-step delayed reward in step: " + str(i) - + " should have been 44.0.", + + " should have been 44.066...", ) - if i == 199: + if i == 173: np.testing.assert_allclose( reward, - 0.07467690634910334, + 44.088450289124935, rtol=1e-05, err_msg="1-step delayed reward in step: " + str(i) - + " should have been 44.0.", + + " should have been 44.088...", ) total_reward += reward print("total_reward:", total_reward) @@ -296,7 +296,7 @@ def test_discrete_irr_features(self): "grayscale_obs": False, "state_space_type": "discrete", "action_space_type": "discrete", - "seed": 0, + "seed": 1, "irrelevant_features": { "state_space_type": "discrete", "action_space_type": "discrete", @@ -331,7 +331,7 @@ def test_discrete_irr_features(self): act, next_state[1], ) - if i == 154 or i == 159: + if i == 128 or i == 151: assert reward == 44.0, ( "1-step delayed reward in step: " + str(i) @@ -355,7 +355,7 @@ def test_image_transforms(self): "grayscale_obs": False, "state_space_type": "discrete", "action_space_type": "discrete", - "seed": 0, + "seed": 1, # }, # 'seed': 0, #seed } @@ -373,7 +373,7 @@ def test_image_transforms(self): act = aew.action_space.sample() next_state, reward, done, trunc, info = aew.step(act) print("step, reward, done, act:", i, reward, done, act) - if i == 153 or i == 158: + if i == 123 or i == 151: assert reward == 44.0, ( "Reward in step: " + str(i) + " should have been 44.0." ) diff --git a/tests/test_mdp_playground.py b/tests/test_mdp_playground.py index 6271875..12b00e1 100644 --- a/tests/test_mdp_playground.py +++ b/tests/test_mdp_playground.py @@ -814,15 +814,15 @@ def test_grid_image_representations(self): actions = [ [0, 1], [-1, 0], - [-1, 0], - [1, 0], + [0, -1], + [0, -1], [0.5, -0.5], [1, 2], [1, 0], [0, -1], [0, -1], ] - expected_image_sums = [6372018, 6371313, 6372018, 6371313, 6371313] + expected_image_sums = [6371313, 6372018, 6372018, 6407811] # obs = env.curr_obs[0] # import PIL.Image as Image @@ -853,7 +853,7 @@ def test_grid_image_representations(self): ) # To check bouncing back behaviour of grid walls - for i in range(4): + for i in range(6): # action = env.action_space.sample() action = [0, 1] next_obs, reward, done, trunc, info = env.step(action) @@ -862,8 +862,8 @@ def test_grid_image_representations(self): state = next_state.copy() tot_rew += reward - assert tot_rew == 2.0, str(tot_rew) - assert state == [5, 7], str(state) + assert tot_rew == 6.0, str(tot_rew) + assert state == [6, 7], str(state) # test_ = np.allclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False) # self.assertAlmostEqual(state, np.array([21.59339006, 20.68189965, 21.49608203, 20.19183292]), places=3) # Error env.reset()[0] @@ -894,7 +894,7 @@ def test_grid_image_representations(self): state = next_state.copy() tot_rew += reward - assert tot_rew == 2.0, str(tot_rew) + assert tot_rew == 4.0, str(tot_rew) env.reset()[0] env.close() @@ -938,7 +938,7 @@ def test_grid_image_representations(self): state = next_state.copy() tot_rew += reward - assert tot_rew == -0.5, str(tot_rew) + assert tot_rew == 3, str(tot_rew) env.reset()[0] env.close() @@ -959,7 +959,7 @@ def test_grid_image_representations(self): [0, -1], [0, -1], ] - expected_image_sums = [12272400, 12271695] + expected_image_sums = [12271695, 12272400] # obs = env.curr_obs[0] # import PIL.Image as Image @@ -1002,7 +1002,7 @@ def test_grid_image_representations(self): # img1.show() - assert tot_rew == -3, str(tot_rew) + assert tot_rew == 4, str(tot_rew) env.reset()[0] env.close() @@ -1049,7 +1049,7 @@ def test_grid_image_representations(self): # img1 = Image.fromarray(np.squeeze(obs), 'RGB') # img1.show() - assert tot_rew == 0.75, str(tot_rew) + assert tot_rew == 1.0, str(tot_rew) env.reset()[0] env.close() @@ -1326,7 +1326,7 @@ def test_discrete_reward_delay(self): state = env.get_augmented_state()["curr_state"] actions = [ - 6, + 3, 2, 5, 4, @@ -1337,7 +1337,6 @@ def test_discrete_reward_delay(self): 4, ] # 2nd last action is random just to check that last delayed reward works with any action expected_rewards = [0, 0, 0, 1, 0, 0, 0, 1, 0] - expected_states = [5, 4, 0, 0, 5, 4, 6,] for i in range(len(expected_rewards)): next_state, reward, done, trunc, info = env.step(actions[i]) print("sars', done =", state, actions[i], reward, next_state, done) @@ -1348,9 +1347,6 @@ def test_discrete_reward_delay(self): + str(i + 1) + " when reward delay = 3.", ) - # self.assertEqual(state, expected_states[i], "Expected state mismatch in - # time step: " + str(i + 1) + " when reward delay = 3.") # will not work - # for 2nd last time step due to random action. state = next_state env.reset()[0] @@ -1432,20 +1428,20 @@ def test_discrete_p_noise(self): config["delay"] = 0 config["sequence_length"] = 1 config["reward_scale"] = 1.0 - config["transition_noise"] = 0.5 + config["transition_noise"] = 0.9 config["generate_random_mdp"] = True # config["log_level"] = logging.INFO env = RLToyEnv(**config) state = env.get_augmented_state()["curr_state"] - actions = [6, 6, 2, np.random.default_rng().integers(config["action_space_size"])] # + actions = [6, 6, 2, np.random.default_rng(0).integers(config["action_space_size"])] # expected_states = [ - 1, - 5, + 0, 4, + 3, 1, - ] # Last state 3 is fixed for this test because of fixed seed for Env which selects the next noisy state. + ] for i in range(len(actions)): next_state, reward, done, trunc, info = env.step(actions[i]) print("sars', done =", state, actions[i], reward, next_state, done) @@ -1490,13 +1486,11 @@ def test_discrete_r_noise(self): env = RLToyEnv(**config) state = env.get_augmented_state()["curr_state"] - actions = [6, 6, 2, 1] # + actions = [3, 6,] expected_rewards = [ - 1 + 0.32021, - 0.0524501, - -0.267835, - 0.180798, - ] # 2nd state produces 'true' reward + 1 - 0.0660524, + 0.320211, + ] for i in range(len(actions)): next_state, reward, done, trunc, info = env.step(actions[i]) print("sars', done =", state, actions[i], reward, next_state, done) @@ -1818,31 +1812,31 @@ def test_discrete_image_representations(self): state = env.get_augmented_state()["augmented_state"][-1] actions = [ - 6, + 4, 6, 2, - 3, + 7, 4, 2, - np.random.default_rng().integers(config["action_space_size"]), + np.random.default_rng(1).integers(config["action_space_size"]), 5, ] # - expected_rewards = [0, 0, 0, 1, 0, 1, 0, 0] + expected_rewards = [0, 0, 0, 0, 1,] # 0, 1, 0, 0] expected_reward_noises = [ - -0.292808, - 0.770696, - -1.01743611, - -0.042768, - 0.78761320, - -0.510087, - -0.089978, - 0.48654863, + -0.0660524, + 0.3202113, + 0.052450, + -0.267834, + 0.1807975, + # -0.510087, + # -0.089978, + # 0.48654863, ] expected_image_sums = [ - 122910, - 212925, - 111180, - ] # [152745, 282030, 528870], [105060, 232050, 78795] + 364395, + 342465, + 412335, + ] for i in range(len(expected_rewards)): expected_rewards[i] = ( expected_rewards[i] + expected_reward_noises[i] @@ -1910,19 +1904,16 @@ def test_discrete_reward_every_n_steps(self): state = env.get_augmented_state()["curr_state"] actions = [ - 6, 6, 2, - 3, - 4, 2, + 4, + 4, + 6, 6, 1, - 0, - np.random.default_rng().integers(config["action_space_size"]), - 5, ] # - expected_rewards = [0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0] + expected_rewards = [0, 0, 1, 0, 0, 1,] for i in range(len(expected_rewards)): next_state, reward, done, trunc, info = env.step(actions[i]) print("sars', done =", state, actions[i], reward, next_state, done) @@ -1941,24 +1932,12 @@ def test_discrete_reward_every_n_steps(self): # With delay config["delay"] = 1 config["sequence_length"] = 3 + config["reward_every_n_steps"] = 2 env = RLToyEnv(**config) state = env.get_augmented_state()["curr_state"] - actions = [ - 6, - 6, - 2, - 3, - 4, - 2, - 6, - 1, - 0, - np.random.default_rng().integers(config["action_space_size"]), - 5, - ] # - expected_rewards = [0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0] + expected_rewards = [0, 0, 0, 1, 0, 0,] for i in range(len(expected_rewards)): next_state, reward, done, trunc, info = env.step(actions[i]) print("sars', done =", state, actions[i], reward, next_state, done) @@ -1983,18 +1962,16 @@ def test_discrete_reward_every_n_steps(self): actions = [ 6, - 6, - 2, 3, 4, - 2, + 4, + 4, + 6, 6, 1, - 0, - np.random.default_rng().integers(config["action_space_size"]), - 5, ] # - expected_rewards = [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0] + + expected_rewards = [0, 0, 0, 1, 0, 1, 0,] for i in range(len(expected_rewards)): next_state, reward, done, trunc, info = env.step(actions[i]) print("sars', done =", state, actions[i], reward, next_state, done) @@ -2029,8 +2006,8 @@ def test_discrete_custom_P_R(self): config["use_custom_mdp"] = True # np.random.seed(0) # seed - config["transition_function"] = np.random.default_rng().integers(8, size=(8, 5)) - config["reward_function"] = np.random.default_rng().integers(4, size=(8, 5)) + config["transition_function"] = np.random.default_rng(0).integers(8, size=(8, 5)) + config["reward_function"] = np.random.default_rng(1).integers(4, size=(8, 5)) config["init_state_dist"] = np.array([1 / 8 for i in range(8)]) env = RLToyEnv(**config) @@ -2049,7 +2026,7 @@ def test_discrete_custom_P_R(self): np.random.default_rng(0).integers(config["action_space_size"]), 4, ] # - expected_rewards = [0, 0, 6, 4, 4, 0, 4, 6, 6, 2, 0] + expected_rewards = [0, 2, 2, 6, 6, 2, 0, 2, 6, 2, 2] for i in range(len(expected_rewards)): next_state, reward, done, trunc, info = env.step(actions[i]) print("sars', done =", state, actions[i], reward, next_state, done) @@ -2065,8 +2042,8 @@ def test_discrete_custom_P_R(self): # np.random.seed(0) #seed config["delay"] = 2 - P = np.random.default_rng().integers(8, size=(8, 5)) - R = np.random.default_rng().integers(4, size=(8, 5)) + P = np.random.default_rng(0).integers(8, size=(8, 5)) + R = np.random.default_rng(1).integers(4, size=(8, 5)) config["transition_function"] = lambda s, a: P[s, a] config["reward_function"] = lambda s, a: R[s[-2], a] config["init_state_dist"] = np.array([1 / 8 for i in range(8)]) @@ -2087,7 +2064,7 @@ def test_discrete_custom_P_R(self): np.random.default_rng().integers(config["action_space_size"]), 4, ] # - expected_rewards = [0, 0, 0, 2, 2, 0, 0, 6, 0, 4, 6] + expected_rewards = [0, 0, 2, 2, 6, 6, 2, 0, 2, 6, 2] for i in range(len(expected_rewards)): next_state, reward, done, trunc, info = env.step(actions[i]) print("sars', done =", state, actions[i], reward, next_state, done) @@ -2128,13 +2105,13 @@ def test_continuous_custom_P_R(self): actions = [2, [0.5, 1.5], 2, 3, [-10, -5], 2, 1, 1] # expected_rewards = [ 0, - -1.06496, - 0.935036, - 1.435036, - 3.435036, - 6.435036, - -3.564964, - -1.564964, + -2.95762, + -0.957624, + # 1.435036, + # 3.435036, + # 6.435036, + # -3.564964, + # -1.564964, ] # , -0.564964] for i in range(len(expected_rewards)): next_state, reward, done, trunc, info = env.step(actions[i]) @@ -2300,16 +2277,16 @@ def test_discrete_diameter(self): ) # TODO Similar test case for irrelevant_features state = env.get_augmented_state()["curr_state"] - actions = [6, 6, 7, 7, 0, 7, 1] # + actions = [7, 1, 1, 7, 0, 7, 1] # expected_rewards = [ 0, 0, 1, 0, + 1, 0, 0, - 1, - ] # 1st, 3rd and 4th states produce 'true' rewards, every reward has been shifted by 1 + ] # for i in range(len(actions)): next_state, reward, done, trunc, info = env.step(actions[i]) print("sars', done =", state, actions[i], reward, next_state, done) @@ -2385,7 +2362,7 @@ def test_discrete_diameter(self): ) # TODO Similar test case for irrelevant_features state = env.get_augmented_state()["curr_state"] - actions = [1, 7, 2, 4, 0, 7, 1] # Leads to rewardable sequence 20, 1, 12, 21, 5 + actions = [2, 5, 5, 1, 0, 7, 1] # From 1st state 13, actions lead to rewardable sequence 19, 1, 10, 21, 4 expected_rewards = [ 0, 0, @@ -2394,7 +2371,7 @@ def test_discrete_diameter(self): 1, 0, 0, - ] # 1st, 3rd and 4th states produce 'true' rewards, every reward has been shifted by 1 + ] # for i in range(len(actions)): next_state, reward, done, trunc, info = env.step(actions[i]) print("sars', done =", state, actions[i], reward, next_state, done)