sb_tutorial.py
import gymnasium as gym
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.ppo.policies import MlpPolicy
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.base_class import BaseAlgorithm
# Local tutorial helpers providing record_video and show_videos
from recording_tools import record_video, show_videos

# MlpPolicy because CartPole's observation is a feature vector, not an image
env = gym.make("CartPole-v1")
model = PPO(MlpPolicy, env, verbose=0)
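
# `evaluate` below is a hand-rolled evaluation loop; SB3's built-in
# `evaluate_policy` is also used further down for comparison.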
def evaluate(
    model: BaseAlgorithm,
    num_episodes: int = 100,
    deterministic: bool = True,
) -> float:
    """
    Evaluate an RL agent for `num_episodes` and return the mean episode reward.

    :param model: the RL Agent (its environment is obtained via `model.get_env()`)
    :param num_episodes: number of episodes to evaluate it
    :param deterministic: whether to use deterministic or stochastic actions
    :return: mean reward over the last `num_episodes`
    """
    # This function will only work for a single environment
    vec_env = model.get_env()
    obs = vec_env.reset()
    all_episode_rewards = []
    for _ in range(num_episodes):
        episode_rewards = []
        done = False
        # Note: SB3 VecEnv resets automatically:
        # https://stable-baselines3.readthedocs.io/en/master/guide/vec_envs.html#vecenv-api-vs-gym-api
        # obs = vec_env.reset()
        while not done:
            # _states are only useful when using LSTM policies
            # `deterministic` controls whether actions are deterministic
            action, _states = model.predict(obs, deterministic=deterministic)
            # here, action, reward and done are arrays
            # because we are using a vectorized env
            obs, reward, done, _info = vec_env.step(action)
            episode_rewards.append(reward)
        all_episode_rewards.append(sum(episode_rewards))

    mean_episode_reward = np.mean(all_episode_rewards)
    print(f"Mean reward: {mean_episode_reward:.2f} - Num episodes: {num_episodes}")
    return mean_episode_reward

# Random agent, before training
mean_reward_before_train = evaluate(model, num_episodes=100, deterministic=True)

# Same evaluation with SB3's built-in helper
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100, warn=False)
print(f"mean_reward: {mean_reward:.2f} +/- {std_reward:.2f}")

# Train the agent
model.learn(total_timesteps=10_000)

# Evaluate the trained agent
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)
print(f"mean_reward: {mean_reward:.2f} +/- {std_reward:.2f}")

# Record and display a video of the trained agent
record_video("CartPole-v1", model, video_length=500, prefix="ppo-cartpole")
show_videos("videos", prefix="ppo")