Add PPO

lilianweng committed Jan 4, 2019
1 parent eee622b commit 4fec487
Showing 12 changed files with 416 additions and 143 deletions.
10 changes: 5 additions & 5 deletions playground/configs/data/ddpg-bipedalwalker-v2.json
@@ -3,14 +3,14 @@
"policy_name": "DDPGPolicy",
"policy_params": {
"gamma": 0.99,
"actor_layers": [64, 32],
"critic_layers": [128, 64],
"actor_layers": [32, 32],
"critic_layers": [64, 64],
"deterministic": true
},
"train_params": {
"n_steps": 40000,
"warmup_steps": 10000,
"batch_size": 128,
"n_steps": 100000,
"warmup_steps": 35000,
"batch_size": 64,
"lr_a": 0.005,
"lr_c": 0.005,
"epsilon": 0.35,
23 changes: 23 additions & 0 deletions playground/configs/data/ppo-lunarlander-v2.json
@@ -0,0 +1,23 @@
{
"env_name": "LunarLander-v2",
"policy_name": "PPOPolicy",
"policy_params": {
"gamma": 0.99,
"lam": 0.95,
"actor_layers": [64, 64],
"critic_layers": [128, 64],
"clip_norm": 0.5,
"deterministic": true
},
"train_params": {
"lr_a": 0.002,
"lr_c": 0.005,
"batch_size": 128,
"ratio_clip_range": 0.2,
"ratio_clip_decay": false,
"n_iterations": 100,
"n_rollout_workers": 5,
"train_epoches": 4,
"log_every_iteration": 5
}
}
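
The new config pairs a discount gamma of 0.99 with lam of 0.95, the usual combination for GAE(λ) advantage estimation, and sets ratio_clip_range to 0.2 for PPO's clipped surrogate objective. playground/policies/ppo.py is not among the diffs shown here, so the following is only a minimal TF1-style sketch of the objective that ratio_clip_range feeds into; the placeholder names are illustrative assumptions, not the file's actual contents.

import tensorflow as tf

# Hypothetical tensors; names are illustrative, not taken from playground/policies/ppo.py.
logp_old = tf.placeholder(tf.float32, shape=(None,), name='old_log_prob')   # log pi_old(a|s)
logp_new = tf.placeholder(tf.float32, shape=(None,), name='new_log_prob')   # log pi_theta(a|s)
advantage = tf.placeholder(tf.float32, shape=(None,), name='advantage')     # e.g. GAE(lambda) estimates
clip_range = 0.2  # matches "ratio_clip_range" above

# Probability ratio r_t(theta) = pi_theta(a|s) / pi_old(a|s), computed from log-probs.
ratio = tf.exp(logp_new - logp_old)

# Clipped surrogate objective: take the pessimistic minimum of the unclipped and
# clipped terms, then maximize it by minimizing its negation.
surrogate = tf.minimum(
    ratio * advantage,
    tf.clip_by_value(ratio, 1.0 - clip_range, 1.0 + clip_range) * advantage)
loss_actor = -tf.reduce_mean(surrogate)

With ratio_clip_decay set to false, the 0.2 range presumably stays fixed across all 100 iterations rather than being annealed.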
2 changes: 2 additions & 0 deletions playground/policies/__init__.py
@@ -1,13 +1,15 @@
from playground.policies.actor_critic import ActorCriticPolicy
from playground.policies.ddpg import DDPGPolicy
from playground.policies.dqn import DqnPolicy
from playground.policies.ppo import PPOPolicy
from playground.policies.qlearning import QlearningPolicy
from playground.policies.reinforce import ReinforcePolicy

ALL_POLICIES = [
ActorCriticPolicy,
DDPGPolicy,
DqnPolicy,
PPOPolicy,
QlearningPolicy,
ReinforcePolicy
]
79 changes: 37 additions & 42 deletions playground/policies/actor_critic.py
@@ -4,7 +4,8 @@
import tensorflow as tf
from gym.spaces import Discrete

from playground.policies.base import BaseModelMixin, Policy, ReplayMemory, Config
from playground.policies.base import BaseModelMixin, Config, Policy
from playground.policies.memory import ReplayMemory, Transition
from playground.utils.misc import plot_learning_curve
from playground.utils.tf_ops import dense_nn

@@ -16,7 +17,7 @@ def __init__(self, env, name, training=True, gamma=0.9, layer_sizes=None, clip_n
BaseModelMixin.__init__(self, name)

assert isinstance(self.env.action_space, Discrete), \
"Current implementation only works for discrete action space."
"Current ActorCriticPolicy implementation only works for discrete action space."

self.layer_sizes = [64] if layer_sizes is None else layer_sizes
self.clip_norm = clip_norm
@@ -34,8 +35,9 @@ def _build_networks(self):
# Define input placeholders
self.s = tf.placeholder(tf.float32, shape=[None] + self.state_dim, name='state')
self.a = tf.placeholder(tf.int32, shape=(None,), name='action')
self.s_next = tf.placeholder(tf.float32, shape=[None] + self.state_dim, name='next_state')
self.r = tf.placeholder(tf.float32, shape=(None,), name='reward')
self.td_target = tf.placeholder(tf.float32, shape=(None,), name='td_target')
self.done = tf.placeholder(tf.float32, shape=(None,), name='done_flag')

# Actor: action probabilities
self.actor = dense_nn(self.s, self.layer_sizes + [self.act_size], name='actor')
@@ -45,21 +47,21 @@ def _build_train_ops(self):

# Critic: action value (V value)
self.critic = dense_nn(self.s, self.layer_sizes + [1], name='critic')
self.critic_next = dense_nn(self.s_next, self.layer_sizes + [1], name='critic', reuse=True)
self.critic_vars = self.scope_vars('critic')

def _build_train_ops(self):
self.learning_rate_c = tf.placeholder(tf.float32, shape=None, name='learning_rate_c')
self.learning_rate_a = tf.placeholder(tf.float32, shape=None, name='learning_rate_a')
# TD target
self.td_target = self.r + self.gamma * tf.squeeze(self.critic_next) * (1.0 - self.done)
self.td_error = self.td_target - tf.squeeze(self.critic)

action_ohe = tf.one_hot(self.a, self.act_size, 1.0, 0.0, name='action_one_hot')
self.pred_value = tf.reduce_sum(
self.critic * action_ohe, reduction_indices=-1, name='q_acted')
self.td_errors = self.td_target - tf.reshape(self.pred_value, [-1])
def _build_train_ops(self):
self.lr_c = tf.placeholder(tf.float32, shape=None, name='learning_rate_c')
self.lr_a = tf.placeholder(tf.float32, shape=None, name='learning_rate_a')

with tf.variable_scope('critic_train'):
# self.reg_c = tf.reduce_mean([tf.nn.l2_loss(x) for x in self.critic_vars])
self.loss_c = tf.reduce_mean(tf.square(self.td_errors)) # + 0.001 * self.reg_c
self.optim_c = tf.train.AdamOptimizer(self.learning_rate_c)
self.loss_c = tf.reduce_mean(tf.square(self.td_error)) # + 0.001 * self.reg_c
self.optim_c = tf.train.AdamOptimizer(self.lr_c)
self.grads_c = self.optim_c.compute_gradients(self.loss_c, self.critic_vars)
if self.clip_norm:
self.grads_c = [(tf.clip_by_norm(grad, self.clip_norm), var) for grad, var in self.grads_c]
@@ -70,27 +72,26 @@ def _build_train_ops(self):
# self.reg_a = tf.reduce_mean([tf.nn.l2_loss(x) for x in self.actor_vars])
# self.entropy_a =- tf.reduce_sum(self.actor * tf.log(self.actor))
self.loss_a = tf.reduce_mean(
tf.stop_gradient(self.td_errors) * tf.nn.sparse_softmax_cross_entropy_with_logits(
logits=self.actor, labels=self.a),
name='loss_actor') # + 0.001 * self.reg_a
self.optim_a = tf.train.AdamOptimizer(self.learning_rate_a)
tf.stop_gradient(self.td_error) * tf.nn.sparse_softmax_cross_entropy_with_logits(
logits=self.actor, labels=self.a), name='loss_actor') # + 0.001 * self.reg_a
self.optim_a = tf.train.AdamOptimizer(self.lr_a)
self.grads_a = self.optim_a.compute_gradients(self.loss_a, self.actor_vars)
if self.clip_norm:
self.grads_a = [(tf.clip_by_norm(grad, self.clip_norm), var) for grad, var in self.grads_a]

self.train_op_a = self.optim_a.apply_gradients(self.grads_a)

with tf.variable_scope('summary'):
self.grads_a_summ = [tf.summary.scalar('grads/a_' + var.name, tf.norm(grad)) for
grad, var in self.grads_a if grad is not None]
self.grads_c_summ = [tf.summary.scalar('grads/c_' + var.name, tf.norm(grad)) for
grad, var in self.grads_c if grad is not None]
self.loss_c_summ = tf.summary.scalar('loss/critic', self.loss_c)
self.loss_a_summ = tf.summary.scalar('loss/actor', self.loss_a)

self.ep_reward = tf.placeholder(tf.float32, name='episode_reward')
self.ep_reward_summ = tf.summary.scalar('episode_reward', self.ep_reward)

self.summary = [
tf.summary.scalar('loss/critic', self.loss_c),
tf.summary.scalar('loss/actor', self.loss_a),
tf.summary.scalar('episode_reward', self.ep_reward)
]
self.summary += [tf.summary.scalar('grads/a_' + var.name, tf.norm(grad)) for
grad, var in self.grads_a if grad is not None]
self.summary += [tf.summary.scalar('grads/c_' + var.name, tf.norm(grad)) for
grad, var in self.grads_c if grad is not None]
self.merged_summary = tf.summary.merge_all(key=tf.GraphKeys.SUMMARIES)

self.train_ops = [self.train_op_a, self.train_op_c]
@@ -108,16 +109,15 @@ class TrainConfig(Config):
lr_c_decay = 0.995
batch_size = 32
n_episodes = 800
annealing_episodes = 720
warmup_episodes = 720
log_every_episode = 10
done_rewards = -100
# for epsilon-greedy exploration
epsilon = 1.0
epsilon_final = 0.05

def train(self, config: TrainConfig):
BufferRecord = namedtuple('Record', ['s', 'a', 'r', 'td_target'])
buffer = ReplayMemory(tuple_class=BufferRecord)
buffer = ReplayMemory(tuple_class=Transition)

step = 0
episode_reward = 0.
@@ -128,8 +128,8 @@ def train(self, config: TrainConfig):
lr_a = config.lr_a

eps = config.epsilon
annealing_episodes = config.annealing_episodes or config.n_episodes
eps_drop = (eps - config.epsilon_final) / annealing_episodes
warmup_episodes = config.warmup_episodes or config.n_episodes
eps_drop = (eps - config.epsilon_final) / warmup_episodes
print("Decrease epsilon per step:", eps_drop)

for n_episode in range(config.n_episodes):
@@ -143,28 +143,23 @@ def train(self, config: TrainConfig):
step += 1
episode_reward += r

if done:
next_state_value = config.done_rewards or 0.0
else:
with self.sess.as_default():
next_state_value = self.critic.eval({
self.s: [self.obs_to_inputs(ob_next)]})[0][0]
record = Transition(self.obs_to_inputs(ob), a, r, self.obs_to_inputs(ob_next), done)
buffer.add(record)

td_target = r + self.gamma * next_state_value
buffer.add(BufferRecord(self.obs_to_inputs(ob), a, r, td_target))
ob = ob_next

while buffer.size >= config.batch_size:
batch = buffer.pop(config.batch_size)
_, summ_str = self.sess.run(
[self.train_ops, self.merged_summary], feed_dict={
self.learning_rate_c: lr_c,
self.learning_rate_a: lr_a,
self.lr_c: lr_c,
self.lr_a: lr_a,
self.s: batch['s'],
self.a: batch['a'],
self.r: batch['r'],
self.td_target: batch['td_target'],
self.ep_reward: reward_history[-1] if reward_history else 0.0,
self.s_next: batch['s_next'],
self.done: batch['done'],
self.ep_reward: np.mean(reward_history[-10:]) if reward_history else 0.0,
})
self.writer.add_summary(summ_str, step)

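
The actor-critic refactor above stops precomputing the TD target on the Python side (the old BufferRecord carried a td_target field) and instead stores plain (s, a, r, s_next, done) transitions, building the target inside the graph as r + gamma * V(s_next) * (1 - done). A small NumPy sketch of that formula, with made-up numbers:

import numpy as np

gamma = 0.9
r = np.array([1.0, 0.0, -1.0])       # rewards from a batch of three transitions
v_next = np.array([2.5, 1.0, 3.0])   # critic estimates V(s_next); illustrative values
done = np.array([0.0, 0.0, 1.0])     # the last transition ends its episode

# `done` masks out bootstrapping, so a terminal step keeps only its immediate reward.
td_target = r + gamma * v_next * (1.0 - done)      # -> [3.25, 0.9, -1.0]
td_error = td_target - np.array([2.0, 1.5, 0.5])   # minus current V(s); -> [1.25, -0.6, -1.5]

The critic then minimizes the squared td_error, while the actor loss weights the policy-gradient term by a stop-gradient copy of the same error, as in the diff above.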
83 changes: 3 additions & 80 deletions playground/policies/base.py
@@ -1,14 +1,13 @@
import os
from collections import deque, namedtuple
from gym.spaces import Box, Discrete

import numpy as np
import tensorflow as tf
from gym.spaces import Box, Discrete
from gym.utils import colorize

from playground.utils.misc import Config
from playground.utils.misc import REPO_ROOT

Transition = namedtuple('Transition', ['s', 'a', 'r', 's_next', 'done'])


class TrainConfig(Config):
lr = 0.001
@@ -21,82 +20,6 @@ class TrainConfig(Config):
done_reward = None


class ReplayMemory:
def __init__(self, capacity=100000, replace=False, tuple_class=Transition):
self.buffer = []
self.capacity = capacity
self.replace = replace
self.tuple_class = tuple_class
self.fields = tuple_class._fields

def add(self, record):
"""Any named tuple item."""
if isinstance(record, self.tuple_class):
self.buffer.append(record)
elif isinstance(record, list):
self.buffer += record

while self.capacity and self.size > self.capacity:
self.buffer.pop(0)

def _reformat(self, indices):
# Reformat a list of Transition tuples for training.
# indices: list<int>
return {
field_name: np.array([getattr(self.buffer[i], field_name) for i in indices])
for field_name in self.fields
}

def sample(self, batch_size):
assert len(self.buffer) >= batch_size
idxs = np.random.choice(range(len(self.buffer)), size=batch_size, replace=self.replace)
return self._reformat(idxs)

def pop(self, batch_size):
# Pop the first `batch_size` Transition items out.
i = min(self.size, batch_size)
batch = self._reformat(range(i))
self.buffer = self.buffer[i:]
return batch

@property
def size(self):
return len(self.buffer)


class ReplayTrajMemory:
def __init__(self, capacity=100000, step_size=16):
self.buffer = deque(maxlen=capacity)
self.step_size = step_size

def add(self, traj):
# traj (list<Transition>)
if len(traj) >= self.step_size:
self.buffer.append(traj)

def sample(self, batch_size):
traj_idxs = np.random.choice(range(len(self.buffer)), size=batch_size, replace=True)
batch_data = {field_name: [] for field_name in Transition._fields}

for traj_idx in traj_idxs:
i = np.random.randint(0, len(self.buffer[traj_idx]) + 1 - self.step_size)
transitions = self.buffer[traj_idx][i: i + self.step_size]

for field_name in Transition._fields:
batch_data[field_name] += [getattr(t, field_name) for t in transitions]

assert all(len(v) == batch_size * self.step_size for v in batch_data.values())
return {k: np.array(v) for k, v in batch_data.items()}

@property
def size(self):
return len(self.buffer)

@property
def transition_size(self):
return sum(map(len, self.buffer))


class Policy:
def __init__(self, env, name, training=True, gamma=0.99, deterministic=False):
self.env = env
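
base.py sheds the replay-buffer classes; the updated imports above pull ReplayMemory and Transition from playground.policies.memory instead. Assuming the relocated classes keep the interface shown in the removed code (add, sample, and pop, with batches returned as a dict of stacked arrays), usage would look roughly like this:

import numpy as np

from playground.policies.memory import ReplayMemory, Transition

buffer = ReplayMemory(capacity=1000, tuple_class=Transition)
buffer.add(Transition(s=np.zeros(4), a=1, r=0.5, s_next=np.ones(4), done=False))
buffer.add(Transition(s=np.ones(4), a=0, r=1.0, s_next=np.zeros(4), done=True))

if buffer.size >= 2:
    batch = buffer.pop(2)                      # FIFO pop, as the on-policy loops use it
    states, rewards = batch['s'], batch['r']   # each field stacked into an np.array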
7 changes: 4 additions & 3 deletions playground/policies/ddpg.py
@@ -2,7 +2,8 @@
import tensorflow as tf
from gym.spaces import Box

from playground.policies.base import BaseModelMixin, Policy, ReplayMemory, TrainConfig, Transition
from playground.policies.base import BaseModelMixin, Policy, TrainConfig
from playground.policies.memory import ReplayMemory, Transition
from playground.utils.misc import plot_learning_curve
from playground.utils.tf_ops import dense_nn

@@ -15,7 +16,7 @@ def __init__(self, env, name, training=True, gamma=0.9,
BaseModelMixin.__init__(self, name)

assert isinstance(self.env.action_space, Box), \
"Current implementation only works for continuous action space."
"Current DDPGPolicy implementation only works for continuous action space."

self.actor_layers = actor_layers
self.critic_layers = critic_layers
@@ -173,7 +174,7 @@ def train(self, config: TrainConfig):
self.a: batch['a'],
self.r: batch['r'],
self.s_next: batch['s_next'],
self.ep_reward: reward_history[-1] if reward_history else 0.0,
self.ep_reward: np.mean(reward_history[-10:]) if reward_history else 0.0,
})
self.update_target_net(tau=config.tau)
self.writer.add_summary(summ_str, step)
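
Both ddpg.py and actor_critic.py also switch the episode_reward summary from the single most recent return to a smoothed value, the mean of the last ten episode returns, for example:

import numpy as np

reward_history = [210.0, 180.5, 195.0]
# Mean of the (up to) 10 most recent returns; falls back to 0.0 before the first episode finishes.
smoothed = np.mean(reward_history[-10:]) if reward_history else 0.0   # -> 195.1666...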
10 changes: 2 additions & 8 deletions playground/policies/dqn.py
@@ -2,14 +2,8 @@
import tensorflow as tf
from gym.spaces import Box, Discrete

from playground.policies.base import (
BaseModelMixin,
Policy,
ReplayMemory,
ReplayTrajMemory,
TrainConfig,
Transition,
)
from playground.policies.base import BaseModelMixin, Policy, TrainConfig
from playground.policies.memory import ReplayMemory, ReplayTrajMemory, Transition
from playground.utils.misc import plot_learning_curve
from playground.utils.tf_ops import dense_nn, conv2d_net, lstm_net
