param refactoring - actor-critic
lilianweng committed Jan 2, 2019
1 parent 83a9472 commit 71c44c1
Showing 9 changed files with 86 additions and 74 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -4,6 +4,7 @@
__pycache__
checkpoints/*
logs/*
+ tb/*
figs/*
**/*.pyc
**/*.egg-info
6 changes: 3 additions & 3 deletions playground/configs/data/actor-critic-cartpole-v1.json
@@ -2,9 +2,6 @@
"env_name": "CartPole-v1",
"policy_name": "ActorCriticPolicy",
"policy_params": {
"epsilon": 1.0,
"epsilon_final": 0.05,
"batch_size": 16,
"layer_sizes": [32],
"deterministic": true
},
@@ -13,6 +10,9 @@
"lr_a_decay": 0.999,
"lr_c": 0.01,
"lr_c_decay": 0.999,
"epsilon": 1.0,
"epsilon_final": 0.05,
"batch_size": 32,
"n_episodes": 800,
"annealing_episodes": 720,
"log_every_episode": 10,
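The net effect here is that the exploration schedule (epsilon, epsilon_final) and batch_size move from policy_params into train_params, with batch_size raised from 16 to 32. A rough sketch of how the split might be consumed, assuming a nested TrainConfig class that accepts keyword overrides; the loading code below is illustrative, not part of this diff:

    import json
    import gym
    from playground.policies.actor_critic import ActorCriticPolicy

    with open('playground/configs/data/actor-critic-cartpole-v1.json') as f:
        cfg = json.load(f)

    env = gym.make(cfg['env_name'])
    # policy_params now only carries model options (layer_sizes, deterministic).
    policy = ActorCriticPolicy(env, 'actor-critic-cartpole-v1', **cfg['policy_params'])
    # train_params carries learning rates, the epsilon schedule and batch_size.
    policy.train(ActorCriticPolicy.TrainConfig(**cfg['train_params']))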
4 changes: 2 additions & 2 deletions playground/configs/data/ddpg-bipedalwalker-v2.json
@@ -8,9 +8,9 @@
"deterministic": true
},
"train_params": {
"n_steps": 50000,
"n_steps": 40000,
"warmup_steps": 10000,
"batch_size": 64,
"batch_size": 128,
"lr_a": 0.005,
"lr_c": 0.005,
"epsilon": 0.35,
26 changes: 12 additions & 14 deletions playground/policies/actor_critic.py
@@ -4,16 +4,16 @@
import tensorflow as tf
from gym.spaces import Discrete

- from playground.policies.base import BaseTFModelMixin, Policy, ReplayMemory, Config
+ from playground.policies.base import BaseModelMixin, Policy, ReplayMemory, Config
from playground.utils.misc import plot_learning_curve
from playground.utils.tf_ops import dense_nn


- class ActorCriticPolicy(Policy, BaseTFModelMixin):
+ class ActorCriticPolicy(Policy, BaseModelMixin):

def __init__(self, env, name, training=True, gamma=0.9, layer_sizes=None, clip_norm=None, **kwargs):
Policy.__init__(self, env, name, training=training, gamma=gamma, **kwargs)
- BaseTFModelMixin.__init__(self, name)
+ BaseModelMixin.__init__(self, name)

assert isinstance(self.env.action_space, Discrete), \
"Current implementation only works for discrete action space."
@@ -32,7 +32,7 @@ def act(self, state, eps=0.1):

def _build_networks(self):
# Define input placeholders
- self.s = tf.placeholder(tf.float32, shape=(None, self.state_dim), name='state')
+ self.s = tf.placeholder(tf.float32, shape=[None] + self.state_dim, name='state')
self.a = tf.placeholder(tf.int32, shape=(None,), name='action')
self.r = tf.placeholder(tf.float32, shape=(None,), name='reward')
self.td_target = tf.placeholder(tf.float32, shape=(None,), name='td_target')
@@ -62,8 +62,7 @@ def _build_train_ops(self):
self.optim_c = tf.train.AdamOptimizer(self.learning_rate_c)
self.grads_c = self.optim_c.compute_gradients(self.loss_c, self.critic_vars)
if self.clip_norm:
- self.grads_c = [(tf.clip_by_norm(grad, self.clip_norm), var)
- for grad, var in self.grads_c]
+ self.grads_c = [(tf.clip_by_norm(grad, self.clip_norm), var) for grad, var in self.grads_c]

self.train_op_c = self.optim_c.apply_gradients(self.grads_c)

@@ -77,8 +76,7 @@ def _build_train_ops(self):
self.optim_a = tf.train.AdamOptimizer(self.learning_rate_a)
self.grads_a = self.optim_a.compute_gradients(self.loss_a, self.actor_vars)
if self.clip_norm:
- self.grads_a = [(tf.clip_by_norm(grad, self.clip_norm), var)
- for grad, var in self.grads_a]
+ self.grads_a = [(tf.clip_by_norm(grad, self.clip_norm), var) for grad, var in self.grads_a]

self.train_op_a = self.optim_a.apply_gradients(self.grads_a)

@@ -117,7 +115,7 @@ class TrainConfig(Config):
epsilon = 1.0
epsilon_final = 0.05

- def train(self, n_episodes, config: TrainConfig):
+ def train(self, config: TrainConfig):
BufferRecord = namedtuple('Record', ['s', 'a', 'r', 'td_target'])
buffer = ReplayMemory(tuple_class=BufferRecord)

@@ -130,11 +128,11 @@ def train(self, n_episodes, config: TrainConfig):
lr_a = config.lr_a

eps = config.epsilon
- annealing_episodes = config.annealing_episodes or n_episodes
+ annealing_episodes = config.annealing_episodes or config.n_episodes
eps_drop = (eps - config.epsilon_final) / annealing_episodes
print("eps_drop:", eps_drop)
print("Decrease epsilon per step:", eps_drop)

- for n_episode in range(n_episodes):
+ for n_episode in range(config.n_episodes):
ob = self.env.reset()
self.act(ob, eps)
done = False
@@ -189,9 +187,9 @@ def train(self, n_episodes, config: TrainConfig):
np.mean(reward_history[-10:]), reward_history[-5:],
lr_c, lr_a, eps,
))
- # self.save_model(step=step)
+ # self.save_checkpoint(step=step)

- self.save_model(step=step)
+ self.save_checkpoint(step=step)

print("[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format(
len(reward_history), np.max(reward_history), np.mean(reward_history)))
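With n_episodes, the epsilon schedule and batch_size now read off the config object, the train() signature shrinks to train(self, config). A minimal sketch of the new calling convention, using the values from actor-critic-cartpole-v1.json and assuming TrainConfig accepts keyword overrides; any graph-building step the policy requires before training is omitted here:

    import gym
    from playground.policies.actor_critic import ActorCriticPolicy

    env = gym.make('CartPole-v1')
    policy = ActorCriticPolicy(env, 'ac-cartpole', training=True, layer_sizes=[32])

    config = ActorCriticPolicy.TrainConfig(
        lr_c=0.01, lr_c_decay=0.999,
        epsilon=1.0, epsilon_final=0.05,   # annealed linearly over annealing_episodes
        batch_size=32,
        n_episodes=800, annealing_episodes=720,
        log_every_episode=10,
    )
    policy.train(config)  # previously: policy.train(n_episodes, config)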
99 changes: 56 additions & 43 deletions playground/policies/base.py
@@ -132,13 +132,6 @@ def state_dim(self):
def obs_to_inputs(self, ob):
return ob.flatten()

- def get_vars(self, scope, only_trainable=True):
- collection = tf.GraphKeys.TRAINABLE_VARIABLES if only_trainable else tf.GraphKeys.VARIABLES
- variables = tf.get_collection(collection, scope=scope)
- print(scope, variables)
- assert len(variables) > 0
- return variables

def act(self, state, **kwargs):
pass

@@ -168,55 +161,80 @@ def evaluate(self, n_episodes):
print("Avg. reward over {} episodes: {:.4f}".format(n_episodes, np.mean(reward_history)))


- class BaseTFModelMixin:
- """Abstract object representing an Reader model.
- Code borrowed from: https://github.com/devsisters/DQN-tensorflow/blob/master/dqn/base.py
- with some modifications.
+ class BaseModelMixin:
+ """Abstract object representing an tensorflow model that can be easily saved/loaded.
+ Modified based on https://github.com/devsisters/DQN-tensorflow/blob/master/dqn/base.py
"""

- def __init__(self, model_name, saver_max_to_keep=5):
+ def __init__(self, model_name, tf_sess_config=None):
self._saver = None
- self._saver_max_to_keep = saver_max_to_keep
self._writer = None
self._model_name = model_name
self._sess = None

- def scope_vars(self, scope):
- res = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)
- assert len(res) > 0
- print("Variables in scope '%s'" % scope)
- for v in res:
+ if tf_sess_config is None:
+ tf_sess_config = {
+ 'allow_soft_placement': True,
+ 'intra_op_parallelism_threads': 8,
+ 'inter_op_parallelism_threads': 4,
+ }
+ self.tf_sess_config = tf_sess_config

+ def scope_vars(self, scope, only_trainable=True):
+ collection = tf.GraphKeys.TRAINABLE_VARIABLES if only_trainable else tf.GraphKeys.VARIABLES
+ variables = tf.get_collection(collection, scope=scope)
+ assert len(variables) > 0
+ print(f"Variables in scope '{scope}':")
+ for v in variables:
print("\t" + str(v))
- return res
+ return variables

- def save_model(self, step=None):
+ def get_variable_values(self):
+ t_vars = tf.trainable_variables()
+ vals = self.sess.run(t_vars)
+ return {v.name: value for v, value in zip(t_vars, vals)}

+ def save_checkpoint(self, step=None):
print(colorize(" [*] Saving checkpoints...", "green"))
ckpt_file = os.path.join(self.checkpoint_dir, self.model_name)
self.saver.save(self.sess, ckpt_file, global_step=step)

- def load_model(self):
+ def load_checkpoint(self):
print(colorize(" [*] Loading checkpoints...", "green"))

- ckpt = tf.train.get_checkpoint_state(self.checkpoint_dir)
- print(self.checkpoint_dir, ckpt)
- if ckpt and ckpt.model_checkpoint_path:
- ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
- print(ckpt_name)
- fname = os.path.join(self.checkpoint_dir, ckpt_name)
- print(fname)
- self.saver.restore(self.sess, fname)
- print(colorize(" [*] Load SUCCESS: %s" % fname, "green"))
+ ckpt_path = tf.train.latest_checkpoint(self.checkpoint_dir)
+ print(self.checkpoint_dir)
+ print("ckpt_path:", ckpt_path)

+ if ckpt_path:
+ # self._saver = tf.train.import_meta_graph(ckpt_path + '.meta')
+ self.saver.restore(self.sess, ckpt_path)
+ print(colorize(" [*] Load SUCCESS: %s" % ckpt_path, "green"))
return True
else:
print(colorize(" [!] Load FAILED: %s" % self.checkpoint_dir, "red"))
return False

+ def _get_dir(self, dir_name):
+ path = os.path.join(REPO_ROOT, dir_name, self.model_name)
+ os.makedirs(path, exist_ok=True)
+ return path

+ @property
+ def log_dir(self):
+ return self._get_dir('logs')

@property
def checkpoint_dir(self):
- ckpt_path = os.path.join(REPO_ROOT, 'checkpoints', self.model_name)
- os.makedirs(ckpt_path, exist_ok=True)
- return ckpt_path
+ return self._get_dir('checkpoints')

+ @property
+ def model_dir(self):
+ return self._get_dir('models')

+ @property
+ def tb_dir(self):
+ # tensorboard
+ return self._get_dir('tb')

@property
def model_name(self):
@@ -226,24 +244,19 @@ def model_name(self):
@property
def saver(self):
if self._saver is None:
- self._saver = tf.train.Saver(max_to_keep=self._saver_max_to_keep)
+ self._saver = tf.train.Saver(max_to_keep=5)
return self._saver

@property
def writer(self):
if self._writer is None:
writer_path = os.path.join(REPO_ROOT, "logs", self.model_name)
os.makedirs(writer_path, exist_ok=True)
self._writer = tf.summary.FileWriter(writer_path, self.sess.graph)
self._writer = tf.summary.FileWriter(self.tb_dir, self.sess.graph)
return self._writer

@property
def sess(self):
if self._sess is None:
- config = tf.ConfigProto()

- config.intra_op_parallelism_threads = 2
- config.inter_op_parallelism_threads = 2
+ config = tf.ConfigProto(**self.tf_sess_config)
self._sess = tf.Session(config=config)

return self._sess
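Taken together, BaseModelMixin now owns session configuration (tf_sess_config), a uniform directory layout under logs/, checkpoints/, models/ and tb/ (hence the new .gitignore entry), and checkpointing via save_checkpoint/load_checkpoint. A small usage sketch; the ToyModel class below is hypothetical and only exercises the mixin:

    import tensorflow as tf
    from playground.policies.base import BaseModelMixin

    class ToyModel(BaseModelMixin):
        def __init__(self):
            # Default tf_sess_config applies; pass a dict to override threading options.
            BaseModelMixin.__init__(self, 'toy-model')
            with tf.variable_scope('toy'):
                self.w = tf.get_variable('w', shape=[4, 2])

    m = ToyModel()
    m.sess.run(tf.global_variables_initializer())
    print(m.checkpoint_dir)     # <REPO_ROOT>/checkpoints/toy-model
    print(m.tb_dir)             # <REPO_ROOT>/tb/toy-model
    m.scope_vars('toy')         # trainable variables only, by default
    m.save_checkpoint(step=0)   # replaces the old save_model()
    m.load_checkpoint()         # restores from tf.train.latest_checkpoint()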
12 changes: 6 additions & 6 deletions playground/policies/ddpg.py
@@ -4,17 +4,17 @@
import tensorflow as tf
from gym.spaces import Box, Discrete

- from playground.policies.base import BaseTFModelMixin, Policy, ReplayMemory, TrainConfig
+ from playground.policies.base import BaseModelMixin, Policy, ReplayMemory, TrainConfig
from playground.utils.misc import plot_learning_curve
from playground.utils.tf_ops import dense_nn


- class DDPGPolicy(Policy, BaseTFModelMixin):
+ class DDPGPolicy(Policy, BaseModelMixin):

def __init__(self, env, name, training=True, gamma=0.9,
actor_layers=[64, 32], critic_layers=[128, 64], **kwargs):
Policy.__init__(self, env, name, training=training, gamma=gamma, **kwargs)
- BaseTFModelMixin.__init__(self, name)
+ BaseModelMixin.__init__(self, name)

assert isinstance(self.env.action_space, Box), \
"Current implementation only works for continuous action space."
@@ -53,12 +53,12 @@ def _build_networks(self):
self.Q_target = dense_nn(tf.concat([self.s_next, self.mu_target], axis=1),
self.critic_layers + [1], name='Q')

- self.Q_vars = self.get_vars('primary/Q')
- self.mu_vars = self.get_vars('primary/mu')
+ self.Q_vars = self.scope_vars('primary/Q')
+ self.mu_vars = self.scope_vars('primary/mu')

# sanity check
self.primary_vars = self.Q_vars + self.mu_vars
- self.target_vars = self.get_vars('target/Q') + self.get_vars('target/mu')
+ self.target_vars = self.scope_vars('target/Q') + self.scope_vars('target/mu')
assert len(self.primary_vars) == len(self.target_vars)

def init_target_net(self):
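Because the primary and target variable lists are collected in the same order (Q first, then mu), the length assert is a cheap sanity check before pairing them positionally for target-network updates. An illustrative sketch of building soft-update ops from those lists; this helper and the tau value are not part of the diff:

    import tensorflow as tf

    def build_target_update_ops(primary_vars, target_vars, tau=0.01):
        # Pair variables positionally; both lists must follow the same scope order.
        assert len(primary_vars) == len(target_vars)
        return [t_var.assign((1.0 - tau) * t_var + tau * p_var)
                for p_var, t_var in zip(primary_vars, target_vars)]

    # e.g. inside the policy, after _build_networks():
    #   self.update_target_ops = build_target_update_ops(self.primary_vars, self.target_vars)
    #   self.sess.run(self.update_target_ops)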
6 changes: 3 additions & 3 deletions playground/policies/dqn.py
@@ -3,7 +3,7 @@
from gym.spaces import Box, Discrete

from playground.policies.base import (
- BaseTFModelMixin,
+ BaseModelMixin,
Policy,
ReplayMemory,
ReplayTrajMemory,
@@ -13,7 +13,7 @@
from playground.utils.tf_ops import dense_nn, conv2d_net, lstm_net


- class DqnPolicy(Policy, BaseTFModelMixin):
+ class DqnPolicy(Policy, BaseModelMixin):
def __init__(self, env, name,
training=True,
gamma=0.99,
@@ -35,7 +35,7 @@ def __init__(self, env, name,
model_params: 'layer_sizes', 'step_size', 'lstm_layers', 'lstm_size'
"""
Policy.__init__(self, env, name, gamma=gamma, training=training)
- BaseTFModelMixin.__init__(self, name, saver_max_to_keep=5)
+ BaseModelMixin.__init__(self, name, saver_max_to_keep=5)

assert isinstance(self.env.action_space, Discrete)
assert isinstance(self.env.observation_space, Box)
6 changes: 3 additions & 3 deletions playground/policies/reinforce.py
@@ -1,16 +1,16 @@
import numpy as np
import tensorflow as tf
- from playground.policies.base import BaseTFModelMixin, Policy
+ from playground.policies.base import BaseModelMixin, Policy
from playground.utils.misc import plot_learning_curve
from playground.utils.tf_ops import dense_nn


- class ReinforcePolicy(Policy, BaseTFModelMixin):
+ class ReinforcePolicy(Policy, BaseModelMixin):
def __init__(self, env, name, training=True, gamma=0.99,
lr=0.001, lr_decay=0.999, batch_size=32, layer_sizes=None,
baseline=False):
Policy.__init__(self, env, name, training=training, gamma=gamma)
- BaseTFModelMixin.__init__(self, name)
+ BaseModelMixin.__init__(self, name)

self.lr = lr
self.lr_decay = lr_decay
File renamed without changes.
