forked from louisnino/RLcode
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtutorial_DDPG.py
354 lines (297 loc) · 13.6 KB
/
tutorial_DDPG.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
"""
Deep Deterministic Policy Gradient (DDPG)
-----------------------------------------
An algorithm concurrently learns a Q-function and a policy.
It uses off-policy data and the Bellman equation to learn the Q-function,
and uses the Q-function to learn the policy.
Reference
---------
Deterministic Policy Gradient Algorithms, Silver et al. 2014
Continuous Control With Deep Reinforcement Learning, Lillicrap et al. 2016
MorvanZhou's tutorial page: https://morvanzhou.github.io/tutorials/
Environment
-----------
OpenAI Gym Pendulum-v0, continuous action space
Prerequisites
-------------
tensorflow >=2.0.0a0
tensorflow-probability 0.6.0
tensorlayer >=2.0.0
To run
------
python tutorial_DDPG.py --train/test
"""
import argparse
import os
import time
import gym
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorlayer as tl
# Command-line switches. Training is the default; --test clears the same
# `train` flag, so passing it skips training and only replays saved weights.
# (Previously --test wrote to a separate `test` attribute that nothing read,
# which left `args.train` permanently True.)
parser = argparse.ArgumentParser(description='Train or test neural net motor controller.')
parser.add_argument('--train', dest='train', action='store_true', default=True)
parser.add_argument('--test', dest='train', action='store_false')
args = parser.parse_args()

#####################  hyper parameters  ####################

ENV_NAME = 'Pendulum-v0'  # environment name
RANDOMSEED = 1  # random seed shared by gym, numpy and tensorflow

LR_A = 0.001  # learning rate for actor
LR_C = 0.002  # learning rate for critic
GAMMA = 0.9  # reward discount
TAU = 0.01  # soft replacement rate of the target networks (EMA decay is 1 - TAU)
MEMORY_CAPACITY = 10000  # size of replay buffer (number of transitions)
BATCH_SIZE = 32  # number of transitions sampled per update

MAX_EPISODES = 200  # total number of episodes for training
MAX_EP_STEPS = 200  # total number of steps for each episode
TEST_PER_EPISODES = 10  # run one noise-free evaluation episode every this many episodes
VAR = 3  # standard deviation of the Gaussian exploration noise
############################### DDPG ####################################
class DDPG(object):
    """
    Deep Deterministic Policy Gradient agent.

    Owns an actor network (state -> action), a critic network
    ((state, action) -> Q value), a target copy of each, a fixed-size replay
    buffer stored as a single numpy array, and one Adam optimizer per
    trainable network.
    """

    def __init__(self, a_dim, s_dim, a_bound):
        """
        Build the replay buffer, the four networks and the optimizers.

        :param a_dim: number of action components
        :param s_dim: number of state components
        :param a_bound: factor the tanh output of the actor is multiplied by
        """
        # Replay buffer: one row per transition laid out as [s, a, r, s_],
        # hence s_dim * 2 + a_dim + 1 columns.
        self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1), dtype=np.float32)
        self.pointer = 0  # total number of transitions stored so far
        self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound

        W_init = tf.random_normal_initializer(mean=0, stddev=0.3)
        b_init = tf.constant_initializer(0.1)

        def get_actor(input_state_shape, name=''):
            """
            Build actor network: takes a state, outputs an action.

            :param input_state_shape: shape of the state input
            :param name: suffix appended to the model name
            :return: actor model
            """
            inputs = tl.layers.Input(input_state_shape, name='A_input')
            x = tl.layers.Dense(n_units=30, act=tf.nn.relu, W_init=W_init, b_init=b_init, name='A_l1')(inputs)
            x = tl.layers.Dense(n_units=a_dim, act=tf.nn.tanh, W_init=W_init, b_init=b_init, name='A_a')(x)
            # tanh limits the output to [-1, 1]; scale it by a_bound afterwards.
            x = tl.layers.Lambda(lambda x: np.array(a_bound) * x)(x)
            return tl.models.Model(inputs=inputs, outputs=x, name='Actor' + name)

        def get_critic(input_state_shape, input_action_shape, name=''):
            """
            Build critic network: takes a state and an action, outputs Q(s, a).

            :param input_state_shape: shape of the state input
            :param input_action_shape: shape of the action input
            :param name: suffix appended to the model name
            :return: critic model
            """
            s = tl.layers.Input(input_state_shape, name='C_s_input')
            a = tl.layers.Input(input_action_shape, name='C_a_input')
            x = tl.layers.Concat(1)([s, a])
            x = tl.layers.Dense(n_units=60, act=tf.nn.relu, W_init=W_init, b_init=b_init, name='C_l1')(x)
            x = tl.layers.Dense(n_units=1, W_init=W_init, b_init=b_init, name='C_out')(x)
            return tl.models.Model(inputs=[s, a], outputs=x, name='Critic' + name)

        self.actor = get_actor([None, s_dim])
        self.critic = get_critic([None, s_dim], [None, a_dim])
        self.actor.train()
        self.critic.train()

        def copy_para(from_model, to_model):
            """
            Hard-copy the trainable weights of one model into another.

            Only used here, at construction, so that each target network
            starts out identical to its online network.

            :param from_model: latest model
            :param to_model: target model
            :return: None
            """
            for i, j in zip(from_model.trainable_weights, to_model.trainable_weights):
                j.assign(i)

        # Target actor: starts with the actor's weights, kept in eval mode.
        self.actor_target = get_actor([None, s_dim], name='_target')
        copy_para(self.actor, self.actor_target)
        self.actor_target.eval()

        # Target critic: starts with the critic's weights, kept in eval mode.
        self.critic_target = get_critic([None, s_dim], [None, a_dim], name='_target')
        copy_para(self.critic, self.critic_target)
        self.critic_target.eval()

        # NOTE(review): not referenced by any method of this class; kept so
        # the attribute stays available to external code.
        self.R = tl.layers.Input([None, 1], tf.float32, 'r')

        # Exponential moving average used for the soft target update.
        self.ema = tf.train.ExponentialMovingAverage(decay=1 - TAU)  # soft replacement

        self.actor_opt = tf.optimizers.Adam(LR_A)
        self.critic_opt = tf.optimizers.Adam(LR_C)

    def ema_update(self):
        """
        Soft-update the target networks with the moving average of the
        online actor and critic weights.

        :return: None
        """
        paras = self.actor.trainable_weights + self.critic.trainable_weights
        # Update the shadow (averaged) copies kept by the EMA object.
        self.ema.apply(paras)
        for i, j in zip(self.actor_target.trainable_weights + self.critic_target.trainable_weights, paras):
            i.assign(self.ema.average(j))  # write the averaged value into the target weight

    def choose_action(self, s):
        """
        Choose action: run the actor on a single state.

        :param s: state
        :return: act (first row of the actor output for the one-state batch)
        """
        return self.actor(np.array([s], dtype=np.float32))[0]

    def learn(self):
        """
        Update parameters: one critic step, one actor step, then a soft
        update of the target networks, using a random batch from the buffer.

        Only rows that have actually been written are sampled; if the buffer
        is still empty the call does nothing.

        :return: None
        """
        capacity = self.memory.shape[0]
        n_stored = min(self.pointer, capacity)
        if n_stored == 0:
            return  # nothing stored yet, nothing to learn from

        # Sample BATCH_SIZE row indices (with replacement) among filled rows.
        indices = np.random.choice(n_stored, size=BATCH_SIZE)
        bt = self.memory[indices, :]
        bs = bt[:, :self.s_dim]  # states
        ba = bt[:, self.s_dim:self.s_dim + self.a_dim]  # actions
        br = bt[:, -self.s_dim - 1:-self.s_dim]  # rewards
        bs_ = bt[:, -self.s_dim:]  # next states

        # Critic: regress Q(s, a) onto the bootstrapped target
        # br + GAMMA * Q_target(s_, actor_target(s_)).
        # The target only involves the target networks, so it is computed
        # outside the tape; gradients are taken w.r.t. critic weights only.
        a_ = self.actor_target(bs_)
        q_ = self.critic_target([bs_, a_])
        y = br + GAMMA * q_
        with tf.GradientTape() as tape:
            q = self.critic([bs, ba])
            td_error = tf.losses.mean_squared_error(y, q)
        c_grads = tape.gradient(td_error, self.critic.trainable_weights)
        self.critic_opt.apply_gradients(zip(c_grads, self.critic.trainable_weights))

        # Actor: push the policy towards actions with a higher Q value.
        with tf.GradientTape() as tape:
            a = self.actor(bs)
            q = self.critic([bs, a])
            # Negated because the optimizer minimizes: descending on -mean(Q)
            # is gradient ascent on Q.
            a_loss = -tf.reduce_mean(q)
        a_grads = tape.gradient(a_loss, self.actor.trainable_weights)
        self.actor_opt.apply_gradients(zip(a_grads, self.actor.trainable_weights))

        self.ema_update()

    def store_transition(self, s, a, r, s_):
        """
        Store data in data buffer

        :param s: state
        :param a: act
        :param r: reward
        :param s_: next state
        :return: None
        """
        # Cast the states to float32, the dtype the networks are fed with.
        s = s.astype(np.float32)
        s_ = s_.astype(np.float32)

        # Flatten the transition into one row: [s, a, r, s_].
        transition = np.hstack((s, a, [r], s_))

        # `pointer` counts every transition ever stored; the row to write is
        # that count modulo the buffer capacity, so once the buffer is full
        # the oldest rows are overwritten first.
        index = self.pointer % self.memory.shape[0]
        self.memory[index, :] = transition
        self.pointer += 1

    def save_ckpt(self):
        """
        save trained weights

        :return: None
        """
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs('model', exist_ok=True)

        tl.files.save_weights_to_hdf5('model/ddpg_actor.hdf5', self.actor)
        tl.files.save_weights_to_hdf5('model/ddpg_actor_target.hdf5', self.actor_target)
        tl.files.save_weights_to_hdf5('model/ddpg_critic.hdf5', self.critic)
        tl.files.save_weights_to_hdf5('model/ddpg_critic_target.hdf5', self.critic_target)

    def load_ckpt(self):
        """
        load trained weights

        :return: None
        """
        tl.files.load_hdf5_to_weights_in_order('model/ddpg_actor.hdf5', self.actor)
        tl.files.load_hdf5_to_weights_in_order('model/ddpg_actor_target.hdf5', self.actor_target)
        tl.files.load_hdf5_to_weights_in_order('model/ddpg_critic.hdf5', self.critic)
        tl.files.load_hdf5_to_weights_in_order('model/ddpg_critic_target.hdf5', self.critic_target)
if __name__ == '__main__':
    # Script entry point: build the environment and the agent, optionally
    # train (with periodic noise-free evaluation and a live reward plot),
    # then reload the saved weights and render the policy until interrupted.

    # Create the environment and take the underlying env without wrappers.
    env = gym.make(ENV_NAME)
    env = env.unwrapped

    # Seed every random source so that runs are reproducible.
    env.seed(RANDOMSEED)
    np.random.seed(RANDOMSEED)
    tf.random.set_seed(RANDOMSEED)

    # State size, action size and action magnitude bound.
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_bound = env.action_space.high

    print('s_dim',s_dim)
    print('a_dim',a_dim)

    ddpg = DDPG(a_dim, s_dim, a_bound)

    if args.train:  # train
        reward_buffer = []  # total reward of each evaluation episode
        t0 = time.time()  # start of the whole training run
        for i in range(MAX_EPISODES):
            t1 = time.time()
            s = env.reset()
            ep_reward = 0  # total reward of the current episode
            for j in range(MAX_EP_STEPS):
                # Deterministic action from the actor ...
                a = ddpg.choose_action(s)
                # ... plus exploration noise: sample from a normal
                # distribution centred on it with standard deviation VAR,
                # then clip to the same bound the actor output is scaled to.
                a = np.clip(np.random.normal(a, VAR), -a_bound, a_bound)

                s_, r, done, info = env.step(a)

                # The reward is divided by 10 before being stored.
                ddpg.store_transition(s, a, r / 10, s_)

                # Start learning once the replay buffer has been filled.
                if ddpg.pointer > MEMORY_CAPACITY:
                    ddpg.learn()

                s = s_
                ep_reward += r

                if j == MAX_EP_STEPS - 1:
                    print(
                        '\rEpisode: {}/{}  | Episode Reward: {:.4f}  | Running Time: {:.4f}'.format(
                            i, MAX_EPISODES, ep_reward,
                            time.time() - t1
                        ), end=''
                    )

            # Periodic evaluation: same loop without exploration noise.
            if i and not i % TEST_PER_EPISODES:
                t1 = time.time()
                s = env.reset()
                ep_reward = 0
                for j in range(MAX_EP_STEPS):
                    a = ddpg.choose_action(s)
                    s_, r, done, info = env.step(a)

                    s = s_
                    ep_reward += r
                    if j == MAX_EP_STEPS - 1:
                        print(
                            '\rEpisode: {}/{}  | Episode Reward: {:.4f}  | Running Time: {:.4f}'.format(
                                i, MAX_EPISODES, ep_reward,
                                time.time() - t1
                            )
                        )
                        reward_buffer.append(ep_reward)

            # Redraw the evaluation curve. The k-th evaluation (0-based) was
            # run at episode (k + 1) * TEST_PER_EPISODES.
            if reward_buffer:
                plt.ion()
                plt.cla()
                plt.title('DDPG')
                eval_episodes = (np.arange(len(reward_buffer)) + 1) * TEST_PER_EPISODES
                plt.plot(eval_episodes, reward_buffer)
                plt.xlabel('episode steps')
                plt.ylabel('normalized state-action value')
                plt.ylim(-2000, 0)
                plt.show()
                plt.pause(0.1)

        # Leave interactive mode and show the final figure.
        plt.ioff()
        plt.show()
        print('\nRunning time: ', time.time() - t0)
        ddpg.save_ckpt()

    # test: reload the saved weights and render the policy in an endless
    # loop; the environment is closed when the loop is interrupted.
    ddpg.load_ckpt()
    try:
        while True:
            s = env.reset()
            for i in range(MAX_EP_STEPS):
                env.render()
                s, r, done, info = env.step(ddpg.choose_action(s))
                if done:
                    break
    finally:
        env.close()