# DQN_TORCS.py
import argparse
import base64
from datetime import datetime
import os
import shutil
import numpy as np
import socketio
import eventlet
import eventlet.wsgi
from PIL import Image
from flask import Flask
from io import BytesIO
# RF imports
from gym_torcs import TorcsEnv
import gym
import tensorflow as tf
import random
from collections import deque
import cv2
# Hyper-parameters for the DQN agent.
# Size of the network's readout layer and of the action vector sent to the env.
ACTIONS = 3
# Discount factor applied to the bootstrapped next-state value.
GAMMA = 0.99
# Training starts once the step index within an episode exceeds this value.
OBSERVE = 10
# Frames over which to anneal epsilon (only used for the status label below).
EXPLORE = 20.0
# Maximum number of transitions kept in the replay memory.
REPLAY_MEMORY = 50
# Number of transitions sampled for each gradient step.
BATCH = 5
##########################################################################################################################################################
# Weights
def weight_variable(shape):
    """Return a trainable weight variable of the given shape.

    Values are drawn from a truncated normal distribution with a standard
    deviation of 0.01.
    """
    return tf.Variable(tf.truncated_normal(shape, stddev=0.01))
def bias_variable(shape):
    """Return a trainable bias variable of the given shape, filled with 0.01."""
    return tf.Variable(tf.constant(0.01, shape=shape))
# Convolution helper
def conv2d(x, W, stride):
    """Convolve ``x`` with kernel ``W`` using SAME padding.

    ``stride`` is applied to both spatial dimensions; the batch and channel
    strides are fixed at 1.
    """
    step_sizes = [1, stride, stride, 1]
    return tf.nn.conv2d(x, W, strides=step_sizes, padding="SAME")
# Pooling: 2x2 kernel, stride 2
def max_pool_2x2(x):
    """Apply 2x2 max pooling with stride 2 and SAME padding to ``x``."""
    window = [1, 2, 2, 1]  # used for both the kernel size and the strides
    return tf.nn.max_pool(x, ksize=window, strides=window, padding="SAME")
# CNN
def createNetwork():
    """Build the convolutional Q-network graph.

    Three tanh convolution layers (the first one followed by 2x2 max
    pooling) feed a 512-unit tanh fully connected layer and a linear readout
    layer with ``ACTIONS`` outputs.

    Returns:
        A tuple ``(s, readout, h_fc1)``: the input placeholder of shape
        ``[None, 80, 80, 4]``, the readout tensor and the fully connected
        hidden layer.
    """
    # All parameters are created first, in this exact order, before any
    # layer op is built; the creation order is deliberately left untouched.
    conv1_kernel = weight_variable([8, 8, 4, 32])  # 8x8 patches, 4 input channels, 32 feature maps
    conv1_bias = bias_variable([32])
    conv2_kernel = weight_variable([4, 4, 32, 64])
    conv2_bias = bias_variable([64])
    conv3_kernel = weight_variable([3, 3, 64, 64])
    conv3_bias = bias_variable([64])
    fc1_weights = weight_variable([1600, 512])
    fc1_bias = bias_variable([512])
    fc2_weights = weight_variable([512, ACTIONS])
    fc2_bias = bias_variable([ACTIONS])

    # Input layer: batches of 80x80 states with 4 channels.
    state_in = tf.placeholder("float", [None, 80, 80, 4])

    # First hidden layer (stride 4) followed by the only pooling layer.
    conv1_out = tf.nn.tanh(conv2d(state_in, conv1_kernel, 4) + conv1_bias)
    pooled1 = max_pool_2x2(conv1_out)

    # Second hidden layer (stride 2).
    conv2_out = tf.nn.tanh(conv2d(pooled1, conv2_kernel, 2) + conv2_bias)

    # Third hidden layer (stride 1).
    conv3_out = tf.nn.tanh(conv2d(conv2_out, conv3_kernel, 1) + conv3_bias)

    # Flatten to 1600 features per sample for the dense layers.
    flattened = tf.reshape(conv3_out, [-1, 1600])

    # First fully connected layer.
    hidden = tf.nn.tanh(tf.matmul(flattened, fc1_weights) + fc1_bias)

    # Readout (output) layer: linear, one value per action dimension.
    q_values = tf.matmul(hidden, fc2_weights) + fc2_bias

    return state_in, q_values, hidden
###################################################################################################################################################
# Hyper-parameters for the training loop.
# Upper bound on the number of episodes.
EPISODE = 10000
# Upper bound on the number of steps within one episode.
STEP = 30
# Number of test runs every 100 episodes (not referenced in the code shown here).
TEST = 5
def _preprocess_frame(img):
    """Turn a raw camera image from the env into a binarised 80x80 frame.

    The first and last axes are swapped before the image is handed to OpenCV
    (presumably channel-first -> channel-last; TODO confirm against
    ``TorcsEnv``). The image is then resized to 80x80, converted to
    grayscale and thresholded so every pixel brighter than 1 becomes 255.
    """
    frame = cv2.resize(img.swapaxes(0, 2), (80, 80))
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    _, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
    return frame


def trainNetwork(s, readout, h_fc1, sess):
    """Train the Q-network on TORCS using an experience-replay buffer.

    Args:
        s: input placeholder of the network (batches of 80x80x4 states).
        readout: output tensor of the network, ``ACTIONS`` values per state.
        h_fc1: hidden fully connected layer; kept for interface
            compatibility, not used here.
        sess: TensorFlow session used for every graph evaluation.

    Side effects:
        Drives the TORCS environment, prints progress to stdout and writes a
        checkpoint under ``saved_networks/`` every 10000 environment steps.
    """
    # Variables that exist before this function adds anything to the graph.
    preexisting = {v.name for v in tf.global_variables()}

    # Cost function: squared error between the target y and the readout
    # weighted by the action vector a.
    a = tf.placeholder("float", [None, ACTIONS])
    y = tf.placeholder("float", [None])
    readout_action = tf.reduce_sum(tf.multiply(readout, a), axis=1)
    cost = tf.reduce_mean(tf.square(y - readout_action))
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # minimize() adds the optimizer's own variables to the graph. The caller
    # ran its initializer before they existed, so initialise exactly the
    # variables created above; already-initialised weights are left alone.
    created_here = [v for v in tf.global_variables() if v.name not in preexisting]
    if created_here:
        sess.run(tf.variables_initializer(created_here))

    # Checkpointing (the original referenced an undefined saver and GAME).
    saver = tf.train.Saver()
    checkpoint_dir = 'saved_networks'
    game_name = 'torcs'

    # Replay memory; maxlen drops the oldest transition automatically.
    D = deque(maxlen=REPLAY_MEMORY)

    # NOTE(review): epsilon is constant; EXPLORE only affects the printed label.
    epsilon = 0.1
    total_steps = 0  # environment steps across all episodes, drives checkpointing

    for episode in range(EPISODE):
        # Open up a game state to communicate with the emulator.
        # NOTE(review): a new TorcsEnv is constructed every episode and never
        # explicitly shut down here; confirm that this is intended.
        env = TorcsEnv(vision=True, throttle=True, gear_change=False)
        if np.mod(episode, 3) == 0:
            # Relaunch TORCS every 3 episodes because of the memory leak error.
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        # Get the first state of every episode by doing nothing, so the frame
        # stack always matches the freshly reset environment instead of
        # carrying over the last state of the previous episode.
        do_nothing = np.zeros(ACTIONS)
        ob, r_0, done, info = env.step(do_nothing)
        x_t = _preprocess_frame(ob.img)
        # Replicate the first frame 4 times to form the 80x80x4 network input.
        s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

        for step in range(STEP):
            # Choose an action epsilon-greedily.
            readout_t = sess.run(readout, feed_dict={s: [s_t]})[0]
            print("readout_t", readout_t)
            if random.random() <= epsilon:
                print("Random Action")
                a_t = np.array([random.random() for _ in range(ACTIONS)])
            else:
                # The raw network outputs are sent to the env as the action.
                a_t = np.array(readout_t[:ACTIONS], dtype=np.float64)

            # Run the selected action and observe the next state and reward.
            ob, r_t, done, info = env.step(a_t)
            x_t1 = np.reshape(_preprocess_frame(ob.img), (80, 80, 1))
            # Newest frame goes first; the oldest of the four is dropped.
            s_t1 = np.append(x_t1, s_t[:, :, :3], axis=2)

            # Store the transition in the replay memory.
            D.append((s_t, a_t, r_t, s_t1, done))

            # Only train once done observing (and once a full batch exists).
            if step > OBSERVE and len(D) >= BATCH:
                minibatch = random.sample(D, BATCH)
                s_j_batch = [d[0] for d in minibatch]
                a_batch = [d[1] for d in minibatch]
                r_batch = [d[2] for d in minibatch]
                s_j1_batch = [d[3] for d in minibatch]
                done_batch = [d[4] for d in minibatch]

                readout_j1_batch = sess.run(readout, feed_dict={s: s_j1_batch})
                # Target: the reward alone for terminal transitions, otherwise
                # the reward plus the discounted best next-state value.
                y_batch = [
                    r if ended else r + GAMMA * np.max(q_next)
                    for r, ended, q_next in zip(r_batch, done_batch, readout_j1_batch)
                ]

                # Perform the gradient step.
                sess.run(train_step, feed_dict={y: y_batch, a: a_batch, s: s_j_batch})

            # Update the old values.
            s_t = s_t1
            total_steps += 1
            timestep = step + 1  # 1-based step within the episode, as printed before

            # Save progress every 10000 environment steps.
            if total_steps % 10000 == 0:
                if not os.path.isdir(checkpoint_dir):
                    os.makedirs(checkpoint_dir)
                saver.save(sess, checkpoint_dir + '/' + game_name + '-dqn', global_step=total_steps)

            # Print info.
            if timestep <= OBSERVE:
                state = "observe"
            elif timestep <= OBSERVE + EXPLORE:
                state = "explore"
            else:
                state = "train"
            print("TIMESTEP", timestep, "/ STATE", state, "/ EPSILON", epsilon, "/ REWARD", r_t, "/ Q_MAX %e" % np.max(readout_t))

            # Stop stepping an environment that has reported termination.
            if done:
                break
def playGame():
    """Create the session and the Q-network, initialise it and run training.

    The session is closed when training finishes or raises, so its resources
    are always released.
    """
    sess = tf.InteractiveSession()
    try:
        s, readout, h_fc1 = createNetwork()
        # global_variables_initializer replaces the deprecated
        # initialize_all_variables.
        sess.run(tf.global_variables_initializer())
        print("create net work successfully")
        trainNetwork(s, readout, h_fc1, sess)
    finally:
        sess.close()
# Script entry point: start training only when this file is executed directly.
if __name__ == "__main__":
    playGame()