# simple_rl.py
# Author: Nick Knowles
# Date: 8/21
# Some basic OpenAI Gym usage and tabular Q-learning on CartPole, kept for general reference.
# TODO: maybe experiment with policy iteration here some day, time permitting.
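#
# Overview: the continuous CartPole observation is discretised into coarse bins,
# a tabular Q function keyed by the stringified binned state is updated online,
# actions are chosen epsilon-greedily, and both the learning rate and the
# exploration rate are annealed with a cosine schedule that restarts every few episodes.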
import numpy as np
from PIL import Image
import random
import math
import gym
from collections import deque
env = gym.make('CartPole-v0')
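# Note: this script targets the old Gym API (env.reset() returning just the
# observation, env.step() returning a 4-tuple, env.render(mode='rgb_array')),
# so it assumes a pre-0.26 gym release, plus Pillow for the GIF export.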
Q_table = {}
num_actions = env.action_space.n
discount_factor = 0.99999
""" BINS for state space """
angle_bins = [-0.25, -0.15, 0.0, 0.15, 0.25]
a_veloc_bins = [-0.87, 0.87]
x_veloc_bins = [-0.5, 0.5]
x_bins = [0]
frames = deque(maxlen=1000)  # most recent rendered frames, kept for GIF export
def make_gif(name):
    """Dump the collected frames as an animated GIF (change the path if necessary)."""
    with open(name + '.gif', 'wb') as f:
        frames[0].save(f, format='GIF', save_all=True, append_images=list(frames)[1:])
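# make_gif is called from the training loop below ('solving' after a long episode,
# 'solved' once the average-score threshold is reached) and writes whatever frames
# are currently in the deque, i.e. at most the last 1000 rendered frames.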
ave_score = 0.0             # running average episode length over the last 100 episodes
scores = deque(maxlen=100)  # per-episode step counts backing that running average
def add_state(S):
    """Create a Q-table row for state S, initialising every action value to -1."""
    global Q_table
    Q_table[S] = {}
    for i in range(num_actions):
        Q_table[S][i] = -1
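# Unseen states start with every action value at -1, a mildly pessimistic
# initialisation relative to the +1-per-step reward CartPole hands out.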
def policy(S):
    """
    Epsilon-greedy policy: either return the action with the highest value
    for state S, or a random action.
    """
    global Q_table
    if S not in Q_table:
        add_state(S)
    action_values = list(Q_table[S].values())
    if np.random.uniform(0, 1.0) >= exploration_rate:
        act = action_values.index(max(action_values))   # exploit
    else:
        act = random.randrange(num_actions)              # explore
    return act
def Q(S, a, r, S_):
    """Apply one tabular Q-learning update for the transition (S, a, r, S_)."""
    global Q_table
    old_Q = Q_table[S][a]
    new_Q = max(Q_table[S_].values())
    Q_table[S][a] += learn_rate * (r + (discount_factor * new_Q) - old_Q)
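# The update above is the standard tabular Q-learning rule
#   Q(S, a) <- Q(S, a) + alpha * (r + gamma * max_a' Q(S_, a') - Q(S, a))
# with alpha = learn_rate and gamma = discount_factor.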
def bin_state(S):
    """Discretise a raw CartPole observation in place into integer bin indices."""
    S[0] = np.digitize(S[0], x_bins)
    S[1] = np.digitize(S[1], x_veloc_bins)
    S[2] = np.digitize(S[2], angle_bins)
    S[3] = np.digitize(S[3], a_veloc_bins)
    return S
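# For example, with angle_bins = [-0.25, -0.15, 0.0, 0.15, 0.25],
# np.digitize(0.1, angle_bins) returns 3 (since 0.0 <= 0.1 < 0.15), so every raw
# observation component becomes a small integer bin index before the whole state
# is stringified into a Q-table key.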
learn_rate = 0.01
exploration_rate = 0.99
mb = 0            # episodes since the last cosine-annealing restart
T_i = 5           # restart period for the learning-rate schedule
T_e = 5           # restart period for the exploration-rate schedule
solved = 0
m_learn_rate = 0.009    # max learning rate  #max(0.001, min(1.0 - math.log10(episode/25.0), 0.9))
l_learn_rate = 0.0005   # min learning rate
""" TRAINING LOOP """
for episode in range(1, 2000):
S = str(bin_state(env.reset()))
m_learn_rate = max(0.0001, min(1.0 - math.log10(episode/5.0), 0.09))
#learn_rate = max(0.1, min(1.0 - math.log10(episode/25.0), 0.5))
mb += 1
m_exploration_rate = max(0.0001, min(1.0 - math.log10(episode/5.0), 1.00))
#if episode > 55:
# exploration_rate = 0.0001
exploration_rate = 0.001 + (0.5 * (m_exploration_rate - 0.001)) * (1 + math.cos((mb / T_e) * math.pi))
learn_rate = l_learn_rate + (0.5 * (m_learn_rate - l_learn_rate)) * (1 + math.cos((mb / T_i) * math.pi))
if mb == T_i:
mb = 1
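    # Both schedules above follow the cosine-annealing-with-restarts shape
    #   rate = rate_min + 0.5 * (rate_max - rate_min) * (1 + cos(pi * mb / T)),
    # decaying from rate_max to rate_min over T episodes before snapping back up.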
    for t in range(2000000):
        #learn_rate = 0.001 + (0.5 * (m_learn_rate - 0.001)) * (1 + math.cos((mb / T_i) * math.pi))
        frames.append(Image.fromarray(env.render(mode='rgb_array')))  # save each frame
        # Create new row for S if needed
        if S not in Q_table:
            add_state(S)
        # Select an action and execute it in the environment
        a = policy(S)
        S_, r, done, info = env.step(a)
        S_ = str(bin_state(S_))  # keep the same 4-dim binned representation as the initial state
        # Create new row for S' if needed
        if S_ not in Q_table:
            add_state(S_)
        # Update the Q table
        Q(S, a, r, S_)
        S = S_
        if done:
            # (Dropped the pole, or hit the step limit.)
            print('Ep' + str(episode) + ': ' + str(t) + ' steps. Ave Score: ' + str(ave_score))
            if len(scores) >= 100:
                ave_score -= scores.popleft() / 100
            if t > 100:
                make_gif('solving')
            if ave_score >= 195:  # CartPole-v0 counts as solved at an average of 195 over 100 episodes
                make_gif('solved')
                print('Solved!')
                exit()
            scores.append(t)
            ave_score += (t / 100)
            break
# Earlier experiments with alternative update rules, kept for reference:
#Q_table[S][a] = r + 0.99*(max(Q_table[S_].values()) - Q_table[S][a])
#Q_table[S][a] = (0.01 * Q_table[S][a]) + (0.99 * (r + 0.99 * max(Q_table[S_].values())))