valueIteration.py
from maze import *
import numpy as np
from value_plot import value_plot
numStates = 112
numActions = 4
discount = 0.9
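
# This script first estimates a tabular model of the Maze environment by
# sampling transitions, then runs value iteration with the Bellman optimality
# backup
#     V(s) <- max_a sum_{s'} P(s' | s, a) * (R(s, a) + discount * V(s'))
# and finally derives Q(s, a) from the converged values, passes them to
# value_plot, and saves them to QValues.npy.
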
def valueIteration():
    env = Maze()
    initial_state = env.reset()
    # build the transition probability matrix
    transitions = np.zeros((numStates, numActions, numStates), dtype='float')
    # build the reward matrix
    rewards = np.zeros((numStates, numActions))
    # fill the transition and reward matrices by sampling the environment
    numIters = 500
    for i in range(0, numIters):
        for s in range(0, numStates):
            for a in range(0, numActions):
                # use the step function to get the next state
                reward, next_state, done = env.step(s, a)
                # store the reward for taking action a in state s
                rewards[s, a] = reward
                # count the observed transition
                transitions[s, a, next_state] = transitions[s, a, next_state] + 1.0
    # normalize the transition counts into probabilities
    for s in range(0, numStates):
        for a in range(0, numActions):
            transitions[s, a, :] = transitions[s, a, :] / np.sum(transitions[s, a, :])
    transitions = np.round(transitions, decimals=1)
    # initialize the value function to zeros (float array so updates are not truncated)
    values = np.zeros(numStates)
    # initialize the policies randomly
    policies = np.random.choice(a=(0, 1, 2, 3), size=(numStates,))
    # do value iteration
    for i in range(0, 5000):
        values, policies = valueIter(transitions, rewards, values, policies)
    # best policy found, so now compute all the Q values
    Qvals = np.zeros((numStates, numActions))
    for s in range(0, numStates):
        for a in range(0, numActions):
            reward = rewards[s, a]
            # get all the possible next states reachable from s under action a
            possibleNewStates = transitions[s, a, :]
            # keep only the next states sPrime with nonzero probability
            indicesSPrime = np.where(possibleNewStates > 0)[0]
            probSPrime = possibleNewStates[indicesSPrime]
            # compute the Q value for (s, a) over all possible next states sPrime
            newValue = 0
            for j in range(0, len(indicesSPrime)):
                # get the state sPrime
                sPrime = indicesSPrime[j]
                # get the probability of reaching sPrime
                prob = probSPrime[j]
                # get the value of sPrime
                valueSPrime = values[sPrime]
                # accumulate the expected return
                newValue = newValue + prob * (reward + discount * valueSPrime)
            Qvals[s, a] = newValue
    print(Qvals)
    # plot and save the Q values
    value_plot(Qvals, env, True, True)
    np.save('QValues', Qvals)

def valueIter(transitions, rewards, values, policies):
    epsilon = .03
    # start delta above epsilon so the sweep runs at least once
    delta = epsilon + 1.0
    while delta > epsilon:
        delta = 0.0
        # iterate over all the states
        for s in range(0, numStates):
            # remember the current value of state s
            v = values[s]
            maxAVal = 0
            maxA = 0
            for action in range(0, numActions):
                # get all the possible next states reachable from s under this action
                possibleNewStates = transitions[s, action, :]
                # get the reward for taking this action in state s
                reward = rewards[s, action]
                # compute the expected value of this action over all possible next states sPrime
                newValue = 0
                for sPrime in range(0, len(possibleNewStates)):
                    # get the probability of reaching sPrime
                    prob = possibleNewStates[sPrime]
                    # get the value of sPrime
                    valueSPrime = values[sPrime]
                    # accumulate the expected return
                    newValue = newValue + prob * (reward + discount * valueSPrime)
                # keep the best action found so far
                if newValue > maxAVal:
                    maxAVal = newValue
                    maxA = action
            values[s] = maxAVal
            policies[s] = maxA
            # update delta with the largest change seen this sweep
            delta = max(delta, np.abs(v - values[s]))
    return values, policies

if __name__ == "__main__":
    valueIteration()