-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathQLearning.py
67 lines (56 loc) · 2.3 KB
/
QLearning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
def QLearning(NumAgents, NUM_ITERATIONS, CostFunction, noV,
              alpha=0.8, gamma=0.2, NumbConv=200):
    """Minimise ``CostFunction`` over binary vectors with a Q-learning-style heuristic.

    A population of ``NumAgents`` random binary vectors of length ``noV`` is
    evolved.  In every round each agent is scored with ``1 / CostFunction(x)``,
    rewarded according to its rank (2 for the fittest, 1 for the next two,
    0 otherwise) and assigned a Q-value for that round.  The agent with the
    highest Q-value is kept unchanged, the next two flip one random bit of
    their own vector, and every other agent becomes a copy of the top agent
    with one random bit flipped.

    Args:
        NumAgents: Number of candidate vectors in the population (>= 1).
        NUM_ITERATIONS: Length of the history arrays.  The loop counter is
            incremented before it is used, so at most ``NUM_ITERATIONS - 1``
            rounds are executed and index 0 of the history is never written.
        CostFunction: Callable taking a 1-D array of ``noV`` zeros/ones and
            returning a cost; its reciprocal is used as the fitness.
        noV: Number of binary decision variables per agent (>= 1).
        alpha: Learning rate of the Q-value update.
        gamma: Discount factor applied to the current maximum Q-value.
        NumbConv: Stop early once the best fitness has been identical for
            this many consecutive rounds.

    Returns:
        A tuple ``(BestxVar, BestCost, history, IterNumb)`` where ``BestxVar``
        is the vector of the top-ranked agent of the last executed round,
        ``BestCost`` is ``CostFunction(BestxVar)``, ``history`` is ``1 / Best``
        (reciprocal of the best fitness stored per round; NaN at index 0 and
        at indices of rounds that were never executed) and ``IterNumb`` is
        the index of the last executed round.  If no round is executed, the
        fittest member of the initial population is returned.

    Raises:
        ValueError: If ``NumAgents`` or ``noV`` is smaller than 1.
    """
    import numpy as np

    if NumAgents < 1:
        raise ValueError("NumAgents must be at least 1")
    if noV < 1:
        raise ValueError("noV must be at least 1")

    # Random binary start population, one column per agent.
    xVar = np.random.randint(2, size=[noV, NumAgents])

    # One Q column and one best-fitness slot per iteration index.
    Q = np.zeros([NumAgents, NUM_ITERATIONS])
    Best = np.full(NUM_ITERATIONS, np.nan)

    def population_fitness():
        # Fitness is the reciprocal of the cost, so lower cost ranks higher.
        scores = np.full(NumAgents, np.nan)
        for agent in range(NumAgents):
            scores[agent] = 1 / CostFunction(xVar[:, agent])
        return scores

    IterNumb = 0
    stepEqual = 0
    best_agent = None  # stays None when the loop body never runs
    while IterNumb < (NUM_ITERATIONS - 1):
        IterNumb += 1

        fitness = population_fitness()
        Best[IterNumb] = fitness.max()
        indexB = (-fitness).argsort()  # agents ordered by decreasing fitness

        # Slices instead of fixed indices so populations with fewer than
        # three agents do not raise IndexError.
        fittest = indexB[0]
        runners_up = set(indexB[1:3].tolist())

        # Rank-based reward followed by the Q-value update for this round.
        for i in range(NumAgents):
            if i == fittest:
                rewardVal = 2
            elif i in runners_up:
                rewardVal = 1
            else:
                rewardVal = 0
            current = Q[i, IterNumb]
            Q[i, IterNumb] = current + alpha * (
                rewardVal + gamma * Q[:, IterNumb].max() - current
            )

        indexQ = (-Q[:, IterNumb]).argsort()  # agents ordered by decreasing Q
        best_agent = indexQ[0]
        explorers = set(indexQ[1:3].tolist())

        # Move the population: the top agent stays, the next two mutate their
        # own vector, everyone else restarts from a mutated copy of the top.
        for i in range(NumAgents):
            if i == best_agent:
                continue
            indFlip = np.random.randint(noV)
            if i not in explorers:
                xVar[:, i] = xVar[:, best_agent]
            xVar[indFlip, i] = 1 - xVar[indFlip, i]

        # Convergence check; in round 1 the previous slot is NaN and the
        # comparison is therefore always False.
        if Best[IterNumb - 1] == Best[IterNumb]:
            stepEqual += 1
        else:
            stepEqual = 0
        if stepEqual == NumbConv:
            break

    if best_agent is None:
        # No round was executed: fall back to the fittest initial agent
        # instead of failing on an undefined ranking.
        best_agent = (-population_fitness()).argsort()[0]

    BestxVar = xVar[:, best_agent]
    BestCost = CostFunction(BestxVar)
    return (BestxVar, BestCost, 1 / Best, IterNumb)