forked from yrlu/irl-imitation
-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathlp_irl.py
70 lines (59 loc) · 2.42 KB
/
lp_irl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
'''
Implementation of linear programming inverse reinforcement learning in
Ng & Russell 2000 paper: Algorithms for Inverse Reinforcement Learning
http://ai.stanford.edu/~ang/papers/icml00-irl.pdf
By Yiren Lu ([email protected]), May 2017
'''
import numpy as np
from cvxopt import matrix, solvers
from utils import *
def lp_irl(trans_probs, policy, gamma=0.5, l1=10, R_max=10):
"""
inputs:
trans_probs NxNxN_ACTIONS transition matrix
policy policy vector / map
R_max maximum possible value of recoverred rewards
gamma RL discount factor
l1 l1 regularization lambda
returns:
rewards Nx1 reward vector
"""
print np.shape(trans_probs)
N_STATES, _, N_ACTIONS = np.shape(trans_probs)
N_STATES = int(N_STATES)
N_ACTIONS = int(N_ACTIONS)
# Formulate a linear IRL problem
A = np.zeros([2 * N_STATES * (N_ACTIONS + 1), 3 * N_STATES])
b = np.zeros([2 * N_STATES * (N_ACTIONS + 1)])
c = np.zeros([3 * N_STATES])
for i in range(N_STATES):
a_opt = int(policy[i])
tmp_inv = np.linalg.inv(np.identity(N_STATES) - gamma * trans_probs[:, :, a_opt])
cnt = 0
for a in range(N_ACTIONS):
if a != a_opt:
A[i * (N_ACTIONS - 1) + cnt, :N_STATES] = - \
np.dot(trans_probs[i, :, a_opt] - trans_probs[i, :, a], tmp_inv)
A[N_STATES * (N_ACTIONS - 1) + i * (N_ACTIONS - 1) + cnt, :N_STATES] = - \
np.dot(trans_probs[i, :, a_opt] - trans_probs[i, :, a], tmp_inv)
A[N_STATES * (N_ACTIONS - 1) + i * (N_ACTIONS - 1) + cnt, N_STATES + i] = 1
cnt += 1
for i in range(N_STATES):
A[2 * N_STATES * (N_ACTIONS - 1) + i, i] = 1
b[2 * N_STATES * (N_ACTIONS - 1) + i] = R_max
for i in range(N_STATES):
A[2 * N_STATES * (N_ACTIONS - 1) + N_STATES + i, i] = -1
b[2 * N_STATES * (N_ACTIONS - 1) + N_STATES + i] = 0
for i in range(N_STATES):
A[2 * N_STATES * (N_ACTIONS - 1) + 2 * N_STATES + i, i] = 1
A[2 * N_STATES * (N_ACTIONS - 1) + 2 * N_STATES + i, 2 * N_STATES + i] = -1
for i in range(N_STATES):
A[2 * N_STATES * (N_ACTIONS - 1) + 3 * N_STATES + i, i] = 1
A[2 * N_STATES * (N_ACTIONS - 1) + 3 * N_STATES + i, 2 * N_STATES + i] = -1
for i in range(N_STATES):
c[N_STATES:2 * N_STATES] = -1
c[2 * N_STATES:] = l1
sol = solvers.lp(matrix(c), matrix(A), matrix(b))
rewards = sol['x'][:N_STATES]
rewards = normalize(rewards) * R_max
return rewards