forked from yrlu/irl-imitation
-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathmaxent_irl.py
106 lines (81 loc) · 3.26 KB
/
maxent_irl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
'''
Implementation of maximum entropy inverse reinforcement learning in
Ziebart et al. 2008 paper: Maximum Entropy Inverse Reinforcement Learning
https://www.aaai.org/Papers/AAAI/2008/AAAI08-227.pdf
Acknowledgement:
This implementation is largely influenced by Matthew Alger's maxent implementation here:
https://github.com/MatthewJA/Inverse-Reinforcement-Learning/blob/master/irl/maxent.py
By Yiren Lu ([email protected]), May 2017
'''
import numpy as np
import mdp.gridworld as gridworld
import mdp.value_iteration as value_iteration
import img_utils
from utils import *
def compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True):
"""compute the expected states visition frequency p(s| theta, T)
using dynamic programming
inputs:
P_a NxNxN_ACTIONS matrix - transition dynamics
gamma float - discount factor
trajs list of list of Steps - collected from expert
policy Nx1 vector (or NxN_ACTIONS if deterministic=False) - policy
returns:
p Nx1 vector - state visitation frequencies
"""
N_STATES, _, N_ACTIONS = np.shape(P_a)
T = len(trajs[0])
# mu[s, t] is the prob of visiting state s at time t
mu = np.zeros([N_STATES, T])
for traj in trajs:
mu[traj[0].cur_state, 0] += 1
mu[:,0] = mu[:,0]/len(trajs)
for s in range(N_STATES):
for t in range(T-1):
if deterministic:
mu[s, t+1] = sum([mu[pre_s, t]*P_a[pre_s, s, int(policy[pre_s])] for pre_s in range(N_STATES)])
else:
mu[s, t+1] = sum([sum([mu[pre_s, t]*P_a[pre_s, s, a1]*policy[pre_s, a1] for a1 in range(N_ACTIONS)]) for pre_s in range(N_STATES)])
p = np.sum(mu, 1)
return p
def maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters):
"""
Maximum Entropy Inverse Reinforcement Learning (Maxent IRL)
inputs:
feat_map NxD matrix - the features for each state
P_a NxNxN_ACTIONS matrix - P_a[s0, s1, a] is the transition prob of
landing at state s1 when taking action
a at state s0
gamma float - RL discount factor
trajs a list of demonstrations
lr float - learning rate
n_iters int - number of optimization steps
returns
rewards Nx1 vector - recoverred state rewards
"""
N_STATES, _, N_ACTIONS = np.shape(P_a)
# init parameters
theta = np.random.uniform(size=(feat_map.shape[1],))
# calc feature expectations
feat_exp = np.zeros([feat_map.shape[1]])
for episode in trajs:
for step in episode:
feat_exp += feat_map[step.cur_state,:]
feat_exp = feat_exp/len(trajs)
# training
for iteration in range(n_iters):
if iteration % (n_iters/20) == 0:
print 'iteration: {}/{}'.format(iteration, n_iters)
# compute reward function
rewards = np.dot(feat_map, theta)
# compute policy
_, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=False)
# compute state visition frequences
svf = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=False)
# compute gradients
grad = feat_exp - feat_map.T.dot(svf)
# update params
theta += lr * grad
rewards = np.dot(feat_map, theta)
# return sigmoid(normalize(rewards))
return normalize(rewards)