diff --git a/src/markov/ddp.jl b/src/markov/ddp.jl
index f333fabd..420c39a6 100644
--- a/src/markov/ddp.jl
+++ b/src/markov/ddp.jl
@@ -22,7 +22,7 @@ Notes
 =#
 
-import Base.*
+import Base: *, ctranspose
 
 #------------------------#
 #-Types and Constructors-#
@@ -66,27 +66,34 @@ type DiscreteDP{T<:Real,NQ,NR,Tbeta<:Real,Tind}
             msg = "R must be 2-dimensional without s-a formulation"
             throw(ArgumentError(msg))
         end
-        (beta < 0 || beta >= 1) && throw(ArgumentError("beta must be [0, 1)"))
 
-        # verify input integrity 2
-        num_states, num_actions = size(R)
-        if size(Q) != (num_states, num_actions, num_states)
-            throw(ArgumentError("shapes of R and Q must be (n,m) and (n,m,n)"))
-        end
+        a, s = size(R)
 
-        # check feasibility
+        # check feasibility of beta
+        (0 <= beta < 1) || throw(ArgumentError("beta must be in [0, 1)"))
+
+        # verify dimensions of Q
+        size(Q) == (s, s, a) || throw(ArgumentError("shapes of R and Q must be (a, s) and (s, s, a)"))
+
+        # check feasibility of R
         R_max = s_wise_max(R)
-        if any(R_max .== -Inf)
+        if any(isinf(R_max))
             # First state index such that all actions yield -Inf
-            s = find(R_max .== -Inf) #-Only Gives True
+            s_violated = find(isinf(R_max))
             throw(ArgumentError("for every state the reward must be finite for
-                some action: violated for state $s"))
+                some action: violated for state(s) $(s_violated)"))
         end
 
+        all(Q .>= 0) || throw(ArgumentError("Q must have nonnegative elements"))
+
+        # check columns of Q sum to 1
+        cols = sum(Q, 1)
+        isapprox(cols, ones(cols)) || throw(ArgumentError("columns of Q must sum to 1"))
+
         # here the indices and indptr are null.
         s_indices = Nullable{Vector{Int}}()
         a_indices = Nullable{Vector{Int}}()
-        a_indptr = Nullable{Vector{Int}}()
+        a_indptr  = Nullable{Vector{Int}}()
 
         new(R, Q, beta, s_indices, a_indices, a_indptr)
     end
@@ -376,8 +383,10 @@ for a given value function v.
 
 - `Tv::Vector` : Updated value function vector
 
 """
-bellman_operator(ddp::DiscreteDP, v::Vector) =
-    s_wise_max(ddp, ddp.R + ddp.beta * ddp.Q * v)
+function bellman_operator(ddp::DiscreteDP, v::Vector)
+    vals = ddp.R + ddp.beta * ddp.Q * v
+    return s_wise_max(ddp, vals)
+end
 
 # ---------------------- #
 # Compute greedy methods #
@@ -514,8 +523,7 @@ Returns the controlled Markov chain for a given policy `sigma`.
 
 mc : MarkovChain
     Controlled Markov chain.
 """
-QuantEcon.MarkovChain(ddp::DiscreteDP, ddpr::DPSolveResult) =
-    MarkovChain(RQ_sigma(ddp, ddpr)[2])
+QuantEcon.MarkovChain(ddp::DiscreteDP, ddpr::DPSolveResult) = MarkovChain(RQ_sigma(ddp, ddpr)[2])
 
 """
 Method of `RQ_sigma` that extracts sigma from a `DPSolveResult`
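Aside (not part of the patch): a minimal sketch of the array layout the reworked constructor appears to assume, using the two-state, two-action example from the test file below. Under that reading, R is (num_actions, num_states), Q is (num_states, num_states, num_actions), and each column of Q is a distribution over next states; the `cols` check at the end mirrors the validation added above.

using QuantEcon

# rewards R[a, s]: 2 actions (rows) x 2 states (columns); action 2 is not
# available in state 2, which is encoded as -Inf
R = [5.0  -1.0;
     10.0 -Inf]

# transitions Q[s', s, a]: every column is a distribution over next states
Q = Array{Float64}(2, 2, 2)
Q[:, :, 1] = [0.5 0.0;
              0.5 1.0]    # action 1
Q[:, :, 2] = [0.0 0.0;
              1.0 1.0]    # action 2

cols = sum(Q, 1)
@assert isapprox(cols, ones(cols))   # the "columns of Q sum to 1" check

ddp = DiscreteDP(R, Q, 0.95)         # passes the constructor checks above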
@@ -541,15 +549,28 @@ the transition probability matrix `Q_sigma`.
 
     of shape (n, n).
 
 """
-function RQ_sigma{T<:Integer}(ddp::DDP, sigma::Array{T})
-    R_sigma = [ddp.R[i, sigma[i]] for i in 1:length(sigma)]
-    Q_sigma = hcat([getindex(ddp.Q, i, sigma[i], Colon())[:] for i=1:num_states(ddp)]...)
-    return R_sigma, Q_sigma'
+function RQ_sigma{T,Tbeta,Tind,U<:Integer}(ddp::DDP{T,Tbeta,Tind}, sigma::Vector{U})
+    s, s_, a = size(ddp.Q)
+    msg = "length of policy vector ($(length(sigma))) does not match number of states in model ($s)"
+    length(sigma) == s || throw(ArgumentError(msg))
+
+    R_sigma = Vector{T}(s)
+    Q_sigma = Matrix{T}(s, s)
+
+    for (i, a_i) in enumerate(sigma)
+        R_sigma[i] = ddp.R[a_i, i]
+        Q_sigma[i, :] = ddp.Q[:, i, a_i]
+    end
+
+    return R_sigma, Q_sigma
 end
 
 # TODO: express it in a similar way as above to exploit Julia's column major order
-function RQ_sigma{T<:Integer}(ddp::DDPsa, sigma::Array{T})
-    sigma_indices = Array(T, num_states(ddp))
+function RQ_sigma{U<:Integer}(ddp::DDPsa, sigma::Vector{U})
+    msg = "length of policy vector ($(length(sigma))) does not match number of states in model ($(num_states(ddp)))"
+    length(sigma) == num_states(ddp) || throw(ArgumentError(msg))
+
+    sigma_indices = Vector{U}(num_states(ddp))
     _find_indices!(get(ddp.a_indices), get(ddp.a_indptr), sigma, sigma_indices)
     R_sigma = ddp.R[sigma_indices]
     Q_sigma = ddp.Q[sigma_indices, :]
@@ -581,25 +602,26 @@ s_wise_max!(vals::AbstractMatrix, out::Vector) = (println("calling this one! ");
 
 """
 Populate `out` with  `max_a vals(s, a)`,  where `vals` is represented as a
-`AbstractMatrix` of size `(num_states, num_actions)`.
+`AbstractMatrix` of size `(num_actions, num_states)`.
 
-Also fills `out_argmax` with the column number associated with the indmax in each row
+Also fills `out_argmax` with the row index of the maximizing action for each
+column (state)
 """
 function s_wise_max!(vals::AbstractMatrix, out::Vector, out_argmax::Vector)
-    # naive implementation where I just iterate over the rows
+    length(out) == length(out_argmax) == size(vals, 2) || throw(DimensionMismatch())
+
     nr, nc = size(vals)
-    for i_r in 1:nr
+    for i_c in 1:nc  # index over columns (states)
         # reset temporaries
         cur_max = -Inf
-        out_argmax[i_r] = 1
+        out_argmax[i_c] = 1
 
-        for i_c in 1:nc
+        for i_r in 1:nr  # index over rows (actions)
             @inbounds v_rc = vals[i_r, i_c]
             if v_rc > cur_max
-                out[i_r] = v_rc
-                out_argmax[i_r] = i_c
-                cur_max = v_rc
+                out[i_c] = v_rc
+                out_argmax[i_c] = i_r
+                cur_max = v_rc
             end
         end
 
@@ -625,7 +647,7 @@ Populate `out` with  `max_a vals(s, a)`,  where `vals` is represented as a
 function s_wise_max!(a_indices::Vector, a_indptr::Vector, vals::Vector, out::Vector)
     n = length(out)
-    for i in 1:n
+    @inbounds for i in 1:n
         if a_indptr[i] != a_indptr[i+1]
             m = a_indptr[i]
             for j in a_indptr[i]+1:(a_indptr[i+1]-1)
@@ -734,14 +756,29 @@ Define Matrix Multiplication between 3-dimensional matrix and a vector
 
-Matrix multiplication over the last dimension of A
+Matrix multiplication over the first (next-state) dimension of `Q`
 
 """
-function *{T}(A::Array{T,3}, v::Vector)
-    shape = size(A)
-    size(v, 1) == shape[end] || error("wrong dimensions")
+# 3-d "transpose": permutes an (s, a, s') array into the (s', s, a) layout
+function ctranspose{T}(A::Array{T,3})
+    m, n, o = size(A)
+    B = Array{T}(o, m, n)
+    @inbounds for i in 1:m, j in 1:n, k in 1:o
+        # B's first dimension indexes what was A's last dimension
+        B[k, i, j] = A[i, j, k]
+    end
+    return B
+end
+
+function *{T,U}(Q::Array{T,3}, v::Vector{U})
+    s, s_, a = size(Q)
+    msg = "the first two dimensions of Q must have equal length"
+    s == s_ || throw(ArgumentError(msg))
+    s == length(v) || throw(DimensionMismatch())
 
-    B = reshape(A, (prod(shape[1:end-1]), shape[end]))
-    out = B * v
+    R = Array{promote_type(T, U)}(a, s)
+
+    @inbounds for i in 1:s, j in 1:a
+        # R[a, s] = sum over s' of v[s'] * Q[s', s, a]
+        R[j, i] = dot(v, Q[:, i, j])
+    end
 
-    return reshape(out, shape[1:end-1])
+    return R
 end
 
 """
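Aside (not part of the patch): a short sketch of what the two new methods compute, under my reading of the loops above. The 3-d `ctranspose` appears to coincide with `permutedims(A, (3, 1, 2))`, turning an (s, a, s') array into the (s', s, a) layout, and `Q * v` stacks the dot products `dot(v, Q[:, s, a])` into an (actions x states) matrix; the variable names below are illustrative.

# assuming the 3-d ctranspose and * methods from the patch above are loaded
ns, na = 3, 2
Q_sas = rand(ns, na, ns)                  # old layout: Q[s, a, s']

Q_new = Q_sas'                            # new layout: Q[s', s, a]
@assert Q_new == permutedims(Q_sas, (3, 1, 2))

v = rand(ns)
EV = Q_new * v                            # (num_actions, num_states) matrix
for s in 1:ns, a in 1:na
    # EV[a, s] is the expected value of v at the next state, given (s, a)
    @assert isapprox(EV[a, s], dot(v, Q_new[:, s, a]))
end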
diff --git a/test/test_ddp.jl b/test/test_ddp.jl
index f5521513..f949aa06 100644
--- a/test/test_ddp.jl
+++ b/test/test_ddp.jl
@@ -10,7 +10,6 @@ Tests for markov/ddp.jl
 =#
 
 @testset "Testing markov/dpp.jl" begin
-
     #-Setup-#
 
     # Example from Puterman 2005, Section 3.1
@@ -18,17 +17,20 @@ Tests for markov/ddp.jl
 
     # Formulation with Dense Matrices R: n x m, Q: n x m x n
     n, m = 2, 2  # number of states, number of actions
+
     R = [5.0 10.0; -1.0 -Inf]
-    Q = Array(Float64, n, m, n)
+
+    Q = Array{Float64}(n, m, n)
     Q[:, :, 1] = [0.5 0.0; 0.0 0.0]
     Q[:, :, 2] = [0.5 1.0; 1.0 1.0]
 
-    ddp0 = DiscreteDP(R, Q, beta)
+    ddp0 = DiscreteDP(R', Q', beta)
 
     # Formulation with state-action pairs
     L = 3  # Number of state-action pairs
     s_indices = [1, 1, 2]
     a_indices = [1, 2, 1]
+
     R_sa = [R[1, 1], R[1, 2], R[2, 1]]
     Q_sa = spzeros(L, n)
     Q_sa[1, :] = Q[1, 1, :]
@@ -47,78 +49,66 @@ Tests for markov/ddp.jl
     v_star = [(5-5.5*beta)/((1-0.5*beta)*(1-beta)), -1/(1-beta)]
     sigma_star = [1, 1]
 
-    @testset "bellman_operator methods" begin
+    ddp_rational = DiscreteDP(convert(Array{Rational{BigInt}}, R'),
+                              convert(Array{Rational{BigInt}}, Q'),
+                              convert(Rational{BigInt}, beta))
+
+    @testset "bellman_operator methods" for ddp in ddp0_collection
         # Check both Dense and State-Action Pair Formulation
-        for ddp in ddp0_collection
-            @test isapprox(bellman_operator(ddp, v_star), v_star)
-        end
+        @test isapprox(bellman_operator(ddp, v_star), v_star)
     end
 
-    @testset "RQ_sigma" begin
-        nr, nc = size(R)
-        # test for DDP
-        sigmas = ([1, 1], [1, 2], [2, 1], [2, 2])
-        for sig in sigmas
-            r, q = RQ_sigma(ddp0, sig)
-
-            for i_r in 1:nr
-                @test r[i_r] == ddp0.R[i_r, sig[i_r]]
-                for i_c in 1:length(sig)
-                    @test vec(q[i_c, :]) == vec(ddp0.Q[i_c, sig[i_c], :])
-                end
-            end
-        end
+    @testset "RQ_sigma" for sig in ([1, 1], [1, 2], [2, 1], [2, 2])
+        r, q = RQ_sigma(ddp0, sig)
+        for (i, s) in enumerate(sig)
+            @test r[i] == ddp0.R[s, i]
+            @test vec(q[i, :]) == vec(ddp0.Q[:, i, s])
+        end
 
         # TODO: add test for DDPsa
     end
 
-    @testset "compute_greedy methods" begin
+    @testset "compute_greedy methods" for ddp in ddp0_collection
        # Check both Dense and State-Action Pair Formulation
-        for ddp in ddp0_collection
-            @test compute_greedy(ddp, v_star) == sigma_star
-        end
+        @test compute_greedy(ddp, v_star) == sigma_star
    end
 
-    @testset "evaluate_policy methods" begin
+    @testset "evaluate_policy methods" for ddp in ddp0_collection
        # Check both Dense and State-Action Pair Formulation
-        for ddp in ddp0_collection
-            @test isapprox(evaluate_policy(ddp, sigma_star), v_star)
-        end
+        @test isapprox(evaluate_policy(ddp, sigma_star), v_star)
    end
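Aside (not part of the patch): for concreteness, the values the reworked `RQ_sigma` should return for the Puterman example above under the policy that always picks action 1, assuming `ddp0` is in scope; the numbers follow directly from the `R` and `Q` arrays defined in this test.

sigma = [1, 1]                      # take action 1 in both states
R_sigma, Q_sigma = RQ_sigma(ddp0, sigma)

R_sigma   # [5.0, -1.0]: reward of the chosen action in each state
Q_sigma   # [0.5 0.5; 0.0 1.0]: row s is the next-state distribution under sigma[s]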
 
-    @testset "methods for subtypes != (Float64, Int)" begin
-        float_types = [Float16, Float32, Float64, BigFloat]
-        int_types = [Int8, Int16, Int32, Int64, Int128,
-                     UInt8, UInt16, UInt32, UInt64, UInt128]
-
-        for ddp in ddp0_collection
-            for f in (bellman_operator, compute_greedy)
-                for T in float_types
-                    f_f64 = f(ddp, [1.0, 1.0])
-                    f_T = f(ddp, ones(T, 2))
-                    @test isapprox(f_f64, convert(Vector{eltype(f_f64)}, f_T))
-                end
-
-                # only Integer subtypes can be Rational type params
-                # NOTE: Only the integer types below don't overflow for this example
-                for T in [Int64, Int128]
-                    @test f(ddp, [1//1, 1//1]) == f(ddp, ones(Rational{T}, 2))
-                end
-            end
-
-            for T in float_types, S in int_types
-                v = ones(T, 2)
-                s = ones(S, 2)
-                # just test that we can call the method and the result is
-                # deterministic
-                @test bellman_operator!(ddp, v, s) == bellman_operator!(ddp, v, s)
-            end
-
-            for T in int_types
-                s = T[1, 1]
-                @test isapprox(evaluate_policy(ddp, s), v_star)
-            end
-        end
-    end
+    float_types = (Float16, Float32, Float64, BigFloat)
+    int_types = (Int8, Int16, Int32, Int64, Int128,
+                 UInt8, UInt16, UInt32, UInt64, UInt128)
+
+    @testset "methods for subtypes != (Float64, Int)" for ddp in ddp0_collection
+        for f in (bellman_operator, compute_greedy)
+            for T in float_types
+                f_f64 = f(ddp, [1.0, 1.0])
+                f_T = f(ddp, ones(T, 2))
+                @test isapprox(f_f64, convert(Vector{eltype(f_f64)}, f_T))
+            end
+
+            # only Integer subtypes can be Rational type params
+            # NOTE: Only the integer types below don't overflow for this example
+            for T in (Int64, Int128)
+                @test f(ddp, [1//1, 1//1]) == f(ddp, ones(Rational{T}, 2))
+            end
+        end
+
+        for T in float_types, S in int_types
+            v = ones(T, 2)
+            s = ones(S, 2)
+            # just test that we can call the method and the result is
+            # deterministic
+            @test bellman_operator!(ddp, v, s) == bellman_operator!(ddp, v, s)
+        end
+
+        for T in int_types
+            s = T[1, 1]
+            @test isapprox(evaluate_policy(ddp, s), v_star)
+        end
+    end
 
     @testset "compute_greedy! changes ddpr.v" begin
@@ -128,46 +118,39 @@ Tests for markov/ddp.jl
         @test maxabs(res.Tv - 500.0) > 0
     end
 
-    @testset "value_iteration" begin
+    @testset "value_iteration" for ddp_item in ddp0_collection
         # Check both Dense and State-Action Pair Formulation
-        for ddp_item in ddp0_collection
-            # Compute Result
-            res = solve(ddp_item, VFI)
-            v_init = [0.0, 0.0]
-            res_init = solve(ddp_item, v_init, VFI; epsilon=epsilon)
-
-            # Check v is an epsilon/2-approxmation of v_star
-            @test maxabs(res.v - v_star) < epsilon/2
-            @test maxabs(res_init.v - v_star) < epsilon/2
-
-            # Check sigma == sigma_star.
-            # NOTE we need to convert from linear to row-by-row index
-            @test res.sigma == sigma_star
-            @test res_init.sigma == sigma_star
-        end
+        # Compute Result
+        res = solve(ddp_item, VFI)
+        v_init = [0.0, 0.0]
+        res_init = solve(ddp_item, v_init, VFI; epsilon=epsilon)
+
+        # Check v is an epsilon/2-approximation of v_star
+        @test maxabs(res.v - v_star) < epsilon/2
+        @test maxabs(res_init.v - v_star) < epsilon/2
+
+        # Check sigma == sigma_star.
+        # NOTE we need to convert from linear to row-by-row index
+        @test res.sigma == sigma_star
+        @test res_init.sigma == sigma_star
    end
 
-    @testset "policy_iteration" begin
+    @testset "policy_iteration" for ddp in ddp0_collection
        # Check both Dense and State-Action Pair Formulation
-        for ddp_item in ddp0_collection
-            res = solve(ddp_item, PFI)
-            v_init = [0.0, 1.0]
-            res_init = solve(ddp_item, v_init, PFI)
-
-            # Check v == v_star
-            @test isapprox(res.v, v_star)
-            @test isapprox(res_init.v, v_star)
-
-            # Check sigma == sigma_star
-            @test res.sigma == sigma_star
-            @test res_init.sigma == sigma_star
-        end
+        res = solve(ddp, PFI)
+        v_init = [0.0, 1.0]
+        res_init = solve(ddp, v_init, PFI)
+
+        # Check v == v_star
+        @test isapprox(res.v, v_star)
+        @test isapprox(res_init.v, v_star)
+
+        # Check sigma == sigma_star
+        @test res.sigma == sigma_star
+        @test res_init.sigma == sigma_star
    end
 
     @testset "DiscreteDP{Rational,_,_,Rational} maintains Rational" begin
-        ddp_rational = DiscreteDP(map(Rational{BigInt}, R),
-                                  map(Rational{BigInt}, Q),
-                                  map(Rational{BigInt}, beta))
         # do minimal number of iterations to avoid overflow
         vi = Rational{BigInt}[1//2, 1//2]
         @test eltype(solve(ddp_rational, VFI; max_iter=1, epsilon=Inf).v) == Rational{BigInt}
@@ -175,10 +158,8 @@ Tests for markov/ddp.jl
         @test eltype(solve(ddp_rational, vi, MPFI; max_iter=1, k=1, epsilon=Inf).v) == Rational{BigInt}
     end
 
     @testset "DiscreteDP{Rational{BigInt},_,_,Rational{BigInt}} works" begin
-        ddp_rational = DiscreteDP(map(Rational{BigInt}, R),
-                                  map(Rational{BigInt}, Q),
-                                  map(Rational{BigInt}, beta))
+
         # do minimal number of iterations to avoid overflow
         r1 = solve(ddp_rational, PFI)
         r2 = solve(ddp_rational, MPFI)
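Aside (not part of the patch): a hedged sketch of the solver interface these tests exercise, assuming `ddp` is the example model built earlier; `solve`, the algorithm tags `VFI`/`PFI`/`MPFI`, the keyword arguments, and the result fields are the ones used throughout this test file.

res_vfi  = solve(ddp, VFI; epsilon=1e-4)   # value function iteration
res_pfi  = solve(ddp, PFI)                 # policy function iteration
res_mpfi = solve(ddp, MPFI; k=20)          # modified policy iteration

# a DPSolveResult carries (at least) the value function, the greedy policy
# and the controlled Markov chain
res_pfi.v      # approximately v_star
res_pfi.sigma  # == sigma_star
res_pfi.mc     # MarkovChain under the computed policy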
@@ -190,30 +171,28 @@ Tests for markov/ddp.jl
         @test r1.mc.p == r3.mc.p
     end
 
-    @testset "modified_policy_iteration" begin
-        for ddp_item in ddp0_collection
-            res = solve(ddp_item, MPFI)
-            v_init = [0.0, 1.0]
-            res_init = solve(ddp_item, v_init, MPFI)
+    @testset "modified_policy_iteration" for ddp_item in ddp0_collection
+        res = solve(ddp_item, MPFI)
+        v_init = [0.0, 1.0]
+        res_init = solve(ddp_item, v_init, MPFI)
 
-            # Check v is an epsilon/2-approxmation of v_star
-            @test maxabs(res.v - v_star) < epsilon/2
-            @test maxabs(res_init.v - v_star) < epsilon/2
+        # Check v is an epsilon/2-approximation of v_star
+        @test maxabs(res.v - v_star) < epsilon/2
+        @test maxabs(res_init.v - v_star) < epsilon/2
 
-            # Check sigma == sigma_star
-            @test res.sigma == sigma_star
-            @test res_init.sigma == sigma_star
+        # Check sigma == sigma_star
+        @test res.sigma == sigma_star
+        @test res_init.sigma == sigma_star
 
-            #Test Modified Policy Iteration k0
-            k = 0
-            res = solve(ddp_item, MPFI; max_iter=max_iter, epsilon=epsilon, k=k)
+        # Test modified policy iteration with k = 0
+        k = 0
+        res = solve(ddp_item, MPFI; max_iter=max_iter, epsilon=epsilon, k=k)
 
-            # Check v is an epsilon/2-approxmation of v_star
-            @test maxabs(res.v - v_star) < epsilon/2
+        # Check v is an epsilon/2-approximation of v_star
+        @test maxabs(res.v - v_star) < epsilon/2
 
-            # Check sigma == sigma_star
-            @test res.sigma == sigma_star
-        end
+        # Check sigma == sigma_star
+        @test res.sigma == sigma_star
     end
 
     @testset "DDPsa constructor" begin
@@ -264,9 +243,9 @@ Tests for markov/ddp.jl
     @testset "feasbile action pair" begin
         #Dense Matrix
         n, m = 2, 2
-        _R = [-Inf -Inf; 1.0 2.0]
+        _R = [-Inf -Inf; 1.0 2.0]'
 
-        _Q = Array(Float64, n, m, n)
+        _Q = Array{Float64}(n, m, n)
         _Q[:, :, 1] = [0.5 0.0; 0.0 0.0]
         _Q[:, :, 2] = [0.5 1.0; 1.0 1.0]
         _beta = 0.95
@@ -290,9 +269,7 @@ Tests for markov/ddp.jl
     @testset "ddp_negative_inf_error()" begin
         # Dense Matrix
         n, m = 3, 2
-        R = [0 1;
-             0 -Inf;
-             -Inf -Inf]
+        R = [0 1; 0 -Inf; -Inf -Inf]
         Q = fill(1.0/n, n, m, n)
         beta = 0.95
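Aside (not part of the patch): a small sketch of the feasibility rule the `ddp_negative_inf_error()` test targets, assuming the reoriented inputs (R transposed to actions x states, Q to s' x s x a) and `Base.Test` in scope. A state whose reward is -Inf for every action should make the patched constructor throw an `ArgumentError`.

using Base.Test

n, m = 3, 2
R = [0.0 1.0; 0.0 -Inf; -Inf -Inf]   # state 3 has no feasible action
Q = fill(1.0/n, n, m, n)             # uniform transitions in the old (s, a, s') layout

# the reworked constructor should reject this model on the feasibility check
@test_throws ArgumentError DiscreteDP(R', Q', 0.95)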