-
Notifications
You must be signed in to change notification settings - Fork 44
/
Copy pathICI3D_Lab7_MCMC-Binomial.R
143 lines (120 loc) · 7.07 KB
/
ICI3D_Lab7_MCMC-Binomial.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
## Introduction to MCMC 1: Estimating a posterior binomial probability
## Clinic on the Meaningful Modeling of Epidemiological Data
## International Clinics on Infectious Disease Dynamics and Data (ICI3D) Program
## African Institute for Mathematical Sciences, Muizenberg, RSA
## Steve Bellan 2012, 2015
##
######################################################################
## By the end of this tutorial you should…
## * Understand how to write a flexible prior function for the binomial
## * Understand the Metropolis-Hasting algorithm and explain it step by step.
## * Know how the parameter proposal distribution affects MCMC convergence
## * Know how to assess MCMC convergence with the Gelman-Rubin diagnostic and with trace plots
library(boot); library(coda)
defaultPar <- par() ## Save the default graphic parameter settings for use later.
## First, let's pretend that we are sampling a population of 100 individuals with a 30% prevalence
## for some disease and testing each of them. From this, we have a sample prevalence, which is our
## estimate of the true prevalence. The problem considered in the simulation is to estimate the
## posterior probability distribution of the prevalence from this sample and from a specified
## (informative or uninformative) prior probability distribution for the prevalence.
size <- 100
truePrev <- .3
sampPos <- rbinom(1, size, truePrev) ## Sample from this distribution once.
sampPrev <- sampPos/size
## Now we need to specify our prior probability distribution. This designates our prior (before we
## conducted the study) beliefs of what we think the prevalence of this population is. Frequently,
## we have insufficient prior information to specify an informative prior. In these cases, we will
## use an uninformative prior, meaning we that we assume a wide range of values is plausible.
## Our parameter of interest is prevalence, which is bounded between 0 and 1. The beta probability
## distribution is particularly suited for such parameters and is, in fact, the most commonly used
## probability distribution for parameters that are, themselves, probabilities. It is uniform (constand density across probabilities)
## with parameters shape1 = 1, shape2 = 1, and can otherwise have many shapes.
## We calculate both the prior and the likelihood on a log scale to avoid numerical problems
logBetaPrior <- function(prevalence
, shape1 = 1 ## Change to make informative (try 8)
, shape2 = 1) ## Change to make informative (try 40)
dbeta(prevalence, shape1 = shape1, shape2 = shape2, log = T)
## The likelihood is simply the probability of observing that many individuals test positive given
## the number tested and a specified prevalence.
logLikelihood <- function(prevalence,
data = list(size = size, sampPos = sampPos))
dbinom(data$sampPos, size = data$size, prob = prevalence, log = T)
## A convenience function that sums the log-likelihood and the log-prior.
logLikePrior <- function(prevalence
, shape1 = 1
, shape2 = 1
, data = list(size = size, sampPos = sampPos))
logBetaPrior(prevalence, shape1, shape2) + logLikelihood(prevalence, data)
## Convenience functions
Prior <- function(x) exp(logBetaPrior(x))
Likelihood <- function(x) exp(logLikelihood(x))
LikePrior <- function(x) exp(logLikePrior(x))
## Plots
## Examine these plots, and write down what you think each of them means
par(mfrow = c(2,2) ## panels
, mar = c(3,6,1,1) ## panel margins
, bty='n' ## no box around plots
, oma = c(1.5,0,0,0)) ## outer margins
curve(Prior(x), 0, 1, ylab = 'prior')
curve(Likelihood, 0, 1, ylab = 'likelihood')
curve(LikePrior, 0, 1, ylab = 'likelihood X prior')
curve(logLikePrior, 0,1, ylab = 'log(likelihood X prior)')
mtext('prevalence', 1, 0, outer=T)
## runMCMC runs a Markov chain Monte Carlo (MCMC) Metropolis-Hastings algorithm to
## numerically estimate the posterior probability distribution of the prevalence. For problems this
## simple, the posterior probability distribution can be solved for analytically. However, for
## problems more complex than this one (such as the Introduction to MCMC-SI_HIV tutorial), numerical
## integration using MCMC may be the only way to calculate the posterior probability distribution
## for parameters.
## We sample the prevalence parameter on the logistic-scale so that it is bounded by [-Inf, Inf] and not [0,1]
runMCMC <- function(iterations
, startvalue = runif(1, logit(.01), logit(.99)) ## random starting value
, proposerSD = .5 ## standard deviation of the gaussian proposal distribution
, verbose = 0){ ## for debugging
if(verbose > 0) browser()
chain <- array(dim = c(iterations + 1, 1)) ## initialize empty iterations X 1 array
chain[1,] <- startvalue ## set first value
for(ii in 1:iterations){
proposal <- rnorm(1, chain[ii,], proposerSD) ## propose next value
MHratio <- exp(logLikePrior(inv.logit(proposal)) -
logLikePrior(inv.logit(chain[ii,])))
## If the MH-ratio is > 1, accept new value. Otherwise, accept it with probability equal to
## the the MH-ratio.
if(runif(1) < MHratio){
chain[ii+1,] <- proposal
}else{ ## If rejecting it, stay at the last state.
chain[ii+1,] <- chain[ii,]
}
}
return(inv.logit(chain)) ## Transform the chain of logit-prevalence values back into prevalences
}
posteriorSample <- runMCMC(1000, proposerSD = .1) ## sample 1000 times from posterior
par(defaultPar, bty = 'n')
plot(posteriorSample, xlab='iteration', ylab = 'prevalence', main = 'posterior sample',
type = 'l', las = 1)
## Compare proposal distributions
par(mfrow = c(2,2), oma = c(0,0,2,0))
for(sdVal in c(.05, .1, .5, 2)) {
posteriorSample <- runMCMC(1000, proposerSD = sdVal)
plot(posteriorSample, xlab='iteration', ylab = 'prevalence',
main = bquote(sigma==.(sdVal)),
ylim = c(0,1),
type = 'l', las = 1)
}
mtext('posterior sample by proposer sd', side = 3, line = 0, outer = T)
####################################################################################################
## Questions
####################################################################################################
## Question 1: Write code to compare histograms of posterior samples
## from proposal distributions with the 4 different proposal standard
## deviations above for 3000 samples. Make sure each histogram has the
## same breaks and x-axis limits.
## Question 2: Sample 4 chains for 3000 iterations using the same
## proposal distribution and plot each chains' trace on the same plot.
## Question 3: Use the Gelman-Rubin diagnostic (gelman.diag) to assess
## whether those four chains have reached convergence. You will need
## to convert the chains from 2 into "mcmc" objects (as.mcmc()), and
## then put them into an mcmc.list (as.mcmc.list())
## Challenge Question: (A) Plot the Gelman-Rubin diagnostic as a function
## of chain length. (B) Make the same plot, but after discarding the first 100
## iterations as a "burnin"