From e9b4e7797631cb21a75ec7e468b7c6280d24f74d Mon Sep 17 00:00:00 2001 From: Bob Carpenter Date: Sun, 12 Nov 2017 17:16:53 -0500 Subject: [PATCH] updated manual for 2.17, fixes #2336 --- src/docs/bibtex/all.bib | 17 +- src/docs/stan-reference/acknowledgements.tex | 16 +- src/docs/stan-reference/algorithms.tex | 16 +- src/docs/stan-reference/appendices.tex | 16 +- src/docs/stan-reference/contributed.tex | 10 +- src/docs/stan-reference/distributions.tex | 7 +- src/docs/stan-reference/examples.tex | 187 +++++++++--------- src/docs/stan-reference/functions.tex | 116 ++++++----- src/docs/stan-reference/introduction.tex | 27 ++- src/docs/stan-reference/language.tex | 118 ++++++++--- src/docs/stan-reference/preface.tex | 77 +++----- src/docs/stan-reference/process.tex | 66 +++---- src/docs/stan-reference/programming.tex | 186 +++++++++++++++-- .../mining-disasters/changepoint-missing.stan | 53 +++++ .../mining-disasters/changepoint.stan | 73 +++++++ .../programs/mining-disasters/data.R | 10 + .../programs/mining-disasters/fit.R | 34 ++++ 17 files changed, 702 insertions(+), 327 deletions(-) create mode 100644 src/docs/stan-reference/programs/mining-disasters/changepoint-missing.stan create mode 100644 src/docs/stan-reference/programs/mining-disasters/changepoint.stan create mode 100644 src/docs/stan-reference/programs/mining-disasters/data.R create mode 100644 src/docs/stan-reference/programs/mining-disasters/fit.R diff --git a/src/docs/bibtex/all.bib b/src/docs/bibtex/all.bib index f234aee13f9..185fe978796 100644 --- a/src/docs/bibtex/all.bib +++ b/src/docs/bibtex/all.bib @@ -1,6 +1,16 @@ % a do-nothing command that serves a purpose @preamble{ " \newcommand{\noop}[1]{} " } +@article{lancaster:2000, + title={The incidental parameter problem since 1948}, + author={Lancaster, Tony}, + journal={Journal of Econometrics}, + volume={95}, + number={2}, + pages={391--413}, + year={2000} +} + @manual{minpack:1980, Address = {9700 South Cass Avenue, Argonne, Illinois 60439}, Author 
= {Jorge J. More, Burton S. Garbow, Kenneth E. Hillstrom}, @@ -36,7 +46,7 @@ @article{AhnertMulansky:2011 volume={1110.3397} } -@article{DormandPrince:1980, +@article{DormandPrince:1980, author={Dormand, John R and Prince, Peter J}, year={1980}, title={A family of embedded {R}unge-{K}utta formulae}, @@ -67,11 +77,11 @@ @inproceedings{SerbanHindmarsh:2005 @article{HoetingEtAl:1999, title={Bayesian model averaging: a tutorial}, - author={Hoeting, Jennifer A. and Madigan, David + author={Hoeting, Jennifer A. and Madigan, David and Raftery, Adrian E and Volinsky, Chris T.}, journal={Statistical Science}, volume = {14}, - number = {4}, + number = {4}, pages={382--417}, year={1999} } @@ -1251,4 +1261,3 @@ @article{ZouHastie:2005 number = {2}, pages = {301--320} } - diff --git a/src/docs/stan-reference/acknowledgements.tex b/src/docs/stan-reference/acknowledgements.tex index b33b96e5fdc..2491284e5c0 100644 --- a/src/docs/stan-reference/acknowledgements.tex +++ b/src/docs/stan-reference/acknowledgements.tex @@ -103,6 +103,7 @@ \subsection*{Code and Doc Patches} Jacob Egner, Ashley Ford, Jan Gl\"ascher, +Jan Gleixner, Robert J.\ Goedman, Danny Goldstein, Tom Haber, @@ -139,9 +140,11 @@ \subsection*{Code and Doc Patches} Thanks for documentation bug reports and patches to: alvaro1101 (GitHub handle), +andrasm (GitHub handle), Avraham Adler, Chris Anderson, Asim, +Jens Astrom, Jarret Barber, Ryan Batt, Frederik Beaujean, @@ -152,6 +155,7 @@ \subsection*{Code and Doc Patches} Portia Brat, Arthur Breitman, Eric C.~Brown, +Eric N.~Brown, Juan Sebasti\'an Casallas, Alex Chase, Daniel Chen, @@ -159,6 +163,8 @@ \subsection*{Code and Doc Patches} Andy Choi, David Chudzicki, Michael Clerx, +Marco Colombo, +Luis Damiano, Andria Dawson, daydreamt (GitHub handle), Conner DiPaolo, @@ -175,6 +181,7 @@ \subsection*{Code and Doc Patches} Mauricio Garnier-Villarreal, Christopher Gandrud, Jonathan Gilligan, +Aaron Goodman, John Hall, David Hallvig, David Harris, @@ -205,15 +212,18 @@ 
\subsection*{Code and Doc Patches} Tamas Papp, Anders Gorm Pedersen, Tomi Peltola, +Alex Perrone, Andre Pfeuffer, Sergio Polini, Joerg Rings, Sean O'Riordain, Brendan Rocks, +David M.\ Rogers, Cody Ross, Mike Ross, Tony Rossini, Nathan Sanders, +Massimo Santini, James Savage, Terrance Savitsky, Dan Schrage, @@ -222,23 +232,27 @@ \subsection*{Code and Doc Patches} Janne Sinkkonen, skanskan (GitHub handle), Yannick Spill, +Trey Spiller, sskates (GitHub handle), +ssp3nc3r (GitHub handle), Martin Stjernman, Dan Stowell, Alexey Stukalov, Dougal Sutherland, John Sutton, Maciej Swat, +Jonathan Sweeney, J.~Takoua, Andrew J.~Tanentzap, Shravan Vashisth, Aki Vehtari, +Lukas Vermeer, Damjan Vukcevic, Matt Wand, Amos Waterland, Sebastian Weber, Sam Weiss, -Luke Wiklendt, +Lukasz Wiklendt, wrobell (GitHub handle), Howard Zail, Jon Zelner, and diff --git a/src/docs/stan-reference/algorithms.tex b/src/docs/stan-reference/algorithms.tex index 1b7bc6c0700..601847b49d1 100644 --- a/src/docs/stan-reference/algorithms.tex +++ b/src/docs/stan-reference/algorithms.tex @@ -53,7 +53,7 @@ \subsection{Auxiliary Momentum Variable} for details of the geometry. In Stan, this matrix may be set to the identity matrix (i.e., unit -diagonal) or estimated from warmup samples and optionally restricted +diagonal) or estimated from warmup draws and optionally restricted to a diagonal matrix. The inverse $\Sigma^{-1}$ is known as the mass matrix, and will be a unit, diagonal, or dense if $\Sigma$ is. @@ -384,7 +384,7 @@ \subsection{Discretization-Interval Adaptation Parameters} By setting the target acceptance parameter $\delta$ to a value closer to 1 (its value must be strictly less than 1 and its default value is 0.8), adaptation will be forced to use smaller step sizes. This can -improve sampling efficiency (effective samples per iteration) at the +improve sampling efficiency (effective sample size per iteration) at the cost of increased iteration times. 
Raising the value of $\delta$ will also allow some models that would otherwise get stuck to overcome their blockages. @@ -729,7 +729,7 @@ \section{Divergent Transitions} random walk and biasing estimates by not being able to thoroughly explore the posterior distribution. \cite{Betancourt:2016b} provides details of the theory, computation, and practical implications of -divergent transitions in Hamiltonian Monte Carlo. +divergent transitions in Hamiltonian Monte Carlo. The Stan interfaces report divergences as warnings and provide ways to access which iterations encountered divergences. ShinyStan provides @@ -737,7 +737,7 @@ \section{Divergent Transitions} transitions to diagnose where the divergences arise in parameter space. A common location is in the neck of the funnel in a centered parameterization (as in the funnel example discussed in -\refsection{reparameterization}). +\refsection{reparameterization}). If the posterior is highly curved, very small step sizes are required for this gradient-based simulation of the Hamiltonian to be accurate. @@ -1341,7 +1341,7 @@ \section{Unit Vector}\label{unit-vector.section} if it has unit Euclidean length, so that % \[ -\Vert x \Vert +\Vert x \Vert \ = \ \sqrt{x^{\top}\,x} \ = \ \sqrt{x_1^2 + x_2^2 + \cdots + x_n^2} \ = \ 1\ . @@ -2155,7 +2155,7 @@ \section{Stochastic Gradient Ascent} \subsection{Monte Carlo Approximation of the ELBO} ADVI uses Monte Carlo integration to approximate the variational -objective function, the ELBO. The number of samples used to +objective function, the ELBO. The number of draws used to approximate the ELBO is denoted by \texttt{elbo\_samples}. We recommend a default value of $100$, as we only evaluate the ELBO every \texttt{eval\_elbo} iterations, which also defaults to $100$. @@ -2163,7 +2163,7 @@ \subsection{Monte Carlo Approximation of the ELBO} \subsection{Monte Carlo Approximation of the Gradients} ADVI uses Monte Carlo integration to approximate the gradients of the -ELBO. 
The number of samples used to approximate the gradients is +ELBO. The number of draws used to approximate the gradients is denoted by \texttt{grad\_samples}. We recommend a default value of $1$, as this is the most efficient. It also a very noisy estimate of the gradient, but stochastic gradient ascent is capable of following @@ -2256,5 +2256,3 @@ \section{Speed Warning and Data Trimming} very long time, especially in models with latent parameters that grow with the data size. It can be helpful to diagnose a model with smaller data sizes in such cases. - - diff --git a/src/docs/stan-reference/appendices.tex b/src/docs/stan-reference/appendices.tex index f093a486438..192ae20c2ca 100644 --- a/src/docs/stan-reference/appendices.tex +++ b/src/docs/stan-reference/appendices.tex @@ -108,7 +108,7 @@ \chapter{Stan for Users of BUGS}\label{stan-for-bugs.appendix} Bugs. To start, take a look at the files of translated \BUGS models at -\url{http://mc-stan.org/}. These are 40 or so models from the \BUGS +\url{http://mc-stan.org}. These are 40 or so models from the \BUGS example volumes, all translated and tested (to provide the same answers as \BUGS) in Stan. For any particular model you want to fit, you can look for similar structures in these examples. @@ -526,13 +526,13 @@ \section{Some Differences when Running from R} \item Stan can be set up from within R using two lines of code. Follow the instructions for running Stan from R on - \url{http://mc-stan.org/}. You don't need to separately download + \url{http://mc-stan.org}. You don't need to separately download Stan and RStan. Installing RStan will automatically set up Stan. When RStan moves to CRAN, it will get even easier. \item In practice we typically run the same Stan model repeatedly. If you pass RStan the result of a previously fitted model the model will not need be recompiled. An example is given on the running - Stan from R pages available from \code{http://mc-stan.org/}. 
+ Stan from R pages available from \code{http://mc-stan.org}. \item When you run Stan, it saves various conditions including starting values, some control variables for the tuning and running of the no-U-turn sampler, and the initial random seed. You can @@ -560,7 +560,7 @@ \section{The Stan Community} \begin{itemize} \item Stan, like WinBUGS, OpenBUGS, and JAGS, has an active community, which you can access via the user's mailing list and the developer's - mailing list; see \code{http://mc-stan.org/} for information on + mailing list; see \code{http://mc-stan.org} for information on subscribing and posting and to look at archives. \end{itemize} @@ -663,7 +663,7 @@ \subsection{Expressions} \small \begin{Verbatim}[fontsize=\small] expressions ::= expression % ',' -expression ::= numeric_literal +expression ::= real_literal | variable | '{' expressions '}' | expression `?` expression `:` expression @@ -687,8 +687,6 @@ \subsection{Expressions} indexes ::= index % ',' -numeric_literal ::= integer_literal | real_literal - integer_literal ::= 0 | [1-9] [0-9]* real_literal ::= integer_literal ?('.' [0-9]*) ?exp_literal @@ -715,8 +713,8 @@ \subsection{Statements} | 'target' '+=' expression | 'break' | 'continue' - | 'print' '(' (expression | string_literal)% ',' ')' - | 'reject' '(' (expression | string_literal)% ',' ')' + | 'print' '(' (expression | string_literal) % ',' ')' + | 'reject' '(' (expression | string_literal) % ',' ')' | 'return' expression | '' diff --git a/src/docs/stan-reference/contributed.tex b/src/docs/stan-reference/contributed.tex index c7e9395deeb..74bf5f69f55 100644 --- a/src/docs/stan-reference/contributed.tex +++ b/src/docs/stan-reference/contributed.tex @@ -3,7 +3,7 @@ \part{Contributed Modules} \chapter{Contributed Modules} \noindent -Stan is an open-source project and welcomes user contributions. +Stan is an open-source project and welcomes user contributions. 
In order to reduce maintenance on the main trunk of Stan development and to allow developer-specified licenses, contributed Stan modules @@ -16,10 +16,10 @@ \section{Contributing a Stan Module} Stan developers either through one of the following. % \begin{itemize} -\item \code{stan-users} mailing list: +\item Stan forums: \\ -\url{https://groups.google.com/forum/?fromgroups#!forum/stan-users} -\item Stan e-mail: +\url{http://discourse.mc-stan.org} +\item Stan e-mail: \\ \href{mailto:mc.stanislaw@gmail.com}{\tt mc.stanislaw@gmail.com} \end{itemize} @@ -58,5 +58,3 @@ \subsection{Emacs Stan Mode} {\it Authors:} & Jeffrey Arnold, Daniel Lee \end{tabular} \end{quote} - - diff --git a/src/docs/stan-reference/distributions.tex b/src/docs/stan-reference/distributions.tex index 880b3119944..42ea37c5edd 100644 --- a/src/docs/stan-reference/distributions.tex +++ b/src/docs/stan-reference/distributions.tex @@ -118,7 +118,7 @@ \section{Cumulative Distribution Functions} \ = \ \mbox{Pr}[Y < y] \ = \ -\int_{-\infty}^y p(y \, | \, \theta) \ \mathrm{d}\theta. +\int_{-\infty}^y p(y \, | \, \theta) \ \mathrm{d}y. 
\] The complementary cumulative distribution function (CCDF) is defined as @@ -2931,8 +2931,3 @@ \subsubsection{Stan Functions} freedom \farg{nu} and symmetric and positive-definite scale matrix \farg{Sigma}; may only be used in generated quantities block} \end{description} - - - - - diff --git a/src/docs/stan-reference/examples.tex b/src/docs/stan-reference/examples.tex index f9d87d968df..069083b89b8 100644 --- a/src/docs/stan-reference/examples.tex +++ b/src/docs/stan-reference/examples.tex @@ -635,37 +635,36 @@ \section{Multi-Logit Regression}\label{multi-logit.section} int N; int D; int y[N]; - vector[D] x[N]; + matrix[N, D] x; } parameters { - matrix[K,D] beta; + matrix[D, K] beta; } model { - for (k in 1:K) - beta[k] ~ normal(0, 5); + matrix[N, K] x_beta = x * beta; + + to_vector(beta) ~ normal(0, 2); + for (n in 1:N) - y[n] ~ categorical(softmax(beta * x[n])); + y[n] ~ categorical_logit(x_beta[n]); } \end{stancode} % -See \refsection{softmax} for a definition of the softmax -function. A more efficient way to write the final line is +The matrix multiplication is pulled out to define a local variable for +all of the predictors. Then the prior on \code{beta} is coded in +vectorized form. Then, the categorical is expressed on the log-odds +(logit) scale; an equivalent form without the matrix multiplication + +The categorical distribution with log-odds (logit) scaled parameters +used above is equivalent to writing % \begin{stancode} - y[n] ~ categorical_logit(beta * x[n]); + y[n] ~ categorical(softmax(x[n] * beta)); \end{stancode} % -The \code{categorical\_logit} distribution is like the categorical -distribution, with the parameters on the logit scale (see +See \refsection{softmax} for a definition of the softmax function and \refsection{categorical-distribution} for a full definition of -\code{categorical\_logit}). 
- -The first loop may be made more efficient by vectorizing the first -loop by converting the matrix \code{beta} to a vector, -% -\begin{stancode} -to_vector(beta) ~ normal(0, 5); -\end{stancode} +\code{categorical\_logit}. \subsubsection{Constraints on Data Declarations} @@ -680,10 +679,10 @@ \subsubsection{Constraints on Data Declarations} % \begin{stancode} - int K; - int N; - int D; - int y[N]; + int K; + int N; + int D; + int y[N]; \end{stancode} % These constraints arise because the number of categories, \code{K}, @@ -727,8 +726,7 @@ \subsection{Identifiability} % \begin{stancode} transformed data { - row_vector[D] zeros; - zeros = rep_row_vector(0, D); + row_vector[D] zeros = rep_row_vector(0, D); } \end{stancode} % @@ -780,9 +778,7 @@ \subsection{$K-1$ Degrees of Freedom} transformed parameters { vector[K] beta; // centered - for (k in 1:(K-1)) { - beta[k] = beta_raw[k]; - } + beta[1 : K - 1] = beta_raw; beta[K] = -sum(beta_raw); ... \end{stancode} @@ -810,17 +806,16 @@ \subsection{Translated and Scaled Simplex} ... transformed parameters { vector[K] beta; - beta = beta_scale * (beta_raw - 1.0 / K); + beta = beta_scale * (beta_raw - inv(K)); ... \end{stancode} % -Given that \code{beta\_raw} sums to 1 because it is a simplex, the -elementwise subtraction of $1 / K$ is guaranteed to sum to zero (note -that the expression \code{1.0 / K} is used rather than \code{1 / K} to -prevent integer arithmetic rounding down to zero). Because the -magnitude of the elements of the simplex is bounded, a scaling factor -is required to provide \code{beta} with $K$ degrees of freedom -necessary to take on every possible value that sums to zero. +Here \code{inv(K)} is just a short way to write \code{1.0~/~K}. Given +that \code{beta\_raw} sums to 1 because it is a simplex, the +elementwise subtraction of \code{inv(K)} is guaranteed to sum to zero. 
+Because the magnitude of the elements of the simplex is bounded, a +scaling factor is required to provide \code{beta} with $K$ degrees of +freedom necessary to take on every possible value that sums to zero. With this parameterization, a Dirichlet prior can be placed on \code{beta\_raw}, perhaps uniform, and another prior put on @@ -1222,7 +1217,7 @@ \subsection{Multilevel 2PL Model} \begin{stancode} parameters { - real mu_beta; // mean student ability + real mu_beta; // mean question difficulty real alpha[J]; // ability for j - mean real beta[K]; // difficulty for k real gamma[K]; // discrimination of k @@ -1239,7 +1234,6 @@ \subsection{Multilevel 2PL Model} beta ~ normal(0, sigma_beta); gamma ~ lognormal(0, sigma_gamma); mu_beta ~ cauchy(0, 5); - sigma_alpha ~ cauchy(0, 5); sigma_beta ~ cauchy(0, 5); sigma_gamma ~ cauchy(0, 5); for (n in 1:N) @@ -1295,11 +1289,13 @@ \subsection{Multilevel 2PL Model} The intercept term \code{mu\_beta} can't itself be modeled hierarchically, so it is given a weakly informative $\distro{Cauchy}(0,5)$ prior. Similarly, the scale terms, -\code{sigma\_alpha}, \code{sigma\_beta}, and \code{sigma\_gamma}, are -given half-Cauchy priors. The truncation in the half-Cauchy prior is -implicit; explicit truncation is not necessary because the log -probability need only be calculated up to a proportion and the scale -variables are constrained to $(0,\infty)$ by their declarations. +\code{sigma\_beta}, and \code{sigma\_gamma}, are given half-Cauchy +priors. As mentioned earlier, the scale and location for \code{alpha} +are fixed to ensure identifiability. The truncation in the +half-Cauchy prior is implicit; explicit truncation is not necessary +because the log probability need only be calculated up to a proportion +and the scale variables are constrained to $(0,\infty)$ by their +declarations. @@ -2700,7 +2696,7 @@ \subsection{Identifiability and Stationarity}% it's worth adding the following constraint to ensure stationarity. 
% \begin{stancode} -read phi; +real phi; \end{stancode} @@ -2941,7 +2937,7 @@ \subsection{Calculating Sufficient Statistics} with a single multinomial distribution.% % \footnote{The program is available in the Stan example model repository; -see \url{http://mc-stan.org/documentation}.} +see \url{http://mc-stan.org/users/documentation}.} % The data is declared as before, but now a transformed data blocks computes the sufficient statistics for estimating the transition and @@ -2987,7 +2983,7 @@ \subsection{Analytic Posterior} multinomial. The following example% % \footnote{The program is available in the Stan example model repository; -see \url{http://mc-stan.org/documentation}.} +see \url{http://mc-stan.org/users/documentation}.} % illustrates how a Stan model can define the posterior analytically. This is possible in the Stan language because the model only needs to @@ -3042,7 +3038,7 @@ \subsection{Semisupervised Estimation} In Stan, the forward algorithm is coded as follows.% % \footnote{The program is available in the Stan example model repository; -see \url{http://mc-stan.org/documentation}.} +see \url{http://mc-stan.org/users/documentation}.} % First, two additional data variable are declared for the unsupervised data. 
@@ -3125,7 +3121,7 @@ \subsection{Predictive Inference} real best_logp[T_unsup, K]; real best_total_logp; for (k in 1:K) - best_logp[1, K] = log(phi[k, u[1]]); + best_logp[1, k] = log(phi[k, u[1]]); for (t in 2:T_unsup) { for (k in 1:K) { best_logp[t, k] = negative_infinity(); @@ -3942,15 +3938,15 @@ \subsection{Estimating Parameters of a Mixture} } parameters { simplex[K] theta; // mixing proportions - ordered mu[K]; // locations of mixture components + ordered[K] mu; // locations of mixture components vector[K] sigma; // scales of mixture components } model { - real log_theta[K] = log(theta); // cache log calculation + vector[K] log_theta = log(theta); // cache log calculation sigma ~ lognormal(0, 2); mu ~ normal(0, 10); for (n in 1:N) { - real lps[K] = log_theta; + vector[K] lps = log_theta; for (k in 1:K) lps[k] += normal_lpdf(y[n] | mu[k], sigma[k]); target += log_sum_exp(lps); @@ -4007,7 +4003,7 @@ \section{Vectorizing Mixtures} for (n in 1:N) target += log_mix(lambda, normal_lpdf(y[n] | mu[1], sigma[1]), - normal_lpdf(y[n] | mu[2], sigma[2]))); + normal_lpdf(y[n] | mu[2], sigma[2])); \end{stancode} % This definition assumes that each observation $y_n$ may have arisen @@ -4451,9 +4447,9 @@ \subsection{Regression with Measurement Error} % \begin{stancode} data { - int N; // number of cases - real x[N]; // predictor (covariate) - real y[N]; // outcome (variate) + int N; // number of cases + vector[N] x; // predictor (covariate) + vector[N] y; // outcome (variate) } parameters { real alpha; // intercept @@ -4805,7 +4801,7 @@ \subsubsection{Hierarchical Model} \footnote{The model provided for this data in \citep[Section~5.5]{GelmanEtAl:2013} is included with the data in the Stan example model repository, -\url{http://mc-stan.org/documentation}.} +\url{http://mc-stan.org/users/documentation}.} \subsubsection{Extensions and Alternatives} @@ -5380,7 +5376,7 @@ \subsection{Collective Cormack-Jolly-Seber Model} \\ \hline {2} & - & + & - & $\chi_2$ \\ -{3} & - & + & + 
& $\phi_2 \, \phi_3$ +{3} & - & + & + & $\phi_2 \, p_3$ \\ \hline {4} & + & - & - & $\chi_1$ \\ @@ -6244,7 +6240,7 @@ \subsection{Stan Implementation of Soft $K$-Means} clustering.% % \footnote{The model is available in the Stan example model repository; -see \url{http://mc-stan.org/documentation}.} +see \url{http://mc-stan.org/users/documentation}.} % \begin{stancode} data { @@ -6445,7 +6441,7 @@ \subsection{Estimation with Category-Labeled Training Data} as follows% % \footnote{This model is available in the example model repository; - see \url{http://mc-stan.org/documentation}.} + see \url{http://mc-stan.org/users/documentation}.} % \begin{stancode} data { @@ -6964,7 +6960,7 @@ \section{Simulating from a Gaussian Process} implementation of a Gaussian process simulator.% % \footnote{This model is available in the example model repository; - see \url{http://mc-stan.org/documentation}.} + see \url{http://mc-stan.org/users/documentation}.} % \begin{stancode} data { @@ -7002,7 +6998,7 @@ \section{Simulating from a Gaussian Process} transformed data { matrix[N, N] K = cov_exp_quad(x, 1.0, 1.0); vector[N] mu = rep_vector(0, N); - for (n in 1:N) + for (n in 1:N) K[n, n] = K[n, n] + 0.1; } parameters { @@ -7024,7 +7020,7 @@ \subsection{Multivariate Inputs} multivariate model.% % \footnote{The model is available in the Stan example model repository; -see \url{http://mc-stan.org/documentation}.} +see \url{http://mc-stan.org/users/documentation}.} % The only lines that change from the univariate model above are as follows. 
% @@ -7068,7 +7064,7 @@ \subsection{Cholesky Factored and Transformed Implementation} simulation.% % \footnote{The code is available in the Stan example model repository; -see \url{http://mc-stan.org/documentation}.} +see \url{http://mc-stan.org/users/documentation}.} % This model has the same data declarations for \code{N} and \code{x}, and the same transformed data definitions of \code{mu} and @@ -7153,7 +7149,7 @@ \subsection{GP with a normal outcome} the transformed data block. % \footnote{The program code is available in the Stan example model repository; -see \url{http://mc-stan.org/documentation}.} +see \url{http://mc-stan.org/users/documentation}.} % \begin{stancode} @@ -7178,9 +7174,9 @@ \subsection{GP with a normal outcome} // diagonal elements for (n in 1:N) K[n, n] = K[n, n] + sq_sigma; - + L_K = cholesky_decompose(K); - + rho ~ inv_gamma(5, 5); alpha ~ normal(0, 1); sigma ~ normal(0, 1); @@ -7241,15 +7237,15 @@ \subsubsection{Latent variable GP} { matrix[N, N] L_K; matrix[N, N] K = cov_exp_quad(x, alpha, rho); - + // diagonal elements for (n in 1:N) K[n, n] = K[n, n] + delta; - + L_K = cholesky_decompose(K); f = L_K * eta; } - + rho ~ inv_gamma(5, 5); alpha ~ normal(0, 1); sigma ~ normal(0, 1); @@ -7420,7 +7416,7 @@ \subsection{Automatic Relevance Determination} matrix[N, N] L_K = L_cov_exp_quad_ARD(x, alpha, rho, delta); f = L_K * eta; } - + rho ~ inv_gamma(5, 5); alpha ~ normal(0, 1); sigma ~ normal(0, 1); @@ -7465,13 +7461,14 @@ \subsubsection{Priors for length-scale} & x \in \reals^{N \times D}, \, f \in \reals^N \end{align*} % -we likely don't need constraints beyond penalizing small length-scales. We'd -like to allow the GP prior to represent both high-frequency and low-frequency -functions, so our prior should put non-negligible mass on both sets of -functions. 
In this case, an inverse gamma, \code{inv\_gamma\_lpdf} in Stan's -language, will work well as it has a sharp left tail that puts negligible mass -on infinitesimal length-scales, but a generous right tail, allowing for large -length-scales. Inverse gamma priors will avoid infinitesimal length-scales +we likely don't need constraints beyond penalizing small +length-scales. We'd like to allow the GP prior to represent both +high-frequency and low-frequency functions, so our prior should put +non-negligible mass on both sets of functions. In this case, an +inverse gamma, \code{inv\_gamma\_lpdf} in Stan’s language, will work +well as it has a sharp left tail that puts negligible mass on +infinitesimal length-scales, but a generous right tail, allowing for +large length-scales. Inverse gamma priors will avoid infinitesimal length-scales because the density is zero at zero, so the posterior for length-scale will be pushed away from zero. An inverse gamma distribution is one of many zero-avoiding or boundary-avoiding distributions. See @@ -7591,11 +7588,11 @@ \subsection{Predictive Inference with a Gaussian Process} { matrix[N, N] L_K; matrix[N, N] K = cov_exp_quad(x, alpha, rho); - + // diagonal elements for (n in 1:N) K[n, n] = K[n, n] + delta; - + L_K = cholesky_decompose(K); f = L_K * eta; } @@ -7635,7 +7632,7 @@ \subsection{Predictive Inference with a Gaussian Process} \code{y2} by sampling \code{N2} univariate normals with each mean corresponding to the appropriate element in \code{f}. \footnote{The program code is available in the Stan example model repository; -see \url{http://mc-stan.org/documentation}.} +see \url{http://mc-stan.org/users/documentation}.} % \subsubsection{Predictive Inference in non-Gaussian GPs} @@ -7647,7 +7644,7 @@ \subsubsection{Predictive Inference in non-Gaussian GPs} process regression. 
% \footnote{The model is available in the Stan example model repository; -see \url{http://mc-stan.org/documentation}.} +see \url{http://mc-stan.org/users/documentation}.} % % \begin{stancode} @@ -7676,11 +7673,11 @@ \subsubsection{Predictive Inference in non-Gaussian GPs} { matrix[N, N] L_K; matrix[N, N] K = cov_exp_quad(x, alpha, rho); - + // diagonal elements for (n in 1:N) K[n, n] = K[n, n] + delta; - + L_K = cholesky_decompose(K); f = L_K * eta; } @@ -7728,7 +7725,7 @@ \subsubsection{Analytical Form of Joint Predictive Inference} % \footnote{The program code is available in the Stan example model repository; -see \url{http://mc-stan.org/documentation}.} +see \url{http://mc-stan.org/users/documentation}.} % This Stan code below uses the analytic form of the posterior and provides sampling of the resulting multivariate normal through the Cholesky @@ -7767,7 +7764,7 @@ \subsubsection{Analytical Form of Joint Predictive Inference} K_div_y1 = mdivide_left_tri_low(L_K, y1); K_div_y1 = mdivide_right_tri_low(K_div_y1',L_K)'; k_x1_x2 = cov_exp_quad(x1, x2, alpha, rho); - f2_mu = (k_x1_x2' * K_div_y1); + f2_mu = (k_x1_x2' * K_div_y1); v_pred = mdivide_left_tri_low(L_K, k_x1_x2); cov_f2 = cov_exp_quad(x2, alpha, rho) - v_pred' * v_pred; diag_delta = diag_matrix(rep_vector(delta,N2)); @@ -7798,14 +7795,14 @@ \subsubsection{Analytical Form of Joint Predictive Inference} { matrix[N1, N1] K = cov_exp_quad(x1, alpha, rho); real sq_sigma = square(sigma); - + // diagonal elements for (n1 in 1:N1) K[n1, n1] = K[n1, n1] + sq_sigma; - + L_K = cholesky_decompose(K); } - + rho ~ inv_gamma(5, 5); alpha ~ normal(0, 1); sigma ~ normal(0, 1); @@ -7905,7 +7902,7 @@ \subsection{Multiple-output Gaussian processes} // diagonal elements for (n in 1:N) - K[n, n] = K[n, n] + delta; + K[n, n] = K[n, n] + delta; L_K = cholesky_decompose(K); f = L_K * eta @@ -8081,7 +8078,7 @@ \section{Example: System of Nonlinear Algebraic Equations} % As an illustrative example, we consider a nonlinear system of 
two equations with two unknowns. Our goal is to simultaneously solve all equations for -$y_1$ and $y_2$, such that the vector $z$ goes to 0. +$y_1$ and $y_2$, such that the vector $z$ goes to 0. % \begin{eqnarray}\label{algebra.equation} \begin{aligned} @@ -8119,7 +8116,7 @@ \section{Coding an Algebraic System} unused arguments must be included in the system function with exactly the signature above. -\subsubsection{Strict Signature} +\subsubsection{Strict Signature} % The function defining the system must have exactly these argument types and return type. This may require passing in zero-length arrays for data or a zero-length @@ -8134,7 +8131,7 @@ \section{Calling the Algebraic Solver} proposal gets rejected and a warning message, stating no acceptable solution was found, is issued. -The solver has three tuning parameters to determine convergence: the relative tolerance, +The solver has three tuning parameters to determine convergence: the relative tolerance, the function tolerance, and the maximum number of steps. Their default values are respectively \code{1e-10} ($10^{-10}$), \code{1e-6} ($10^{-6}$), and \code{1e3} ($10^3$). Their behavior is explained in \refsection{algebra-control}. @@ -8173,7 +8170,7 @@ \subsection{Length of the Algebraic Function and of the Vector of Unknowns} the system is the same as the number of unknowns.} \subsection{Pathological Solutions} -Certain systems may be degenerate, meaning they have multiple solutions. The +Certain systems may be degenerate, meaning they have multiple solutions. The algebraic solver will not report these cases, as the algorithm stops once it has found an acceptable solution. The initial guess will often determine which solution gets found first. The degeneracy may be broken by putting additional constraints on the solution. @@ -8190,7 +8187,7 @@ \section{Control Parameters for the Algebraic Solver}\label{algebra-control.sect supplied. 
% \begin{stancode} - y = algebra_solver(system, y_guess, theta, x_r, x_i, + y = algebra_solver(system, y_guess, theta, x_r, x_i, rel_tol, f_tol, max_steps); \end{stancode} @@ -8209,7 +8206,7 @@ \subsection{Tolerance} geometrically. Ideally the output of the algebraic function is at the origin; the norm measures deviations from this ideal. As the length of the return vector increases, a certain function tolerance becomes an increasingly difficult criterion to meet, given each -individual element of the vector contribute to the norm. +individual element of the vector contribute to the norm. Smaller relative tolerances produce more accurate solutions but require more computational time. @@ -8221,7 +8218,7 @@ \subsection{Maximum Number of Steps} % The maximum number of steps can be used to stop a runaway simulation. This can arise in MCMC when a bad jump is taken, particularly during warmup. If the limit is hit, the -current metropolis proposal gets rejected. Users will see a warning message stating the +current metropolis proposal gets rejected. Users will see a warning message stating the maximum number of steps has been exceeded. diff --git a/src/docs/stan-reference/functions.tex b/src/docs/stan-reference/functions.tex index d38169fb155..b4863c6dd6c 100644 --- a/src/docs/stan-reference/functions.tex +++ b/src/docs/stan-reference/functions.tex @@ -4,18 +4,21 @@ \chapter{Void Functions} Stan does not technically support functions that do not return values. It does support two types of statements that look like functions, one -for incrementing log probabilities and one for printing. -Documentation on these functions is included here for completeness. -The special keyword \code{void} is used for the return type of void -functions. - -\section{Print} - -The \code{print} statement is unique among Stan's syntactic constructs -in two ways. First, it is the only function-like construct that -allows a variable number of arguments. 
Second, it is the only
-function-like construct to accept string literals (e.g., \code{"hello
-  world"}) as arguments.
+for printing and one for rejecting outputs.  Documentation on these
+functions is included here for completeness.  The special keyword
+\code{void} is used for the return type of void functions, because
+they behave like variadic functions with void return type, even though
+they are special kinds of statements.
+
+Although \code{print} and \code{reject} appear to have the syntax of
+functions, they are actually special kinds of statements with slightly
+different form and behavior than other functions.  First, they are the
+only constructs that allow a variable number of arguments.  Second, they
+are the only constructs to accept string literals (e.g.,
+\code{"hello world"}) as arguments.  Third, they have no effect on the
+log density function and operate solely through side effects.
+
+\section{Print ``Function''}

Printing has no effect on the model's log probability function.  Its
sole purpose is the side effect (i.e., an effect not represented in a
@@ -26,7 +29,7 @@ \section{Print}
\begin{description}
\fitem{void}{print}{T1 \farg{x1},..., TN \farg{xN}}{Print the values
  denoted by the arguments \farg{x1} through \farg{xN} on the
-  standard output stream.  There are no spaces between items in the
+  output message stream.  There are no spaces between items in the
  print, but a line feed (LF; Unicode U+000A; \Cpp literal
  \code{'\textbackslash{n}'}) is inserted at the end of the printed
  line.  The types \code{T1} through \code{TN} can be any of Stan's
@@ -37,6 +40,28 @@ \section{Print}
The full behavior of the \code{print} statement with examples is
documented in \refsection{print-statements}.

+\section{Reject ``Function''}
+
+The reject statement has the same syntax as the print statement,
+accepting an arbitrary number of arguments of any type (including
+string literals).  
The effect of executing a reject statement is to
+throw an exception internally that terminates the current iteration
+with a rejection (the behavior of which will depend on the algorithmic
+context in which it occurs).
+%
+\begin{description}
+  \fitem{void}{reject}{T1 \farg{x1},..., TN \farg{xN}}{Reject the
+    current iteration and print the values denoted by the arguments
+    \farg{x1} through \farg{xN} on the output message stream.  There
+    are no spaces between items in the print, but a line feed (LF;
+    Unicode U+000A; \Cpp literal \code{'\textbackslash{n}'}) is
+    inserted at the end of the printed line.  The types \code{T1}
+    through \code{TN} can be any of Stan's built-in numerical types or
+    double quoted strings of ASCII characters.}
+\end{description}
+%
+The full behavior of the \code{reject} statement with examples is
+documented in \refsection{reject-statements}.

\chapter{Integer-Valued Basic Functions}

@@ -162,14 +187,16 @@ \section{Absolute Functions}
\fitemnobody{int}{int\_step}{int \farg{x}}
%
\fitem{int}{int\_step}{real \farg{x}}{
-  Returns the integer step, or Heaviside, function of \farg{x}.  
+  Returns the step function of \farg{x} as an integer,
  \[
  \mbox{int\_step}(x)
  =
  \begin{cases}
-    0 & \mbox{if } x \leq 0 \\
-    1 & \mbox{if } x > 0
+    1 & \mbox{if } x > 0 \\
+    0 & \mbox{if } x \leq 0 \mbox{ or } x \mbox{ is NaN}
  \end{cases}
-  \]}
+  \]
+{\it Warning:} \code{int\_step(0)} and \code{int\_step(NaN)} return 0
+whereas \code{step(0)} and \code{step(NaN)} return 1.}
%
\end{description}
%
@@ -577,11 +604,13 @@ \subsection{Logical Functions}
  \[
  \mbox{step}(x)
  =
  \begin{cases}
-    1 & \mbox{if } x > 0 \\
-    0 & \mbox{otherwise}
+    0 & \mbox{if } x < 0 \\
+    1 & \mbox{otherwise}
  \end{cases}
-  \]}
-  %
+  \]
+{\it Warning:} \code{int\_step(0)} and \code{int\_step(NaN)} return 0
+whereas \code{step(0)} and \code{step(NaN)} return 1.}
+%
\end{description}
%
The step function is often used in BUGS to perform conditional
@@ -1531,16 +1560,16 @@ \section{Reductions}\label{array-reductions.section}
\subsection{Minimum and Maximum}
\begin{description}
-\fitem{real}{min}{real \farg{x}[]}{
+\fitem{real}{min}{real[] \farg{x}}{
  The minimum value in \farg{x}, or $+\infty$  if \farg{x} is size 0.}
%
-\fitem{int}{min}{int \farg{x}[]}{
+\fitem{int}{min}{int[] \farg{x}}{
  The minimum value in \farg{x}, or error if \farg{x} is size 0.}
%
-\fitem{real}{max}{real \farg{x}[]}{
+\fitem{real}{max}{real[] \farg{x}}{
  The maximum value in \farg{x}, or $-\infty$ if \farg{x} is size 0.}
%
-\fitem{int}{max}{int \farg{x}[]}{
+\fitem{int}{max}{int[] \farg{x}}{
  The maximum value in \farg{x}, or error if \farg{x} is size 0.}
%
\end{description}
@@ -1549,7 +1578,7 @@ \subsection{Sum, Product, and Log Sum of Exp}

\begin{description}
%
-\fitem{int}{sum}{int \farg{x}[]}{
+\fitem{int}{sum}{int[] \farg{x}}{
  The sum of the elements in \farg{x}, defined for $x$ of size $N$ by
\[
\mbox{\code{sum}}(x)
@@ -1562,13 +1591,13 @@ \subsection{Sum, Product, and Log Sum of Exp}
\]
}
%
-\fitem{real}{sum}{real \farg{x}[]}{
+\fitem{real}{sum}{real[] \farg{x}}{
  The sum of the elements in \farg{x}; see definition above.}  
% -\fitem{real}{prod}{real \farg{x}[]}{ +\fitem{real}{prod}{real[] \farg{x}}{ The product of the elements in \farg{x}, or 1 if \farg{x} is size 0.} % -\fitem{real}{prod}{int \farg{x}[]}{ +\fitem{real}{prod}{int[] \farg{x}}{ The product of the elements in \farg{x}, \[ \mbox{\code{product}}(x) = @@ -1580,7 +1609,7 @@ \subsection{Sum, Product, and Log Sum of Exp} \] } % -\fitem{real}{log\_sum\_exp}{real \farg{x}[]}{ +\fitem{real}{log\_sum\_exp}{real[] \farg{x}}{ The natural logarithm of the sum of the exponentials of the elements in \farg{x}, or $-\infty$ if the array is empty.} \end{description} @@ -1603,7 +1632,7 @@ \subsection{Sample Mean, Variance, and Standard Deviation} sample deviation, but is not unbiased. \begin{description} -\fitem{real}{mean}{real \farg{x}[]}{ +\fitem{real}{mean}{real[] \farg{x}}{ The sample mean of the elements in \farg{x}. For an array $x$ of size $N > 0$, @@ -1617,7 +1646,7 @@ \subsection{Sample Mean, Variance, and Standard Deviation} It is an error to the call the mean function with an array of size $0$. } % -\fitem{real}{variance}{real \farg{x}[]}{ +\fitem{real}{variance}{real[] \farg{x}}{ The sample variance of the elements in \farg{x}. For $N > 0$, \[ \mbox{\code{variance}}(x) @@ -1632,7 +1661,7 @@ \subsection{Sample Mean, Variance, and Standard Deviation} It is an error to call the \code{variance} function with an array of size 0. } % -\fitem{real}{sd}{real \farg{x}[]}{The sample standard deviation of +\fitem{real}{sd}{real[] \farg{x}}{The sample standard deviation of elements in \farg{x}. 
\[ \mbox{sd}(x) @@ -1683,12 +1712,12 @@ \subsection{Euclidean Distance and Squared Distance} to call \code{squared\_distance} with arguments of unequal size.} % \fitem{real}{squared\_distance}{vector \farg{x}, row\_vector - \farg{y}[]}{The squared Euclidean distance between \farg{x} and \farg{y}} + [] \farg{y}}{The squared Euclidean distance between \farg{x} and \farg{y}} % \fitem{real}{squared\_distance}{row\_vector \farg{x}, vector - \farg{y}[]}{The squared Euclidean distance between \farg{x} and \farg{y}} + [] \farg{y}}{The squared Euclidean distance between \farg{x} and \farg{y}} % -\fitem{real}{squared\_distance}{row\_vector \farg{x}, row\_vector \farg{y}[]}{The Euclidean +\fitem{real}{squared\_distance}{row\_vector \farg{x}, row\_vector[] \farg{y}}{The Euclidean distance between \farg{x} and \farg{y}} % \end{description} @@ -1823,7 +1852,7 @@ \section{Array Concatenation}\label{array-concatenation.section} \begin{description} \fitem{T}{append\_array}{T \farg{x}, T \farg{y}}{ Return the concatenation of two arrays in the order of the arguments. - T must be an N-dimensional array of any Stan type (with a maximum N of 8). All + T must be an N-dimensional array of any Stan type (with a maximum N of 7). All dimensions but the first must match. } \end{description} % @@ -3452,7 +3481,7 @@ \chapter{Compound Arithmetic and Assignment} The compound arithmetic and assignment operations are coded directly as statements, as described in \refsection{compound-arithmetic-assignment}. They allow statements of -the form +the form % \begin{stancode} x = x op y; @@ -3583,7 +3612,7 @@ \section{Compound Elementwise Division and Assignment} \chapter{Algebraic Equation Solver} \label{functions-algebraic-solver.chapter} -Stan provides a built-in algebraic equation solver. Although it +Stan provides a built-in algebraic equation solver. Although it looks like other function applications, the algebraic solver is special in two ways. 
@@ -3636,7 +3665,7 @@ \section{Call to the Algebraic Solver} {vector \farg{theta}, real[] \farg{x\_r}, int[] \farg{x\_i}}% {Solves the algebraic system, given an initial guess, using the Powell hybrid algorithm.} - + \fitemfourlines{vector}{algebra\_solver}% {function \farg{algebra\_system}}% {vector \farg{y\_guess}}% @@ -3685,10 +3714,10 @@ \subsection{Return Value} \subsection{Sizes and Parallel Arrays} Certain sizes have to be consistent. The initial guess, return value of the solver, and -return value of the algebraic function must all be the same size. +return value of the algebraic function must all be the same size. The parameters, real data, and integer data will be passed from the solver directly to the -system function. +system function. \subsection{Algorithmic Details} @@ -3872,6 +3901,3 @@ \subsection{Example} An example of a complete Stan program with a system definition and solver call is shown for data simulation in \reffigure{sho-sim} and estimation in \reffigure{sho-both}. - - - diff --git a/src/docs/stan-reference/introduction.tex b/src/docs/stan-reference/introduction.tex index da29c6d853e..441a5c5a0cd 100644 --- a/src/docs/stan-reference/introduction.tex +++ b/src/docs/stan-reference/introduction.tex @@ -18,7 +18,7 @@ \section{Stan Home Page} the Stan home page: % \begin{quote} -\url{http://mc-stan.org/} +\url{http://mc-stan.org} \end{quote} @@ -42,7 +42,7 @@ \subsection{CmdStan} standalone document. 
The CmdStan home page is % \begin{quote} -\url{http://mc-stan.org/cmdstan.html} +\url{http://mc-stan.org/users/interfaces/cmdstan.html} \end{quote} \subsection{RStan} @@ -53,7 +53,7 @@ \subsection{RStan} home page is % \begin{quote} -\url{http://mc-stan.org/rstan.html} +\url{http://mc-stan.org/users/interfaces/rstan.html} \end{quote} \subsection{PyStan} @@ -63,7 +63,7 @@ \subsection{PyStan} The PyStan home page is % \begin{quote} -\url{http://mc-stan.org/pystan.html} +\url{http://mc-stan.org/users/interfaces/pystan.html} \end{quote} \subsubsection{MatlabStan} @@ -73,7 +73,7 @@ \subsubsection{MatlabStan} MatlabStan home page is % \begin{quote} -\url{http://mc-stan.org/matlab-stan.html} +\url{http://mc-stan.org/users/interfaces/matlab-stan.html} \end{quote} % @@ -83,7 +83,7 @@ \subsubsection{Stan.jl} wraps a CmdStan process. The Stan.jl home page is % \begin{quote} -\url{http://mc-stan.org/julia-stan.html} +\url{http://mc-stan.org/users/interfaces/julia-stan.html} \end{quote} % @@ -93,7 +93,7 @@ \subsubsection{StataStan} wraps a CmdStan process. The StataStan home page is % \begin{quote} -\url{http://mc-stan.org/stata-stan.html} +\url{http://mc-stan.org/users/interfaces/stata-stan.html} \end{quote} \subsubsection{MathematicaStan} @@ -103,7 +103,7 @@ \subsubsection{MathematicaStan} MathematicaStan home page is % \begin{quote} -\url{http://mc-stan.org/mathematica-stan.html} +\url{http://mc-stan.org/users/interfaces/mathematica-stan.html} \end{quote} @@ -122,7 +122,7 @@ \section{Stan Programs} other types. Variables are declared in blocks corresponding to the variable's use: data, transformed data, parameter, transformed parameter, or generated quantity. Unconstrained local variables may -be declared within statement blocks. +be declared within statement blocks. 
The transformed data, transformed parameter, and generated quantities blocks contain statements defining the variables declared in their @@ -295,7 +295,7 @@ \subsection{Convergence Monitoring and Effective Sample Size} warmup iterations. At most this rerunning strategy will consume about 50\% more cycles than guessing the correct number of iterations at the outset.} - + When estimating a mean based on a sample of $M$ independent draws, the estimation error is proportional to $1/\sqrt{M}$. If the draws are positively correlated, as they typically are when drawn using \MCMC @@ -303,7 +303,7 @@ \subsection{Convergence Monitoring and Effective Sample Size} where {\sc n\_eff} is the effective sample size. Thus it is standard practice to also monitor (an estimate of) the effective sample size until it is large enough for the estimation or inference task at -hand. +hand. \subsection{Bayesian Inference and Monte Carlo Methods} @@ -349,7 +349,7 @@ \section{Optimization} If the prior is uniform, the posterior mode corresponds to the maximum likelihood estimate (MLE) of the parameters. If the prior is not uniform, the posterior mode is sometimes called the maximum a -posteriori (MAP) estimate. +posteriori (MAP) estimate. For optimization, the Jacobian of any transforms induced by constraints on variables are ignored. It is more efficient in many @@ -407,6 +407,3 @@ \section{Variational Inference} differentiation toolbox \citep{Kucukelbir:2015}. ADVI circumvents all of the mathematics typically required to derive variational inference algorithms; it works with any Stan model. 
- - - diff --git a/src/docs/stan-reference/language.tex b/src/docs/stan-reference/language.tex index d9e185be979..9e47fd36507 100644 --- a/src/docs/stan-reference/language.tex +++ b/src/docs/stan-reference/language.tex @@ -9,18 +9,31 @@ \chapter{Encodings, Includes, and Comments} \section{Character Encoding} -\subsection{Stan Program} +\subsection{Content} -The content of a Stan program must be coded in ASCII. Extended -character sets such as UTF-8 encoded Unicode may not be used for -identifiers or other text in a program. +The content of a Stan program must be coded in ASCII. All identifiers +must consist of only ASCII alpha-numeric characters and the underscore +character. All arithmetic operators and punctuation must be coded in +ASCII. + +\subsubsection{Compatibility with Latin-1 and UTF-8} + +The UTF-8 encoding of Unicode and the Latin-1 (ISO-8859-1) encoding +share the first 128 code points with ASCII and thus cannot be +distinguished from ASCII. That means you can set editors, etc., to +use UTF-8 or Latin-1 (or the other Latin-n variants) without worrying +that the content of a Stan program will be destroyed. \subsection{Comments} -The content of comments is ignored by the language compiler and may be -written using any character encoding (e.g., ASCII, UTF-8, Latin1, -Big5). The comment delimiters themselves must be coded in ASCII. +Any bytes on a line after a line-comment sequence (\code{//} or +\Verb|#|) are ignored up until the ASCII newline character +(\Verb|\n|). They may thus be written in any character encoding which +is convenient. +Any content after a block comment open sequence in ASCII (\code{/*}) +up to the closing block comment (\code{*/}) is ignored, and thus may +also be written in whatever character set is convenient. 
\section{Includes}\label{includes.section} @@ -281,7 +294,7 @@ \subsection{Constrained Data Types} declared constraints but do not have finite log density, then the samplers and optimizers may have any of a number of pathologies including just getting stuck, failure to initialize, excessive -Metropolis rejection, or biased samples due to inability to explore +Metropolis rejection, or biased draws due to inability to explore the tails of the distribution. @@ -503,6 +516,36 @@ \subsection{Expressions as Bounds} in the example code, the functions \code{min()} and \code{max()} may be applied to containers such as arrays. +\subsection{Declaring Optional Variables} + +A variable may be declared with a size that depends on a boolean +constant. For example, consider the definition of \code{alpha} in the +following program fragment. +% +\begin{stancode} +data { + int include_alpha; + ... +parameters { + vector[include_alpha ? N : 0] alpha; +\end{stancode} +% +If \code{include\_alpha} is true, the model will include the vector +\code{alpha}; if the flag is false, the model will not include +\code{alpha} (technically, it will include \code{alpha} of size 0, +which means it won't contain any values and won't be included in any +output). + +This technique is not just useful for containers. If the value of +\code{N} is set to 1, then the vector \code{alpha} will contain a +single element and thus \code{alpha[1]} behaves like an optional +scalar, the existence of which is controlled by \code{include\_alpha}. + +This coding pattern allows a single Stan program to define different +models based on the data provided as input. This strategy is used +extensively in the implementation of the RStanArm package (see +\url{http://mc-stan.org/users/interfaces/rstanarm}). 
+ \section{Vector and Matrix Data Types} Stan provides three types of container objects: arrays, vectors, and @@ -655,8 +698,8 @@ \subsection{Positive, Ordered Vectors} positive_ordered[5] d; \end{stancode} % -Like ordered vectors, after their declaration positive ordered vectors -assigned to other vectors and other vectors may be assigned to them. +Like ordered vectors, after their declaration, positive ordered vectors +may be assigned to other vectors and other vectors may be assigned to them. Constraints will be checked after executing the block in which the variables were declared. @@ -1378,17 +1421,18 @@ \subsection{Real Literals} A number written with a period or with scientific notation is assigned to a the continuous numeric type \code{real}. Real literals are -written in base 10 with a period (\code{.}) as a separator. Examples +written in base 10 with a period (\code{.}) as a separator and +optionally an exponent with optional sign. Examples of well-formed real literals include the following. % \begin{quote} \code{0.0}, \ \code{1.0}, \ \code{3.14}, \ \code{-217.9387}, \ -\code{2.7e3}, \ \code{-2E-5} +\code{2.7e3}, \ \code{-2E-5}, \ \code{1.23e+3}. \end{quote} % The notation \code{e} or \code{E} followed by a positive or negative integer denotes a power of 10 to multiply. For instance, \code{2.7e3} -denotes $2.7 \times 10^3$ and \code{-2E-5} denotes $-2 \times +and \code{2.7e+3} denote $2.7 \times 10^3$, whereas \code{-2E-5} denotes $-2 \times 10^{-5}$. @@ -2642,7 +2686,7 @@ \subsection{Errors Due to Chain Rule} x ~ normal(0, 1); \end{stancode} % -and it would seem the model should produce unit normal samples for +and it would seem the model should produce unit normal draws for \code{x}. But rather than canceling, the expression \code{sqrt(x - x)} causes a problem for derivatives. 
The cause is the mechanistic evaluation of the chain rule,
@@ -2717,6 +2761,17 @@ \chapter{Statements}
local variables to be declared in blocks and also allows an empty
statement consisting only of a semicolon.

+\section{Statement Block Contexts}
+
+The data and parameters blocks do not allow statements of any kind
+because these blocks are solely used to declare the data variables for
+input and the parameter variables for sampling.  All other blocks
+allow statements.  In these blocks, both variable declarations and
+statements are allowed.  The first statement that is not a variable
+declaration ends the list of block variable declarations.  See
+\refchapter{blocks} for more information about the block structure of
+Stan programs.
+

\section{Assignment Statement}\label{assignment-statement.section}

@@ -3989,8 +4044,7 @@ \section{Reject Statements}\label{reject-statements.section}

The Stan \code{reject} statement provides a mechanism to report
errors or problematic values encountered during program
-execution and either halt processing or reject samples or optimization
-iterations.
+execution and either halt processing or reject iterations.

Like the \code{print} statement, the reject statement accepts any
number of quoted string literals or Stan expressions as arguments.

@@ -4225,7 +4279,7 @@ \subsection{Transformed Variables}
variables scope over every statement that follows them, transformed
data variables may be defined in terms of the data variables.

-Before generating any samples, data variables are read in, then the
+Before generating any draws, data variables are read in, then the
transformed data variables are declared and the associated statements
executed to define them.  This means the statements in the transformed
data block are only ever evaluated once.%
@@ -4481,7 +4535,7 @@ \subsection{Variable Reads and Transformations}

The \code{data} block is for the declaration of variables that are
read in as data.  
With the current model executable, each Markov chain -of samples will be executed in a different process, and each such +of draws will be executed in a different process, and each such process will read the data exactly once.% % \footnote{With multiple threads, or even running chains sequentially @@ -4622,14 +4676,14 @@ \subsection{Gradient Calculation} unconstrained parameters, but this is usually dwarfed by the gradient calculations. -\subsection{Writing Samples} +\subsection{Writing Draws} -In the basic Stan compiled program, the values of variables are -written to a file for each sample. The constrained versions of the -variables are written, again in the order they are defined in the -\code{parameters} block. In order to do this, the transformed -parameter, model, and generated quantities statements must be -executed. +In the basic Stan compiled program, there is a file to which the +values of variables are written for each draw. The constrained +versions of the variables are written in the order they are +defined in the \code{parameters} block. In order to do this, the +transformed parameter, model, and generated quantities statements must +also be executed. \section{Program Block: \code{transformed parameters}} @@ -4638,7 +4692,7 @@ \section{Program Block: \code{transformed parameters}} variable declarations followed by statements. After the statements are executed, the constraints on the transformed parameters are validated. Any variable declared as a transformed parameter is part -of the output produced for samples. +of the output produced for draws. Any variable that is defined wholly in terms of data or transformed data should be declared and defined in the transformed data block. @@ -4696,9 +4750,9 @@ \section{Program Block: \code{generated quantities}} \item calculating log likelihoods, deviances, etc.\ for model comparison. 
\end{itemize}
%
-Forward samples, event probabilities and statistics may all be
+Parameter estimates, predictions, statistics, and event probabilities may all be
calculated directly using plug-in estimates.  Stan automatically
-provides full Bayesian inference by producing samples from the
+provides full Bayesian inference by producing draws from the
posterior distribution of any calculated event probabilities,
predictions, or statistics.  See \refchapter{bayesian} for more
information on Bayesian inference.
@@ -5312,7 +5366,7 @@ \subsection{Log Probability and Gradient Calculation}

\subsection{Metropolis Accept/Reject}

A standard Metropolis accept/reject step is required to retain detailed
-balance and ensure samples are marginally distributed according to the
+balance and ensure draws are marginally distributed according to the
probability function defined by the model.  This Metropolis adjustment
is based on comparing log probabilities, here defined by the
Hamiltonian, which is the sum of the potential (negative log
@@ -5350,7 +5404,7 @@ \section{Variational Inference}
Variational inference also runs similar to sampling. It begins by reading the
data and initializing the algorithm. The initial variational approximation is
a random draw from the standard normal distribution in the unconstrained
-(real-coordinate) space. Again, similar to sampling, it outputs samples from the
+(real-coordinate) space. Again, similar to sampling, it outputs draws from the
approximate posterior once the algorithm has decided that it has converged.
Thus, the tools we use for analyzing the result of Stan's sampling routines can
also be used for variational inference.
@@ -5368,8 +5422,8 @@ \section{Model Diagnostics}

\section{Output}

-For each final sample (not counting samples during warmup or samples
-that are thinned), there is an output stage of writing the samples.
+For each final draw (not counting draws during warmup or draws
+that are thinned), there is an output stage of writing the draw.
\subsection{Generated Quantities} diff --git a/src/docs/stan-reference/preface.tex b/src/docs/stan-reference/preface.tex index 6bc620bc5fa..85eb733a08d 100644 --- a/src/docs/stan-reference/preface.tex +++ b/src/docs/stan-reference/preface.tex @@ -28,8 +28,8 @@ \section*{Why Stan?} We briefly considered trying to tune proposals for a random-walk Metropolis-Hastings sampler, but that seemed too problem specific and not even necessarily possible without some kind of adaptation rather -than tuning of the proposals. - +than tuning of the proposals. + \section*{The Path to Stan} @@ -53,7 +53,7 @@ \section*{The Path to Stan} {\sc coin-or} toolkit. But neither package supported very many special functions (e.g., probability functions, log gamma, inverse logit) or linear algebra operations (e.g., Cholesky decomposition) and were not -easily and modularly extensible. +easily and modularly extensible. So we built our own reverse-mode algorithmic differentiation package. But once we'd built our own reverse-mode algorithmic differentiation @@ -62,7 +62,7 @@ \section*{The Path to Stan} templated on all the arguments. We only needed algorithmic differentiation variables for parameters, not data or transformed data, and promotion is very inefficient in both time and memory. So -we wrote our own fully templated probability functions. +we wrote our own fully templated probability functions. Next, we integrated the Eigen \Cpp package for matrix operations and linear algebra functions. Eigen makes extensive use of expression @@ -72,12 +72,12 @@ \section*{The Path to Stan} libraries --- it doesn't support mixed operations of algorithmic differentiation variables and primitives like \code{double}. -At this point (Spring 2011), we were happily fitting models coded -directly in \Cpp on top of the pre-release versions of the Stan API. -Seeing how well this all worked, we set our sights on the generality -and ease of use of \BUGS. 
So we designed a modeling language in which -statisticians could write their models in familiar notation that could -be transformed to efficient \Cpp code and then compiled into an +At this point (Spring 2011), we were happily fitting models coded +directly in \Cpp on top of the pre-release versions of the Stan API. +Seeing how well this all worked, we set our sights on the generality +and ease of use of \BUGS. So we designed a modeling language in which +statisticians could write their models in familiar notation that could +be transformed to efficient \Cpp code and then compiled into an efficient executable program. It turned out that our modeling language was a bit more general than we'd anticipated, and we had an imperative probabilistic programming language on our hands.% @@ -86,7 +86,7 @@ \section*{The Path to Stan} probabilistic programming languages for specifying a directed graphical model. In these languages, stochastic and deterministic (poor choice of name) nodes may represent random quantities.} - + The next problem we ran into as we started implementing richer models is variables with constrained support (e.g., simplexes and covariance matrices). Although it is possible to implement \HMC with bouncing @@ -124,7 +124,7 @@ \section*{The Path to Stan} because the conjugate priors and lack of posterior correlations make it an ideal candidate for efficient Gibbs sampling. But we thought the efficiency of compilation might compensate for the lack of ideal -fit to the problem. +fit to the problem. We realized we were doing redundant calculations, so we wrote a vectorized form of the normal distribution for multiple variates with @@ -176,7 +176,7 @@ \section*{Stan 2} while the outside hasn't changed dramatically in Stan 2, the inside is almost totally different in terms of how the HMC samplers are organized, how the output is analyzed, how the mathematics library is -organized, etc. +organized, etc. 
We've also improved our original simple optimization algorithms and now use L-BFGS (a limited memory quasi-Newton method that uses @@ -218,40 +218,18 @@ \section*{Stan 2} scalably, and reliably. -\section*{Stan's Future} +\section*{Stan's Roadmap} -We're not done. There's still an enormous amount of work to do to -improve Stan. Some older, higher-level goals are in a standalone -to-do list: +There's still an enormous amount of work to do to improve Stan. +Rather than a step-by-step roadmap, we have collected a list of +high-level development goals on a wiki page: % \begin{quote} \url{https://github.com/stan-dev/stan/wiki/Longer-Term-To-Do-List} \end{quote} - % -We are gradually weaning ourselves off of the to-do list in favor of -the GitHub issue tracker (see the next section for a link). - -Some major features are on our short-term horizon: Riemannian manifold -Hamiltonian Monte Carlo (RHMC), transformed Laplace approximations -with uncertainty quantification for maximum likelihood estimation, -marginal maximum likelihood estimation, data-parallel expectation -propagation, and streaming (stochastic) variational inference. The -latter has been prototyped and described in papers. - -We will also continue to work on improving numerical stability and -efficiency throughout. In addition, we plan to revise the interfaces -to make them easier to understand and more flexible to use (a -difficult pair of goals to balance). - -Later in the Stan 2 release cycle (Stan 2.7), we added variational -inference to Stan's sampling and optimization routines, with the -promise of approximate Bayesian inference at much larger scales than -is possible with Monte Carlo methods. The future plans involve -extending to a stochastic data-streaming implementation for very -large-scale data problems. - - +Issues start here before they have complete functional specifications +and move to the GitHub issue tracker. 
\section*{You Can Help} @@ -259,10 +237,10 @@ \section*{You Can Help} suggestions for Stan. We're especially interested in hearing about models you've fit or had problems fitting with Stan. The best way to communicate with the Stan team about user issues is through the -following user's group. +Stan forums: % \begin{quote} -\url{http://groups.google.com/group/stan-users} +\url{http://discourse.mc-stan.org} \end{quote} % For reporting bugs or requesting features, Stan's issue tracker is at @@ -278,15 +256,10 @@ \section*{You Can Help} \footnote{See \refappendix{licensing} for more information on Stan's licenses and the licenses of the software on which it depends.} % -is that we love to collaborate. We're interested in hearing -from you if you'd like to volunteer to get involved on the development -side. We have all kinds of projects big and small that we haven't had -time to code ourselves. For developer's issues, we have a separate -group. -% -\begin{quote} -\url{http://groups.google.com/group/stan-dev} -\end{quote} +is that the Stan developers like to collaborate. We're interested in +hearing from you if you'd like to volunteer to get involved on the +development side. We have all kinds of projects big and small that we +haven't had time to code ourselves. To contact the project developers off the mailing lists, send email to \begin{quote} diff --git a/src/docs/stan-reference/process.tex b/src/docs/stan-reference/process.tex index 473e8a57d22..91ccfffbd4e 100644 --- a/src/docs/stan-reference/process.tex +++ b/src/docs/stan-reference/process.tex @@ -314,21 +314,18 @@ \section{Operational Overview} both academic institutions and industrial labs. Communication among team members takes place in several venues. 
Most -discussions take place openly on the Stan developers group, often -initiated by discussions in the Stan users group.% +discussions take place openly on the Stan forum.% % -\footnote{The groups are hosted by Google Groups, with information on - reading and posting available from \url{http://mc-stan.org/groups.html}.} +\footnote{The groups are hosted at \url{http://discourse.mc-stan.org}.} % -The users and developers groups are archived and may be read by -anyone at any time. Communication that's not suitable for the public, -such as grant funding, is carried out on a private group restricted to -the core Stan developers. Further issue-specific discussion takes -place concurrently with development and source control.% +The Stan forums are archived and may be read by anyone at any +time. Communication that's not suitable for the public, such as grant +funding, is carried out on a private group restricted to the core Stan +developers. Further issue-specific discussion takes place concurrently +with development and source control.% % \footnote{The issue tracker for feature requests and bug reports is - hosted by GitHub. Full information on reading and making issue - requests is available from \url{http://mc-stan.org/issues.html}.} + hosted by GitHub.} % Bigger design issues and discussions that should be preserved take place on the project Wiki.% @@ -339,8 +336,8 @@ \section{Operational Overview} The developers host a weekly videoconference during which project issues are discussed and prioritized.% % -\footnote{The weekly meetings are hosted on Google+. They are not - recorded or stored.} +\footnote{The weekly meetings are hosted on Google Hangouts. 
They are + not recorded or stored.} % The developers also meet informally at their places of employment (e.g., Columbia University) or at conferences and workshops when @@ -631,28 +628,27 @@ \section{Maintenance, Support, and Retirement} There is extensive documentation in the form of manuals available for the Stan language and algorithms -(\url{http://mc-stan.org/manual.html}), as well as each of the -interfaces, CmdStan (\url{http://mc-stan.org/cmdstan.html}), PyStan -(\url{http://mc-stan.org/pystan.html}), and RStan -(\url{http://mc-stan.org/cmdstan.html}). There is also an extensive -suite of example models (\url{http://mc-stan.org/documentation}) which -may be used directly or modified by users. There is also a fairly -extensive set of Wiki pages geared toward developers +(\url{http://mc-stan.org/users/documentation}), as well as the +interfaces for the command line, R, Python, MATLAB, Mathematica, +Stata, and Julia (\url{http://mc-stan.org/users/interfaces}). + +There is also an extensive suite of tutorials and example models +(\url{http://mc-stan.org/users/documentation}) which may be used directly or +modified by users. There is also a fairly extensive set of Wiki pages +geared toward developers (\url{https://github.com/stan-dev/stan/wiki}). Issue trackers for reporting bugs and requesting features are -available online for Stan C++ -(\url{https://github.com/stan-dev/stan/issues}), CmdStan -(\url{https://github.com/stan-dev/cmdstan/issues}), RStan -(\url{https://github.com/stan-dev/rstan/issues}), and PyStan -(\url{https://github.com/stan-dev/pystan/issues}). - -There is Stan users group and also a group for Stan developers that -can be accessed online, in daily news digest form, or as an e-mail -list (see \url{http://mc-stan.org/groups.html}). The users group is -where users can request support for installation issues, modeling -issues, or performance/accuracy issues. These lists all come with -built-in search facilities through their host, Google Groups. 
+available online for Stan's C++ math library and core library, as well
+as for the interfaces (all available from the organization page,
+\url{https://github.com/stan-dev}).
+
+There is a Stan forum for both users and developers (see
+\url{http://discourse.mc-stan.org}).  The users topics allow users to
+request support for installation issues, modeling issues, or
+performance/accuracy issues.  These lists all come with built-in
+search facilities through their host or simply through top-level
+web searches in the search engine of your choice.
 
 A number of books provide introductions to Stan, including {\it
   Bayesian Data Analysis, 3rd Edition} \citep{GelmanEtAl:2013} and
@@ -812,10 +808,10 @@ \section{Contributing a Stan Module}
 Stan developers either through one of the following.
 %
 \begin{itemize}
-\item \code{stan-users} mailing list:
+\item Stan forums:
 \ \\
-\url{https://groups.google.com/forum/?fromgroups#!forum/stan-users}
-\item Stan e-mail:
+\url{http://discourse.mc-stan.org}
+\item Stan e-mail: 
 \ \\
 \href{mailto:mc.stanislaw@gmail.com}{\tt mc.stanislaw@gmail.com}
 \end{itemize}
diff --git a/src/docs/stan-reference/programming.tex b/src/docs/stan-reference/programming.tex
index cf1d32415bf..ed4e766fbdd 100644
--- a/src/docs/stan-reference/programming.tex
+++ b/src/docs/stan-reference/programming.tex
@@ -94,11 +94,8 @@ \subsubsection{Beta Distribution}
 real lambda;
 ...
 transformed parameters {
-  real alpha;
-  real beta;
-  ...
-  alpha = lambda * phi;
-  beta = lambda * (1 - phi);
+  real alpha = lambda * phi;
+  real beta = lambda * (1 - phi);
 ...
 model {
 phi ~ beta(1, 1); // uniform on phi, could drop
@@ -117,10 +114,8 @@ \subsubsection{Beta Distribution}
 %
 \begin{stancode}
 model {
-  real alpha;
-  real beta;
-  alpha = lambda * phi;
-  beta = lambda * (1 - phi);
+  real alpha = lambda * phi;
+  real beta = lambda * (1 - phi);
 ...
 for (n in 1:N)
 theta[n] ~ beta(alpha, beta);
@@ -182,7 +177,7 @@ \subsubsection{Dirichlet Priors}
 simplex[K] theta[N];
 ...
transformed parameters { - vector[K] alpha = kappa * theta; + vector[K] alpha = kappa * phi; ... } model { @@ -246,7 +241,7 @@ \subsection{Transforming Unconstrained Priors: Probit and Logit} real theta = inv_logit(theta_raw); ... model { - theta_unc ~ logistic(mu, sigma); + theta_raw ~ logistic(mu, sigma); ... y ~ foo(theta); ... @@ -307,6 +302,48 @@ \subsection{Transforming Unconstrained Priors: Probit and Logit} % where either or both of \code{mu} and \code{sigma} can be vectors. + +% \subsection{Transforming multinomial to Poisson} + +% Lancaster \cite{lancaster:2000} provides an efficient +% reparameterization of the multinomial distribution in terms of the +% Poisson distribution. Suppose we have a multinomial such as the +% following: + +% \begin{stancode} +% data { +% int K; // number of categories +% int x[N, K]; // outcomes +% \end{stancode} + + +% This reparameterization is critical for +% applying multinomials efficiently in Stan. + +% \begin{stancode} +% data { +% int K; // number of categories +% ... +% } +% parameters { +% vector[K] log_alpha_star; +% ... 
+% } +% transformed parameters { +% vecotr[K] log_alpha; +% for (g in 1:groups) +% log_alpha[g] = log_alphastar[g] - log_sum_exp(x[g] * beta); +% } +% model { +% log_alpha_star ~ ...; // some prior +% x ~ poisson_log(log_alpha); +% } +% \end{stancode} + + + + + \section{Changes of Variables} Changes of variables are applied when the transformation of a @@ -610,7 +647,7 @@ \chapter{Custom Probability Functions}% \section{Examples} -\subsection{Triangle Distribution} +\subsection{Triangle distribution} A simple example is the triangle distribution, whose density is shaped like an isosceles triangle with corners at @@ -639,7 +676,7 @@ \subsection{Triangle Distribution} $\distro{Triangle}(-1,1)$ for sampling.% % \footnote{The program is available in the Stan example model repository; -see \url{http://mc-stan.org/documentation}.} +see \url{http://mc-stan.org/users/documentation}.} % \begin{stancode} parameters { @@ -689,7 +726,7 @@ \subsection{Triangle Distribution} \refchapter{variable-transforms} for more information on the transforms used by Stan.} -\subsection{Exponential Distribution} +\subsection{Exponential distribution} If Stan didn't happen to include the exponential distribution, it could be coded directly using the following assignment statement, @@ -724,6 +761,46 @@ \subsection{Exponential Distribution} constants, the sampling statement will drop both terms (but still check for out-of-domain errors on the inputs). +\subsection{Bivariate normal cumulative distribution function} + +For another example of user-defined functions, consider the following +definition of the bivariate normal cumulative distribution function +(CDF) with location zero, unit variance, and correlation \code{rho}. 
+That is, it computes
+\[
+\mbox{binormal\_cdf}(z_1, z_2, \rho) = \mbox{Pr}[Z_1 \leq z_1 \mbox{ and } Z_2 \leq z_2]
+\]
+where the random 2-vector $Z$ has the distribution
+\[
+Z \sim \distro{MultiNormal}\left(
+\begin{bmatrix}
+0 \\
+0
+\end{bmatrix}, \
+\begin{bmatrix}
+1 & \rho
+\\
+\rho & 1
+\end{bmatrix}
+\right).
+\]
+%
+The following Stan program implements this function,
+%
+\begin{stancode}
+real binormal_cdf(real z1, real z2, real rho) {
+  if (z1 != 0 || z2 != 0) {
+    real denom = fabs(rho) < 1.0 ? sqrt((1 + rho) * (1 - rho)) : not_a_number();
+    real a1 = (z2 / z1 - rho) / denom;
+    real a2 = (z1 / z2 - rho) / denom;
+    real product = z1 * z2;
+    real delta = product < 0 || (product == 0 && (z1 + z2) < 0);
+    return 0.5 * (Phi(z1) + Phi(z2) - delta) - owens_t(z1, a1) - owens_t(z2, a2);
+  }
+  return 0.25 + asin(rho) / (2 * pi());
+}
+\end{stancode}
+
 
 \chapter{User-Defined Functions}\label{functions-programming.chapter}
 
@@ -852,7 +929,7 @@ \subsubsection{Catching errors}
 In generated quantities, rejections cause execution to halt because
 there is no way to recover and generate the remaining parameters, so
 extra care should be taken in calling functions in the generated
-quantities block.  See Section~\refsection{reject-statements} for full
+quantities block.  See \refsection{reject-statements} for full
 details about the reject statement.
 
 \subsection{Type Declarations for Functions}
@@ -1022,7 +1099,7 @@ \subsection{Data-only Function Arguments}
 
 This qualifier should be used when writing functions that call Stan's
 built-in ordinary differential equation (ODE) solvers or using Stan's
-algebraic solver.
+algebraic solver. 
 These solvers have strictly specified signatures
 where some arguments of are data only expressions.  (See
 \refchapter{ode-solver},
@@ -1420,7 +1497,7 @@ \subsubsection{Redundant Intercepts}
 %
 The consequence is that an improper uniform prior $p(\mu,\sigma)
 \propto 1$ leads to an improper posterior.
This impropriety arises
-because the neighborhoods around $\lambda_1 + q, \lambda_1 - q$ have
+because the neighborhoods around $\lambda_1 + q, \lambda_2 - q$ have
 the same mass no matter what $q$ is.  Therefore, a sampler would need
 to spend as much time in the neighborhood of $\lambda_1=1000000000$
 and $\lambda_2=-1000000000$ as it does in the neighborhood of
@@ -2462,6 +2539,43 @@ \section{Converting among Matrix, Vector, and Array Types}
 reshaping operations involving multiple indexing and range indexing.
 
 
+\section{Aliasing in Stan Containers}
+
+Stan expressions are all evaluated before assignment happens, so there
+is no danger of so-called aliasing in array, vector, or matrix
+operations.  Contrast the behavior of the assignments to \code{u} and
+\code{x} in the following trivial Stan program.
+
+Both variables start with the same values; \code{u} is then assigned
+entry by entry in a loop, whereas \code{x} is assigned all at once
+with a compound slicing assignment.
+
+\begin{stancode}
+transformed data {
+  vector[4] x = [ 1, 2, 3, 4 ]';
+  vector[4] u = [ 1, 2, 3, 4 ]';
+
+  for (t in 2:4)
+    u[t] = u[t - 1] * 3;
+
+  x[2:4] = x[1:3] * 3;
+
+  print("u = ", u);
+  print("x = ", x);
+}
+\end{stancode}
+%
+The output it produces is as follows.
+%
+\begin{stancode}
+u = [1,3,9,27]
+x = [1,3,6,9]
+\end{stancode}
+%
+In the loop version assigning to \code{u}, the values are updated
+before being used to define subsequent values; in the sliced
+expression assigning to \code{x}, the entire right-hand side is
+evaluated before assigning to the left-hand side.
 
 \chapter{Multiple Indexing and Range Indexing}\label{multi-indexing.chapter}
 
@@ -2846,7 +2960,7 @@ \section{Matrices with Parameters and Constants}
 
 \begin{stancode}
 transformed data {
-  int idxs[7,
+  int idxs[7, 2]
    = { {1, 1},
        {2, 1}, {2, 2}, {2, 3},
        {3, 1}, {3, 2}, {3, 3} };
@@ -3075,6 +3189,42 @@ \section{Well-Specified Models}
 sigma)}, where \code{mu} and \code{sigma} are new parameters, with
 their own hyperpriors.
+\section{Avoiding Validation}
+
+Stan validates all of its data structure constraints.  For example,
+consider a transformed parameter defined to be a covariance matrix and
+then used as a covariance parameter in the model block.
+%
+\begin{stancode}
+transformed parameters {
+  cov_matrix[K] Sigma;
+  ...
+} // first validation
+model {
+  y ~ multi_normal(mu, Sigma); // second validation
+  ...
+\end{stancode}
+%
+Because \code{Sigma} is declared to be a covariance matrix, it will be
+factored at the end of the transformed parameter block to ensure that
+it is positive definite.  The multivariate normal log density function
+also validates that \code{Sigma} is positive definite.  This test is
+expensive, having cubic run time (i.e., $\mathcal{O}(N^3)$ for
+$N \times N$ matrices), so it should not be done twice.
+
+The test may be avoided by simply declaring \code{Sigma} to be a simple
+unconstrained matrix.
+%
+\begin{stancode}
+transformed parameters {
+  matrix[K, K] Sigma;
+  ...
+model {
+  y ~ multi_normal(mu, Sigma); // only validation
+\end{stancode}
+%
+Now the only validation is carried out by the multivariate normal.
+
 \section{Reparameterization}\label{reparameterization.section}
 
diff --git a/src/docs/stan-reference/programs/mining-disasters/changepoint-missing.stan b/src/docs/stan-reference/programs/mining-disasters/changepoint-missing.stan
new file mode 100644
index 00000000000..2846818c592
--- /dev/null
+++ b/src/docs/stan-reference/programs/mining-disasters/changepoint-missing.stan
@@ -0,0 +1,55 @@
+// this is a trivial missing data problem because
+// there are only two possible cases for missingness,
+// and in both cases, the posterior predictive is
+// analytic given e and l
+
+// but it does make sense to do the model with not every
+// year observed, assuming unobserved years are missing at
+// random
+
+data {
+  int t_l;   // earliest time
+  int t_h;   // latest time
+
+  real r_e;  // before rate prior
+  real r_l;  // after rate prior
+
+  int N_obs;           // num observed
+  int T_obs[N_obs];    // times
+  int D_obs[N_obs];    // disasters
+
+  int N_miss;          // num missing
+  int T_miss[N_miss];  // missing times
+}
+transformed data {
+  real log_unif;
+  int S;
+  S <- t_h - t_l + 1;
+  log_unif <- -log(S);  // log p(s) = log Uniform(s | t_l, t_h)
+}
+parameters {
+  real<lower=0> e;  // before rate
+  real<lower=0> l;  // after rate
+}
+transformed parameters {
+  real lp[S];
+  lp <- rep_array(log_unif, S);
+  for (s in t_l:t_h)
+    for (n in 1:N_obs)
+      lp[s - t_l + 1] <- lp[s - t_l + 1] + poisson_log(D_obs[n], if_else(T_obs[n] < s, e, l));
+}
+model {
+  // prior
+  e ~ exponential(r_e);
+  l ~ exponential(r_l);
+
+  // likelihood
+  increment_log_prob(log_sum_exp(lp));
+}
+generated quantities {
+  int s;
+  int D_miss[N_miss];
+  s <- t_l - 1 + categorical_rng(softmax(to_vector(lp)));
+  for (n in 1:N_miss)
+    D_miss[n] <- poisson_rng(if_else(T_miss[n] < s, e, l));
+}
diff --git a/src/docs/stan-reference/programs/mining-disasters/changepoint.stan b/src/docs/stan-reference/programs/mining-disasters/changepoint.stan
new file mode 100644
index 00000000000..f3c615f4f67
--- /dev/null
+++ b/src/docs/stan-reference/programs/mining-disasters/changepoint.stan
@@ -0,0 +1,73 @@
+/**
+ * Changepoint model for UK coal mining disasters
between 1851 and + * 1962. + * + * Data is a series D[t] for t in 1:T of yearly disaster counts for + * year t. + * + * Priors: + * e ~ exponential(r_e) + * l ~ exponential(r_e) + * + * Latent changepoint: + * s ~ uniform(1,T); + * + * Likelihood of disasters: + * D[t] ~ Poisson(t < s ? e : l) + * + * Estimand of interest is s, the point at which the disaster + * rate changed. + * + * Marginalization for Stan implementation: + * + * p(e,l|D) = exponential(e|r_e) * exponential(l|r_l) + * * SUM_s Uniform(s|1,T) * PROD_t Poisson(D[t] | t < s ? e : l) + * + * where + * + * log SUM_s Uniform(s|1,T) * PROD_t Poisson(D[t] | t < s ? e : l) + * = LOG_SUM_EXP_s ( log Uniform(s|1,T) + * + SUM_t log Poisson(D[t] | t < s ? e : l) ) + * + * Model from PyMC 2.3 documentation + * http://pymc-devs.github.io/pymc/index.html + * http://pymc-devs.github.io/pymc/tutorial.html#an-example-statistical-model + * + * Data from: + * R.G. Jarrett. A note on the intervals between coal mining + * disasters. Biometrika, 66:191–193, 1979. 
+ */ +data { + real r_e; // before change disaster rate prior + real r_l; // after change disaster rate prior + + int T; // years + int D[T]; // disasters per year +} +transformed data { + real log_unif; + log_unif <- -log(T); // log p(s) +} +parameters { + real e; // before change disaster rate + real l; // after change disaster rate +} +transformed parameters { + vector[T] lp; + lp <- rep_vector(log_unif, T); + for (s in 1:T) + for (t in 1:T) + lp[s] <- lp[s] + poisson_log(D[t], if_else(t < s, e, l)); +} +model { + // prior + e ~ exponential(r_e); + l ~ exponential(r_l); + + // likelihood + increment_log_prob(log_sum_exp(lp)); +} +generated quantities { + int s; + s <- categorical_rng(softmax(lp)); +} diff --git a/src/docs/stan-reference/programs/mining-disasters/data.R b/src/docs/stan-reference/programs/mining-disasters/data.R new file mode 100644 index 00000000000..0ab2ca66a5f --- /dev/null +++ b/src/docs/stan-reference/programs/mining-disasters/data.R @@ -0,0 +1,10 @@ +r_e <- 1 +r_l <- 1 +T <- 111 +D <- c(4, 5, 4, 0, 1, 4, 3, 4, 0, 6, 3, 3, 4, 0, 2, 6, + 3, 3, 5, 4, 5, 3, 1, 4, 4, 1, 5, 5, 3, 4, 2, 5, + 2, 2, 3, 4, 2, 1, 3, 2, 2, 1, 1, 1, 1, 3, 0, 0, + 1, 0, 1, 1, 0, 0, 3, 1, 0, 3, 2, 2, 0, 1, 1, 1, + 0, 1, 0, 1, 0, 0, 0, 2, 1, 0, 0, 0, 1, 1, 0, 2, + 3, 3, 1, 1, 2, 1, 1, 1, 1, 2, 4, 2, 0, 0, 1, 4, + 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1) diff --git a/src/docs/stan-reference/programs/mining-disasters/fit.R b/src/docs/stan-reference/programs/mining-disasters/fit.R new file mode 100644 index 00000000000..8e2d146e8d9 --- /dev/null +++ b/src/docs/stan-reference/programs/mining-disasters/fit.R @@ -0,0 +1,34 @@ +source("data.R"); +fit <- stan("changepoint.stan", data=c("r_e", "r_l", "T", "D")); +fit_ss <- extract(fit); +log_Pr_s <- rep(0,T); +for (t in 1:T) + log_Pr_s[t] <- mean(fit_ss$lp[,t]); +print(log_Pr_s); + +log <- softmax <- function(x) { + z <- exp(x - max(x)); + return(log(z / sum(z))); +} + + +library("ggplot2"); 
+qplot(1:T, log_Pr_s - max(log_Pr_s) - base::log(sum(exp(log_Pr_s - max(log_Pr_s))))) +
+  xlab("year") + ylab("log p(change at year)") +
+  scale_x_continuous(breaks=c(1875,1900,1925,1950)-1850,
+                     labels=c("1875","1900","1925","1950"))
+
+ss_s <- fit_ss$s
+earliest <- min(ss_s);
+latest <- max(ss_s);
+frequency <- rep(0,latest - earliest + 1);
+for (n in 1:length(ss_s)) {
+  idx <- ss_s[n] - earliest + 1;
+  frequency[idx] <- frequency[idx] + 1;
+}
+year <- 1850 + (earliest:latest);
+
+ggplot(data = data.frame(year=year, frequency=frequency),
+       aes(x=year,y=frequency)) +
+  geom_bar(stat="identity", fill="white",color="black") +
+  xlab("year") + ylab("frequency in 4000 draws")