\documentclass[10pt]{article}
\usepackage[american]{babel}
\usepackage[utf8]{inputenc}
%\newcommand{\R}{{\sf R}}
\newcommand{\Splus}{{\sf S-PLUS}}
\newcommand{\fixme}[1]{\textbf{FIXME: #1}}
%\newcommand{\fixme}[1]{}
\usepackage{url}
\usepackage{alltt}
\usepackage{fancyvrb} % with VerbatimFootnotes, allow verb in footnotes
\usepackage{listings}
\usepackage{verbatim}
\usepackage{hyperref}
\usepackage{natbib}
\usepackage{graphicx}
\DeclareGraphicsExtensions{.pdf}

\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{bm}

\newcommand{\code}[1]{{\tt #1}}
\newcommand{\ie}{i.e.\ }
\newcommand{\eg}{e.g.\ }
\newcommand{\TLP}{theta logistic population\ }

\newcommand{\N}{\mathcal{N}}
\newcommand{\bt}{\bm{\theta}} % Bold theta
\newcommand{\mb}[1]{\mathbf{#1}}
\newcommand{\X}{\mb{X}}
\newcommand{\x}{\mb{x}}
\renewcommand{\u}{\mb{u}}
\newcommand{\R}{\mb{R}}
\newcommand{\Yy}{\mb{Y}}
\newcommand{\Y}{\mathcal{Y}}
\newcommand{\Xx}{\mathcal{X}}
\newcommand{\p}{\mb{p}}
\newcommand{\Pt}{\mb{P}_t}
\newcommand{\Ptm}{\mb{P}_{t-1}}
\renewcommand{\L}{\mb{L}}
\newcommand{\T}{\bm{\lambda}}


\bibliographystyle{chicago}

\title{Theta logistic population model writeup}
\author{Casper W. Berg}
\date{\today}
\begin{document}
\maketitle

\SweaveOpts{keep.source=TRUE}
<<echo=FALSE>>=
options(continue=" ")
if (require(cacheSweave,quietly=TRUE)) {
  setCacheDir("theta_cache")
}
library(rbenchmark)
library(ggplot2)
library(grid) ## for units()
@ 

\section{Summary}

\begin{itemize}
\item ADMB is by far the fastest method, but it takes a little bit more coding than doing it in the BUGS language. 
\item HMM can be implemented in R, but it takes some effort and is quite slow.  
\item JAGS is slower but not too bad for this relatively simple problem. 
%\item JAGS produces quite biased parameter estimates with correspondingly wider credible intervals compared to those from ADMB and the HMM method.
\item All methods produce quite similar results with respect to parameter estimates.
\item JAGS produced much wider credible intervals for the estimated parameters than the corresponding confidence intervals for ADMB and HMM.  
\item For this application, choosing the mean of the MCMC chains from the BUGS runs instead of the mode leads to more bias in the parameter estimates.   
\item Coverage fractions are close to 0.95 for all the methods.
\end{itemize}

\section{Introduction}
This example is based on \cite{Pedersen2011}, which contains much more details than this shorter writeup.

The example is a theta logistic population model, which is a nonlinear state-space model. The log-transformed population size is modelled as a nonlinear function of its previous size in the following way:

\begin{align}
\label{eq:system} X_t &= X_{t-1} + r_0\left( 1- \left(\frac{\exp(X_{t-1})}{K}\right)^\theta \right) + e_t, \\
\label{eq:obs} Y_t &= X_t + u_t, 
\end{align}
where $e_t \sim N(0,Q)$ and $u_t \sim N(0,R)$. 

This example does not contain any real data, so instead we will simulate 200 timesteps of the population with an initial value substantially below the carrying capacity, such that in the beginning the population will grow, and in the end of the time series the population size will fluctuate around the carrying capacity. If we had only observed the population fluctuating around the carrying capacity it might have been impossible to identify all the parameters in the model, e.g. $\theta$, and a simpler model should be chosen, for instance by setting $\theta=1$. 

\section{Basics}
A state-space model describes the dynamics of a latent state ($\mb{X}_t$) and how data ($Y_t$) relate to this state.
An important feature of SSMs is their ability to model random variations in the latent state and in data. 
For $t\in\{1,\ldots,N\}$ the general system and observation equations of the SSM are respectively $\X_t = g(t,\X_{t-1},\mb{e}_t)$, and  $\Yy_t = h(t,\X_t,\mb{u}_t)$, where $\mb{e}_t \sim N(\mb{0},\mb{Q}_t)$ is the system error and $u_t \sim N(0,\R_t)$ is the observation error. 
Here, ``$\sim N(\cdot)$'' means Gaussian distributed. 
Because of the possible nonlinearity of $g$ and $h$, advanced filtering and smoothing methods must be employed to gain meaningful estimates of $\X_t$. 
In this respect, the extended Kalman filter, the unscented Kalman filter, and Bayesian filtering \eg using Markov chain Monte Carlo (MCMC) sampling or BUGS are common approaches. 
Alternative methods for nonlinear state estimation are hidden Markov models \citep[HMMs,][]{Zucchini2009} and mixed effects models using the software AD Model Builder (ADMB).

All parameters in this example except the carrying capacity, $K$, has been log transformed to ensure positive values. The carrying capacity might also have been log transformed as this should obviously also be positive, but since the chosen true value is far from zero we judged that log-transforming $K$ was not critical.
Besides being convenient for ensuring positive valus, log transforming parameters also changes the shape of the likelihood surface. If standard confidence intervals based on quadratic approximations of the likelihood surface are used, it is of course important, that this quadratic approximation is reasonable, which it might often not be when dealing with nonlinear problems. In this problem, $\theta$ and $r_0$ are known to be highly correlated, and with a boomerang-shaped likelihood surface when not log transformed \citep{Polansky2009}, which is undesirable.
 
\section{ADMB}
The ADMB solution to this problem is the same basic template used for any type of state-space model: The states are declared as random effects, and the system and observation equations are put into \texttt{SEPARABLE\_FUNCTION}s in order gain huge speed improvements. ADMB will then be able to exploit the fact, that the covariance matrix for all the states in a state-space model is a banded matrix, and skip work on all the zeroes.

The data section of the ADMB solution contains the initial parameter values (this is just an alternative to using a \texttt{.pin} file), the number of data points $N$, and the observations $y$. 

The parameter section contains all the fixed effects (starting with \texttt{init}), the random effects, which are the latent states (population size), and finally the objective function which is the joint negative log-likelihood.

The procedure section is quite simple for this problem, as all the likelihood contribution is done within the \texttt{SEPARABLE\_FUNCTION}s. The only thing we are doing outside the \texttt{SEPARABLE\_FUNCTION}s is to set the values of the \texttt{sdreport\_numbers}. Notice, that we pass the log-transformed values (i.e. the parameters to be estimated) to the \texttt{SEPARABLE\_FUNCTION}s, and not the \texttt{sdreport\_numbers}. This is because all computations used in the likelihood must be performed inside the \texttt{SEPARABLE\_FUNCTION}s and NOT in the procedure section. 

\VerbatimInput[frame=single,numbers=left,samepage=false,fontsize=\small]{thetaPretty.tpl}

\section{BUGS}
In \cite{Pedersen2011} we used OpenBUGS to run our model, which is possible from within R using the \texttt{rbugs} package. In this example we changed to JAGS and the \texttt{R2Jags} package, as OpenBUGS apparantly is problematic to run using MacOS, and we wanted platform independent solutions if possible. The BUGS code was able to run unchanged using JAGS, and there were only minor differences between \texttt{rbugs} and \texttt{R2Jags}.

As to keep the results comparable between the frequentist and Bayesian methods, we wanted vague (or uninformative) priors on our parameters, so we assigned uniform priors with wide support to all the fixed parameters, except the variance parameters for which we tried two different priors.
It is common to assign vague inverse-gamma distributed priors to variance parameters \citep{Spiegelhalter2003,Lambert2005}. \cite{Gelman2006}, however, recommends using a uniform prior on the log-transformed standard deviation.
Therefore, as in \cite{Pedersen2011}, to asses the sensitivity of the estimation results to the choice of prior we perform BUGS estimation using both types of priors.

The OpenBUGS versions used in \cite{Pedersen2011} showed significant differences in running time, with the inverse-gamma distribution on the variances being six times faster than using a uniform distribution on the log-tranformed standard deviations. 
JAGS however showed no noticable difference in running speed using the two different priors.

\VerbatimInput[frame=single,numbers=left,samepage=true,fontsize=\small]{../BUGS/bugmodel-GammaPrior.txt}

\VerbatimInput[frame=single,numbers=left,samepage=true,fontsize=\small]{../BUGS/bugmodel-UPrior.txt}


\section{R}
There is - to my knowledge - no easy way to do nonlinear state-space models in R. In \cite{Pedersen2011} it is described how it can be done by discretizing the continuous state-space and reforumulating the state-space model as a hidden Markov model, and Matlab code was provided in the online supplementary material.
I translated this code into R, and it is available on the svn-repository on the NCEAS website. I will not go into details about the R implementation in this writeup.  


\section{Simulation results}
The simulation results show, that ADMB is many orders of magnitude faster than JAGS and the HMM implementation in R (\ref{fig:sim_timings}), though it should be noted, that the two latter methods might be able to produce similar results in shorter time, by choosing shorter mcmc-chains for JAGS and a coarser grid for the HMM solution.

From the distribution of parameter estimates (figure \ref{fig:sim_params}) 
we must note, that the estimates of $K$,$\log r_0$, and $\theta$ seems to be biased -- $K$ and $\log r_0$ positively and $\theta$ negatively.
This is not too surprising considering that these parameters are quite correlated (this can be confirmed for instance by inspecting the matrix of parameter correlations produced by ADMB in the \textrm{.cor} file), and that the likelihood surface may be ugly in the sense that it is not well approximated by a quadratic form as discussed in \cite{Pedersen2011}.

All of the methods seem to perform equally well with respect to bias in parameter estimates.

%The problem is most pronounced for the JAGS solutions, whose estimators seem to behave quite differently than those produced by ADMB and the HMM method. The bias and variance of the JAGS estimators are considerably larger.
This is also evident in figure \ref{fig:sim_fullparams}, which show both point estimates and CIs for each simulation and method.

Figure 4 summarises the root mean square error (rmse), length of the confidence intervals, and coverage fractions (the fraction of simulations where the CI contains the true value). It shows, that ADMB and the HMM method gives roughly the same results, whereas JAGS has a greater error and wider CIs. The coverage fractions are not too different -- all of them are relatively close to the expected value of 0.95. The wider CIs produced by JAGS is consistent with the results obtained in \cite{Pedersen2011}.

All the comparisons discussed so far has been using the mode of the MCMC chains as the parameter estimate for the BUGS methods.
For posterior distributions obtained from MCMC that are approximately normal it does of course not matter whether the mean, mode or median is chosen as they are identical for the normal distribution.
In this case however, the posterior distributions are not close to normal, and so the choice of estimator is important. Figure \ref{fig:sim_params_mean} is identical to figure \ref{fig:sim_params} except that the mean of the chains instead of the mode was chosen for the BUGS runs. Choosing the mean in this case clearly results in more biased estimates of the parameters.


<<loadsims,echo=FALSE>>=
load("../SIMS/allsim.RData")
source("../../TOOLS/compile-funs.R")
source("../../TOOLS/sim_plotfuns.R")

## Alternative params plot with mode instead of mean for BUGS runs
simsAlt=sims;
for(i in 1:length(sims)){
  for(name in names(sims[[i]]$BUGS)){
    simsAlt[[i]]$BUGS[[name]]$coef=simsAlt[[i]]$BUGS[[name]]$coef.mode
  }
}


ttvals <- read.table("../SIMS/true.dat",header=TRUE)
tvals <- get_cmat(sims,ttvals)

S <- simsum(tvals,ttvals,etime=FALSE)

S$plots$params$data$method[S$plots$params$data$method=="ADMB"]="A";
S$plots$params$data$method[S$plots$params$data$method=="BUGS.GP"]="B.G";
S$plots$params$data$method[S$plots$params$data$method=="BUGS.UP"]="B.U";

S$plots$cisum$data$parameter=as.character(S$plots$cisum$data$parameter);
S$plots$cisum$data$parameter[S$plots$cisum$data$parameter=="logr0"]="r0";
S$plots$cisum$data$parameter[S$plots$cisum$data$parameter=="logSqrtQ"]="Q";
S$plots$cisum$data$parameter[S$plots$cisum$data$parameter=="logSqrtR"]="R";
S$plots$cisum$data$parameter[S$plots$cisum$data$parameter=="logTheta"]="theta";

S$plots$cisum$data[S$plots$cisum$data$summary=="rmse" & S$plots$cisum$data$parameter=="K","value"]=S$plots$cisum$data[S$plots$cisum$data$summary=="rmse" & S$plots$cisum$data$parameter=="K","value"]/100;

S$plots$cisum$data[S$plots$cisum$data$summary=="cilen" & S$plots$cisum$data$parameter=="K","value"]=S$plots$cisum$data[S$plots$cisum$data$summary=="cilen" & S$plots$cisum$data$parameter=="K","value"]/100;

S$plots$cisum$data[S$plots$cisum$data[,1]=="coverage",4]=S$plots$cisum$data[S$plots$cisum$data[,1]=="coverage",4]+rnorm(nrow(S$plots$cisum$data[S$plots$cisum$data[,1]=="coverage",]),sd=0.005);
##plot(tvals$logSqrtQ_lo,ylim=c(-4,0))
##points(tvals$logSqrtQ_hi)
##lines(c(-1, 10),rep(ttvals[4,2],2))

##############
tvals.mode <- get_cmat(simsAlt,ttvals)
S.mode <- simsum(tvals.mode,ttvals,etime=FALSE)

S.mode$plots$params$data$method[S.mode$plots$params$data$method=="ADMB"]="A";
S.mode$plots$params$data$method[S.mode$plots$params$data$method=="BUGS.GP"]="B.G";
S.mode$plots$params$data$method[S.mode$plots$params$data$method=="BUGS.UP"]="B.U";

S.mode$plots$cisum$data$parameter=as.character(S.mode$plots$cisum$data$parameter);
S.mode$plots$cisum$data$parameter[S.mode$plots$cisum$data$parameter=="logr0"]="r0";
S.mode$plots$cisum$data$parameter[S.mode$plots$cisum$data$parameter=="logSqrtQ"]="Q";
S.mode$plots$cisum$data$parameter[S.mode$plots$cisum$data$parameter=="logSqrtR"]="R";
S.mode$plots$cisum$data$parameter[S.mode$plots$cisum$data$parameter=="logTheta"]="theta";

S.mode$plots$cisum$data[S.mode$plots$cisum$data$summary=="rmse" & S.mode$plots$cisum$data$parameter=="K","value"]=S.mode$plots$cisum$data[S.mode$plots$cisum$data$summary=="rmse" & S.mode$plots$cisum$data$parameter=="K","value"]/100;

S.mode$plots$cisum$data[S.mode$plots$cisum$data$summary=="cilen" & S.mode$plots$cisum$data$parameter=="K","value"]=S.mode$plots$cisum$data[S.mode$plots$cisum$data$summary=="cilen" & S.mode$plots$cisum$data$parameter=="K","value"]/100;

S.mode$plots$cisum$data[S.mode$plots$cisum$data[,1]=="coverage",4]=S.mode$plots$cisum$data[S.mode$plots$cisum$data[,1]=="coverage",4]+rnorm(nrow(S.mode$plots$cisum$data[S.mode$plots$cisum$data[,1]=="coverage",]),sd=0.005);
  
  
@ 

\begin{figure}
  \centering
<<timefig,echo=FALSE,fig=TRUE>>=
print(S$plots$time)
@   
  \caption{Timings}
  \label{fig:sim_timings}
\end{figure}

\begin{figure}
  \centering
<<paramfig,echo=FALSE,fig=TRUE>>=
print(S.mode$plots$params)
@   
  \caption{Distribution of parameter estimates}
  \label{fig:sim_params}
\end{figure}


\begin{figure}
  \centering
<<fullparamfig,echo=FALSE,fig=TRUE>>=
print(S.mode$plots$fullparams)


##test
##X11()
##par(mfrow=c(5,4),mar=c(1,1,1,1));
##pars=c("K"="X1","logr0"="X2","logSqrtQ"="X3","logSqrtR"="X4","logTheta"="X5");
##for(mp in pars){
##  for(mt in unique(tvals$method)){
##    lo=tvals[tvals$method==mt,which(names(tvals)==mp)+5];
##    hi=tvals[tvals$method==mt,which(names(tvals)==mp)+10];
##    lohi=numeric(3*length(lo));
 ##   xlohi=lohi;
##    for(i in 1:length(lo)){
##      lohi[(i-1)*3+1]=lo[i];
##      lohi[(i-1)*3+2]=hi[i];
##      lohi[(i-1)*3+3]=NA;
##      xlohi[(i-1)*3+1]=i;
##      xlohi[(i-1)*3+2]=i;
##      xlohi[(i-1)*3+3]=NA;
##    }
    
##    plot(tvals[tvals$method==mt,mp],ylim=c(min(lo)*0.9,max(hi)*1.1),xlab="",ylab="",main=names(which(pars==mp)))
    ##points(lo);
    ##points(hi);
##    lines(xlohi,lohi);
##    trueVal=ttvals[ttvals[,1]==names(which(pars==mp)),2];
##    cat(trueVal," ",names(which(pars==mp))," ",c(min(lo)*0.9,max(hi)*1.1),"\n")
##    lines(c(-1,10000),c(trueVal,trueVal),col=2);
##  }
##}

@   
  \caption{Full distribution of parameters+CIs}
  \label{fig:sim_fullparams}
\end{figure}


\begin{figure}
  \centering
<<coverfig,echo=FALSE,fig=TRUE>>=
print(S.mode$plots$cisum)
@   
  \caption{Confidence interval summaries. Rmse and interval lengths for the parameter $K$ has been divided by 100 to get a better scaling on the y-axis. Coverage fractions has been added a small amount of jitter to avoid overlapping values.}
  \label{fig:cisum}
\end{figure}

\begin{figure}
  \centering
<<paramfig2,echo=FALSE,fig=TRUE>>=
print(S$plots$params)
@   
  \caption{Distribution of parameter estimates using the mean value of the chains for BUGS runs instead of the mode.}
  \label{fig:sim_params_mean}
\end{figure}


\bibliography{../../nonlin}

\end{document}