# C.A.L. Bailer-Jones
# R code used in lecture 8 of the Statistical Methods course,
# "Bayesian modelling using Monte Carlo methods for sampling and marginalization",
# to do Bayesian inference of a 3-parameter linear model for 2D data

library(gplots) # for plotCI()
source("auxiliary/monte_carlo.R") # provides metrop() and make.covariance.matrix()
# Define the function log.post() - given at the end of this file - by cutting and
# pasting it into the R session.

########## Define true model and simulate experimental data from it

set.seed(50)
Ndat <- 10
x <- sort(runif(Ndat, 0, 10))
sigTrue <- 1
modMat <- c(0,1) # 1 x P vector: coefficients, a_p, of polynomial sum_{p=0} a_p*x^p
y <- cbind(1,x) %*% as.matrix(modMat) + rnorm(Ndat, 0, sigTrue)
# Dimensions in matrix multiplication: [Ndat x 1] = [Ndat x P] %*% [P x 1] + [Ndat]
# cbind does the logical thing combining a scalar and a vector; then vector addition
y <- drop(y) # convert to a vector
plotCI(x, y, xlim=c(0,10), uiw=sigTrue, gap=0)
abline(a=modMat[1], b=modMat[2], col="red") # true model
thetaTrue <- c(modMat[1], atan(modMat[2]), log10(sigTrue))
# These are the true parameters, transformed to be conformable with the model used below
data <- data.frame(cbind(x,y)) # only this is used in the analysis
rm(x,y)

########## Define model and infer the posterior PDF over its parameters

# Model to infer: linear regression with Gaussian noise
# Parameters: intercept a_0, gradient a_1; Gaussian noise standard deviation, ysig.
# Prior PDFs over model parameters:
#   a_0:  intercept. P(a_0) ~ N(mean=m, sd=s), a Gaussian with m and s estimated from the data.
#   a_1:  gradient, a_1 = tan(alpha), where alpha is the angle between the horizontal and the model line.
#         P(alpha) ~ 1 [uniform] => P(a_1) ~ 1/(1+a_1^2), but it is easier if we
#         use alpha (in radians) as the model parameter, alpha = atan(a_1).
#   ysig: Jeffreys prior P(ysig) ~ 1/ysig, or equivalently P(log10(ysig)) ~ 1.
#         This is an "improper prior", which means its integral does not converge.
# theta is a 1 x J vector of all model parameters (J=3): theta = c(a_0, alpha, log10(ysig)).
# The sampling is performed on theta. It is defined in this way to make sampling with a
# Gaussian proposal more convenient: (1) linear steps in alpha are better than in tan(alpha);
# (2) ysig cannot be negative, and additive steps in log10(ysig) - i.e. multiplicative steps
# in ysig - are more natural for a standard deviation.

# define covariance matrix of the MCMC sampling (proposal) PDF
sampleCov <- make.covariance.matrix(sampleSD=c(0.1, 0.02, 0.1), sampleCor=0)
# set starting point
thetaInit <- c(2, pi/8, log10(3))
# run the MCMC to find postSamp, samples of the posterior PDF
set.seed(150)
postSamp <- metrop(func=log.post, thetaInit=thetaInit, Nburnin=1e4, Nsamp=1e4,
                   verbose=1e3, sampleCov=sampleCov, data)
# note that 10^(postSamp[,1]+postSamp[,2]) is the unnormalized posterior at each sample
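# Illustrative addition (not part of the original script): quick chain diagnostics
# before plotting. The fraction of iterations at which the chain moved is a crude
# estimate of the Metropolis acceptance rate; a value far from roughly 0.2-0.5
# suggests that the proposal scales in sampleCov should be adjusted.
apply(postSamp[,3:5], 2, function(s) c(mean=mean(s), sd=sd(s)))
mean(diff(postSamp[,3]) != 0) # approximate acceptance rate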
# Plot the MCMC chains and use density estimation to plot the 1D posterior PDFs from them.
# Note that we don't need to do any explicit marginalization to get the 1D PDFs.
parnames <- c("a_0", "alpha = arctan(a_1) / radians", "log10(ysig)")
par(mfrow=c(3,2), mar=c(3.5,3.5,1.0,0.5), oma=0.8*c(1,1,1,1), mgp=c(2.0,0.6,0))
for(p in 3:5) { # columns of postSamp
  plot(1:nrow(postSamp), postSamp[,p], type="l", xlab="iteration", ylab=parnames[p-2])
  postDen <- density(postSamp[,p], n=2^10)
  plot(postDen$x, postDen$y, type="l", xlab=parnames[p-2], ylab="density")
  abline(v=thetaTrue[p-2], col="red")
}

# Plot gradient and intercept samples in 2D
par(mfrow=c(1,2))
plot(postSamp[,3], postSamp[,4], xlab="intercept a_0",
     ylab="alpha = atan(a_1) / radians", pch=".")
plot(postSamp[,3], tan(postSamp[,4]), xlab="intercept a_0", ylab="gradient a_1", pch=".")

# Find the MAP solution and the mean solution.
# MAP = Maximum A Posteriori, i.e. the peak of the posterior.
# The MAP is not the peak of each 1D PDF, but the peak of the full 3D PDF.
# The mean is easy, because the samples have been drawn from the (unnormalized) posterior.
posMAP <- which.max(postSamp[,1]+postSamp[,2])
(thetaMAP  <- postSamp[posMAP, 3:5])
(thetaMean <- apply(postSamp[,3:5], 2, mean)) # Monte Carlo integration

# Overplot these solutions with the original data and the true model
par(mfrow=c(1,1))
plotCI(data$x, data$y, xlim=c(0,10), uiw=sigTrue, gap=0)
abline(a=modMat[1], b=modMat[2], col="red", lwd=2) # true model
abline(a=thetaMAP[1],  b=tan(thetaMAP[2]),  col="blue",  lwd=2) # MAP model
abline(a=thetaMean[1], b=tan(thetaMean[2]), col="green", lwd=2) # mean model
# Compare this with the result of ML estimation from lm()
abline(lm(data$y ~ data$x), col="black", lty=2)

########## Make prediction: determine PDF(ycand | xnew, data)

# Set up a uniform grid of candidate values of y, ycand[], and at each of these
# calculate the probability density (a scalar) by integrating the likelihood
# over the posterior. We do this with the Monte Carlo approximation of the integral.
# The model and likelihood used here must be consistent with log.post() !

xnew <- 6 # then repeat with xnew=15
dy <- 0.01
ymid <- thetaMAP[1] + xnew*tan(thetaMAP[2]) # for centering the choice of ycand
ycand <- seq(ymid-5, ymid+5, dy) # uniform sampling of y with step size dy
ycandPDF <- vector(mode="numeric", length=length(ycand))
# Predict y at all values of the parameters drawn from the posterior by applying the model.
# Dimensions in matrix multiplication: [Nsamp x 1] = [Nsamp x P] %*% [P x 1]
modPred <- cbind(postSamp[,3], tan(postSamp[,4])) %*% t(cbind(1,xnew))
for(k in 1:length(ycand)) {
  like <- dnorm(modPred - ycand[k], mean=0, sd=10^postSamp[,5]) # [Nsamp x 1]
  ycandPDF[k] <- mean(like) # Monte Carlo integration. Gives a scalar
}
# Note that ycandPDF is normalized (to a very good approximation), i.e. sum(dy*ycandPDF) = 1
plot(ycand, ycandPDF, type="l")
# find the peak and the approximate ± 1 sigma confidence interval
# (i.e. the 15.9% and 84.1% quantiles, pnorm(-1) and pnorm(+1))
peak.ind  <- which.max(ycandPDF)
lower.ind <- max( which(cumsum(dy*ycandPDF) < pnorm(-1)) )
upper.ind <- min( which(cumsum(dy*ycandPDF) > pnorm(+1)) )
abline(v=ycand[c(peak.ind, lower.ind, upper.ind)])

# Overplot this prediction with the original data and the models
par(mfrow=c(1,1))
plotCI(data$x, data$y, xlim=range(c(xnew,data$x)), ylim=c(-1,11), uiw=sigTrue, gap=0)
abline(a=modMat[1], b=modMat[2], col="red", lwd=2) # true model
abline(a=thetaMAP[1],  b=tan(thetaMAP[2]),  col="blue",  lwd=2) # MAP model
abline(a=thetaMean[1], b=tan(thetaMean[2]), col="green", lwd=2) # mean model
plotCI(xnew, ycand[peak.ind], li=ycand[lower.ind], ui=ycand[upper.ind],
       gap=0, add=TRUE, col="magenta")
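# Illustrative cross-check (an addition, not part of the original script): the first two
# moments of the predictive PDF can be estimated directly from the posterior samples,
# without the ycand grid. The predictive mean is the mean of the model predictions;
# the predictive variance is the spread of those predictions plus the mean noise variance.
predMean <- mean(modPred)
predSD   <- sqrt(var(drop(modPred)) + mean((10^postSamp[,5])^2))
c(mean=predMean, sd=predSD) # compare with ycand[peak.ind] and the ± 1 sigma interval above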
########## Functions required

# Return the log10 (unnormalized) prior and likelihood as a two-element vector,
# c(logPrior, logLike). Their sum is the log10 unnormalized posterior; metrop()
# stores them as the first two columns of postSamp.
# theta is the vector of parameters; data is a two-column data frame with columns x and y.
# Note that the model and the priors are hard-wired into this function.
# dnorm(..., log=TRUE) returns log base e, so multiply by adj = 1/ln(10) = 0.4342945
# to get log base 10, as required by metrop().
log.post <- function(theta, data) {

  adj <- 0.4342945
  # convert alpha to a_1 and log10(ysig) to ysig
  theta[2] <- tan(theta[2])
  theta[3] <- 10^theta[3]

  # likelihood
  modPred <- drop( theta[1:2] %*% t(cbind(1,data$x)) )
  # Dimensions in mixed vector/matrix multiplication: [Ndat] = [P] %*% [P x Ndat]
  logLike <- adj*sum( dnorm(modPred - data$y, mean=0, sd=theta[3], log=TRUE) ) # scalar

  # prior (not normalized - it doesn't need to be, as we return the unnormalized posterior)
  a0Prior      <- dnorm(theta[1], mean=0, sd=2)
  alphaPrior   <- 1
  logysigPrior <- 1
  logPrior <- sum( log10(a0Prior), log10(alphaPrior), log10(logysigPrior) ) # scalar

  # return the two components of the log10 unnormalized posterior
  c(logPrior, logLike)

}
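# Illustrative sanity check (an addition, not part of the original script): before running
# metrop() above, confirm that log.post() returns finite values at the starting point and
# at the true parameters.
log.post(thetaInit, data)
log.post(thetaTrue, data)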