# C.A.L. Bailer-Jones
# R code used in lecture 8 of the Statistical Methods course,
# "Bayesian modelling using Monte Carlo methods for sampling and marginalization",
# to do Bayesian inference of a 3-parameter linear model for 2D data

library(gplots) # for plotCI()
source("auxiliary/monte_carlo.R") # provides metrop() and make.covariance.matrix()
# Define the function log.post() - given at the end of this file - by cutting and
# pasting it into the R session.

########## Define true model and simulate experimental data from it

set.seed(50)
Ndat <- 10
x <- sort(runif(Ndat, 0, 10))
sigTrue <- 1
modMat <- c(0,1) # 1 x P vector: coefficients, a_p, of polynomial sum_{p=0} a_p*x^p
y <- cbind(1,x) %*% as.matrix(modMat) + rnorm(Ndat, 0, sigTrue)
# Dimensions in matrix multiplication: [Ndat x 1] = [Ndat x P] %*% [P x 1] + [Ndat]
# cbind does the logical thing combining a scalar and a vector; then vector addition
y <- drop(y) # convert to a vector
plotCI(x, y, xlim=c(0,10), uiw=sigTrue, gap=0)
abline(a=modMat[1], b=modMat[2], col="red") # true model
thetaTrue <- c(modMat[1], atan(modMat[2]), log10(sigTrue))
# These are the true parameters, transformed to be conformable with the model used below
data <- data.frame(cbind(x,y)) # only this is used in the analysis
rm(x,y)

########## Define model and infer the posterior PDF over its parameters

# Model to infer: linear regression with Gaussian noise
# Parameters: intercept a_0, gradient a_1; Gaussian noise standard deviation, ysig.
# Prior PDFs over model parameters:
#   a_0:  intercept. P(a_0) ~ N(mean=m, sd=s), a Gaussian with m and s estimated from the data.
#   a_1:  gradient, a_1 = tan(alpha), where alpha is the angle between the horizontal and the model line.
#         P(alpha) ~ 1 [uniform] => P(a_1) ~ 1/(1+a_1^2), but it is easier if we
#         use alpha (in radians) as the model parameter, alpha = atan(a_1).
#   ysig: Jeffreys prior P(ysig) ~ 1/ysig, or equivalently P(log10(ysig)) ~ 1.
#         This is an "improper prior", which means its integral does not converge.
# theta is a 1 x J vector of all model parameters (J=3): theta = c(a_0, alpha, log10(ysig)).
# The sampling is performed on theta. It is defined in this way to make sampling with a
# Gaussian proposal more convenient: (1) linear steps in alpha are better than in tan(alpha);
# (2) ysig cannot be negative, and additive steps in log10(ysig) - i.e. multiplicative steps
# in ysig - are more natural for a standard deviation.

# define covariance matrix of the MCMC sampling (proposal) PDF
sampleCov <- make.covariance.matrix(sampleSD=c(0.1, 0.02, 0.1), sampleCor=0)
# set starting point
thetaInit <- c(2, pi/8, log10(3))
# run the MCMC to find postSamp, samples of the posterior PDF
set.seed(150)
postSamp <- metrop(func=log.post, thetaInit=thetaInit, Nburnin=1e4, Nsamp=1e4,
                   verbose=1e3, sampleCov=sampleCov, data)
# note that 10^(postSamp[,1]+postSamp[,2]) is the unnormalized posterior at each sample
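# Illustrative addition (not part of the original script): quick chain diagnostics
# before plotting. The fraction of iterations at which the chain moved is a crude
# estimate of the Metropolis acceptance rate; a value far from roughly 0.2-0.5
# suggests that the proposal scales in sampleCov should be adjusted.
apply(postSamp[,3:5], 2, function(s) c(mean=mean(s), sd=sd(s)))
mean(diff(postSamp[,3]) != 0) # approximate acceptance rate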
# Plot the MCMC chains and use density estimation to plot the 1D posterior PDFs from them.
# Note that we don't need to do any explicit marginalization to get the 1D PDFs.
parnames <- c("a_0", "alpha = arctan(a_1) / radians", "log10(ysig)")
par(mfrow=c(3,2), mar=c(3.5,3.5,1.0,0.5), oma=0.8*c(1,1,1,1), mgp=c(2.0,0.6,0))
for(p in 3:5) { # columns of postSamp
  plot(1:nrow(postSamp), postSamp[,p], type="l", xlab="iteration", ylab=parnames[p-2])
  postDen <- density(postSamp[,p], n=2^10)
  plot(postDen$x, postDen$y, type="l", xlab=parnames[p-2], ylab="density")
  abline(v=thetaTrue[p-2], col="red")
}

# Plot gradient and intercept samples in 2D
par(mfrow=c(1,2))
plot(postSamp[,3], postSamp[,4], xlab="intercept a_0",
     ylab="alpha = atan(a_1) / radians", pch=".")
plot(postSamp[,3], tan(postSamp[,4]), xlab="intercept a_0", ylab="gradient a_1", pch=".")

# Find the MAP solution and the mean solution.
# MAP = Maximum A Posteriori, i.e. the peak of the posterior.
# The MAP is not the peak of each 1D PDF, but the peak of the full 3D PDF.
# The mean is easy, because the samples have been drawn from the (unnormalized) posterior.
posMAP <- which.max(postSamp[,1]+postSamp[,2])
(thetaMAP  <- postSamp[posMAP, 3:5])
(thetaMean <- apply(postSamp[,3:5], 2, mean)) # Monte Carlo integration

# Overplot these solutions with the original data and the true model
par(mfrow=c(1,1))
plotCI(data$x, data$y, xlim=c(0,10), uiw=sigTrue, gap=0)
abline(a=modMat[1], b=modMat[2], col="red", lwd=2) # true model
abline(a=thetaMAP[1],  b=tan(thetaMAP[2]),  col="blue",  lwd=2) # MAP model
abline(a=thetaMean[1], b=tan(thetaMean[2]), col="green", lwd=2) # mean model
# Compare this with the result of ML estimation from lm()
abline(lm(data$y ~ data$x), col="black", lty=2)

########## Make prediction: determine PDF(ycand | xnew, data)

# Set up a uniform grid of candidate values of y, ycand[], and at each of these
# calculate the probability density (a scalar) by integrating the likelihood
# over the posterior. We do this with the Monte Carlo approximation of the integral.
# The model and likelihood used here must be consistent with log.post() !

xnew <- 6 # then repeat with xnew=15
dy <- 0.01
ymid <- thetaMAP[1] + xnew*tan(thetaMAP[2]) # for centering the choice of ycand
ycand <- seq(ymid-5, ymid+5, dy) # uniform sampling of y with step size dy
ycandPDF <- vector(mode="numeric", length=length(ycand))
# Predict y at all values of the parameters drawn from the posterior by applying the model.
# Dimensions in matrix multiplication: [Nsamp x 1] = [Nsamp x P] %*% [P x 1]
modPred <- cbind(postSamp[,3], tan(postSamp[,4])) %*% t(cbind(1,xnew))
for(k in 1:length(ycand)) {
  like <- dnorm(modPred - ycand[k], mean=0, sd=10^postSamp[,5]) # [Nsamp x 1]
  ycandPDF[k] <- mean(like) # Monte Carlo integration. Gives a scalar
}
# Note that ycandPDF is normalized (to a very good approximation), i.e. sum(dy*ycandPDF) = 1
plot(ycand, ycandPDF, type="l")
# find the peak and the approximate ± 1 sigma confidence interval
# (i.e. the 15.9% and 84.1% quantiles, pnorm(-1) and pnorm(+1))
peak.ind  <- which.max(ycandPDF)
lower.ind <- max( which(cumsum(dy*ycandPDF) < pnorm(-1)) )
upper.ind <- min( which(cumsum(dy*ycandPDF) > pnorm(+1)) )
abline(v=ycand[c(peak.ind, lower.ind, upper.ind)])

# Overplot this prediction with the original data and the models
par(mfrow=c(1,1))
plotCI(data$x, data$y, xlim=range(c(xnew,data$x)), ylim=c(-1,11), uiw=sigTrue, gap=0)
abline(a=modMat[1], b=modMat[2], col="red", lwd=2) # true model
abline(a=thetaMAP[1],  b=tan(thetaMAP[2]),  col="blue",  lwd=2) # MAP model
abline(a=thetaMean[1], b=tan(thetaMean[2]), col="green", lwd=2) # mean model
plotCI(xnew, ycand[peak.ind], li=ycand[lower.ind], ui=ycand[upper.ind],
       gap=0, add=TRUE, col="magenta")
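# Illustrative cross-check (an addition, not part of the original script): the first two
# moments of the predictive PDF can be estimated directly from the posterior samples,
# without the ycand grid. The predictive mean is the mean of the model predictions;
# the predictive variance is the spread of those predictions plus the mean noise variance.
predMean <- mean(modPred)
predSD   <- sqrt(var(drop(modPred)) + mean((10^postSamp[,5])^2))
c(mean=predMean, sd=predSD) # compare with ycand[peak.ind] and the ± 1 sigma interval above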
########## Functions required

# Return the log10 (unnormalized) prior and likelihood as a two-element vector,
# c(logPrior, logLike). Their sum is the log10 unnormalized posterior; metrop()
# stores them as the first two columns of postSamp.
# theta is the vector of parameters; data is a two-column data frame with columns x and y.
# Note that the model and the priors are hard-wired into this function.
# dnorm(..., log=TRUE) returns log base e, so multiply by adj = 1/ln(10) = 0.4342945
# to get log base 10, as required by metrop().
log.post <- function(theta, data) {

  adj <- 0.4342945
  # convert alpha to a_1 and log10(ysig) to ysig
  theta[2] <- tan(theta[2])
  theta[3] <- 10^theta[3]

  # likelihood
  modPred <- drop( theta[1:2] %*% t(cbind(1,data$x)) )
  # Dimensions in mixed vector/matrix multiplication: [Ndat] = [P] %*% [P x Ndat]
  logLike <- adj*sum( dnorm(modPred - data$y, mean=0, sd=theta[3], log=TRUE) ) # scalar

  # prior (not normalized - it doesn't need to be, as we return the unnormalized posterior)
  a0Prior      <- dnorm(theta[1], mean=0, sd=2)
  alphaPrior   <- 1
  logysigPrior <- 1
  logPrior <- sum( log10(a0Prior), log10(alphaPrior), log10(logysigPrior) ) # scalar

  # return the two components of the log10 unnormalized posterior
  c(logPrior, logLike)

}
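# Illustrative sanity check (an addition, not part of the original script): before running
# metrop() above, confirm that log.post() returns finite values at the starting point and
# at the true parameters.
log.post(thetaInit, data)
log.post(thetaTrue, data)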