R scripts for the lecture course
Machine Learning, pattern recognition and statistical data modelling
Coryn A.L. Bailer-Jones, 2007

Basis Expansions and Splines
----------------------------

***** Polynomial fitting

library(MASS)   # provides the GAGurine data set
attach(GAGurine)
#par(mfrow = c(3, 2))
plot(Age, GAG)
GAG.lm <- lm(GAG ~ Age + I(Age^2) + I(Age^3) + I(Age^4) + I(Age^5) + I(Age^6) + I(Age^7) + I(Age^8))
anova(GAG.lm)
# 7th and 8th powers are not significant
GAG.lm2 <- lm(GAG ~ Age + I(Age^2) + I(Age^3) + I(Age^4) + I(Age^5) + I(Age^6))
xx <- seq(0, 17, len = 200)
lines(xx, predict(GAG.lm2, data.frame(Age = xx)))
detach(GAGurine)

***** Splines

# A regression spline with a knot at every point exactly fits all the points
library(fields)   # needed for the rat.diet data set and splint()
x.grid <- seq(0, 120, len = 200)
fit <- splint(rat.diet$t, rat.diet$trt, x.grid)
plot(rat.diet$t, rat.diet$trt)
lines(x.grid, fit)

# Now use the GAGurine data
attach(GAGurine)

# Exact spline (i.e. regression spline with a knot at each point)
age.grid <- seq(min(Age), max(Age), 0.01)
fit <- splint(Age, GAG, age.grid)
plot(Age, GAG)
lines(age.grid, fit)

# Linear fit - do this to inspect the degrees of freedom
lm1 <- lm(GAG ~ Age)
# Need to put the grid in a named data frame so that predict.lm recognises it properly
age.grid <- data.frame(seq(min(Age), max(Age), 0.1))
colnames(age.grid) <- "Age"
plot(Age, GAG)
lines(age.grid$Age, predict(lm1, age.grid), col='red', lwd=2)
# We know that df=2. The following gives 312, i.e. the total number of d.o.f
# available is equal to the number of measurements, which is 314
lm1$df.residual

# For some reason sreg does not work on the GAGurine data, e.g.
# predict(sreg(Age, GAG))
# gives errors, even though it is the same syntax as the example on the sreg help page

# Regression spline with fixed knots and no smoothing - smooth.spline() with spar=0
# See how varying nknots affects the fit. We need nknots >= 4 to avoid errors, and the
# maximum nknots is the number of points in the data
library(fields)
plot(rat.diet$t, rat.diet$trt)
ss <- smooth.spline(rat.diet$t, rat.diet$trt, spar=0, nknots=20)
eval <- seq(min(rat.diet$t), max(rat.diet$t), 1)
# predict(ss, eval) gives both x and y, so when we plot we just do
lines(predict(ss, eval), col='blue')
# Directly overplot fits with different numbers of knots
lines(predict(smooth.spline(rat.diet$t, rat.diet$trt, spar=0, nknots=5), eval), col='blue')

# See how df varies with nknots
knot.values <- seq(4, 39, 1)
Nval <- length(knot.values)
dof <- vector(length=Nval)
for (k in 1:Nval) {
  dof[k] <- smooth.spline(rat.diet$t, rat.diet$trt, spar=0, nknots=knot.values[k])$df
}
plot(knot.values, dof)
# If you repeat this but with smoothing, i.e. with non-zero spar or non-zero df, you see
# some interesting things. But generally we don't want to have to specify the number of
# knots; we would rather use all the points as knots. See below!

# Smoothing spline - sreg
library(fields)
?sreg
plot(rat.diet$t, rat.diet$trt)
eval <- seq(min(rat.diet$t), max(rat.diet$t), 1)
# If we don't specify the dof then it is chosen by GCV
sreg1 <- sreg(rat.diet$t, rat.diet$trt)
# Note the effective d.o.f in the following
sreg1
lines(eval, predict(sreg1, eval), col='red', lwd=1)
# Now try various values of df and overplot in different colours.
# Try: 2, 3, 5, 20, 30, 39 (the last is the maximum)
sreg1 <- sreg(rat.diet$t, rat.diet$trt, df=2)
lines(eval, predict(sreg1, eval), col='blue')
# Look at the extremes: df=2 (maximum smoothing) gives a straight line.
# df=infinity, i.e. lambda=0 (no regularization, i.e. a regression spline), doesn't work with
# this function: we must use splint instead. Indeed, the maximum df is the number of points.
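
# Another way to get an unpenalized regression spline is to build the basis expansion
# explicitly and fit it with lm(). The following is a minimal sketch (an addition, not from
# the lecture) using ns() from the base splines package: it fits a natural cubic spline
# basis to the GAGurine data, with df=6 (interior knots at quantiles of Age) being an
# arbitrary illustrative choice rather than a recommended value.
library(splines)
library(MASS)    # for GAGurine (already loaded above)
ns.fit <- lm(GAG ~ ns(Age, df=6), data=GAGurine)
ns.grid <- seq(min(GAGurine$Age), max(GAGurine$Age), 0.1)
plot(GAGurine$Age, GAGurine$GAG, xlab='Age', ylab='GAG')
lines(ns.grid, predict(ns.fit, data.frame(Age=ns.grid)), col='green', lwd=2)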

# See how the RMSE on the total data set varies with the d.o.f (this is a dummy run to show
# that we actually have to do CV to find the parameter!)
rms <- function(x) {sqrt( (1/(length(x)-1)) * sum(x^2) )}
dof.values <- seq(2, 20, 0.1)
Nval <- length(dof.values)
rmse <- vector(length=Nval)
for (k in 1:Nval) {
  sreg1 <- sreg(rat.diet$t, rat.diet$trt, df=dof.values[k])
  resid <- predict(sreg1, rat.diet$t) - rat.diet$trt
  rmse[k] <- rms(resid)
}
plot(dof.values, rmse, xlab='effective degrees of freedom', ylab='RMS')
# Question: what's happening here? Why does the error keep going down?
# Answer: we're fitting all the data. There's no validation set, so we're not testing the
# generalization performance.

# Now carry out an exhaustive search for the optimal df, evaluating each fit on a held-out
# validation set (a simple form of cross-validation)
set.seed(10)
train <- sample(1:nrow(rat.diet), 20)
rms <- function(x) {sqrt( (1/(length(x)-1)) * sum(x^2) )}
dof.values <- seq(2, 20, 0.1)
Nval <- length(dof.values)
rmse2 <- vector(length=Nval)
for (k in 1:Nval) {
  sreg1 <- sreg(rat.diet$t[train], rat.diet$trt[train], df=dof.values[k])
  resid <- predict(sreg1, rat.diet$t[-train]) - rat.diet$trt[-train]
  rmse2[k] <- rms(resid)
}
plot(dof.values, rmse2, xlab='effective degrees of freedom', ylab='RMS', ylim=c(0,8))
lines(dof.values, rmse, col='red')
min(rmse2)
which.min(rmse2)
dof.values[which.min(rmse2)]
# = 3.9 with seed=100, = 7.3 with seed=50
# The minimum found by GCV was df=7.5
# With some training sets (i.e. some seeds) we see a second minimum near df=7.5.
# The result is rather sensitive to the training set selection, which is perhaps not
# surprising as there are only 39 points in the data set. The most common minimum is
# around 7, but it is very shallow.

# Plot these as a comparison
plot(rat.diet$t, rat.diet$trt)
lines(eval, predict(sreg(rat.diet$t, rat.diet$trt, df=3.9), eval), col='blue', lwd=2)
lines(eval, predict(sreg(rat.diet$t, rat.diet$trt, df=8), eval), col='red', lwd=2)

# As noted above, if sreg() is used without specifying the df then this is found by GCV.
# The results of this are retained in the sreg object and can be inspected by calling
# plot() on it (i.e. plot.sreg):
sreg1 <- sreg(rat.diet$t, rat.diet$trt)
plot(sreg1)
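
# The hold-out estimate above depends quite strongly on which 20 points land in the training
# set. A minimal sketch (an addition, not from the lecture) of one way to stabilise it:
# average the validation RMS over several random splits. The number of repeats (25), the
# split size (20) and the df grid are arbitrary choices here; this assumes the fields
# package is loaded and rms() is defined as above.
set.seed(10)
nrep <- 25
dof.values <- seq(2, 15, 0.5)
cv.rmse <- matrix(nrow=nrep, ncol=length(dof.values))
for (r in 1:nrep) {
  train <- sample(1:nrow(rat.diet), 20)
  for (k in seq_along(dof.values)) {
    fit <- sreg(rat.diet$t[train], rat.diet$trt[train], df=dof.values[k])
    resid <- predict(fit, rat.diet$t[-train]) - rat.diet$trt[-train]
    cv.rmse[r, k] <- rms(resid)
  }
}
# Average over the repeats and plot against the effective degrees of freedom
plot(dof.values, colMeans(cv.rmse), type='l',
     xlab='effective degrees of freedom', ylab='mean hold-out RMS')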