R scripts for the lecture course
Machine Learning, pattern recognition and statistical data modelling
Coryn A.L. Bailer-Jones, 2007

Basis Expansions and Splines
----------------------------

***** Polynomial fitting

library(MASS)   # provides the GAGurine data set
attach(GAGurine)
#par(mfrow = c(3, 2))
plot(Age, GAG)
GAG.lm <- lm(GAG ~ Age + I(Age^2) + I(Age^3) + I(Age^4) + I(Age^5) + I(Age^6) + I(Age^7) + I(Age^8))
anova(GAG.lm)
# 7th and 8th powers are not significant
GAG.lm2 <- lm(GAG ~ Age + I(Age^2) + I(Age^3) + I(Age^4) + I(Age^5) + I(Age^6))
xx <- seq(0, 17, len = 200)
lines(xx, predict(GAG.lm2, data.frame(Age = xx)))
detach(GAGurine)

***** Splines

# A regression spline with a knot at every point exactly fits all the points
library(fields)   # needed for the rat.diet data set and splint()
x.grid <- seq(0, 120, len = 200)
fit <- splint(rat.diet$t, rat.diet$trt, x.grid)
plot(rat.diet$t, rat.diet$trt)
lines(x.grid, fit)

# Now use the GAGurine data
attach(GAGurine)

# Exact spline (i.e. regression spline with a knot at each point)
age.grid <- seq(min(Age), max(Age), 0.01)
fit <- splint(Age, GAG, age.grid)
plot(Age, GAG)
lines(age.grid, fit)

# Linear fit - do this to inspect the degrees of freedom
lm1 <- lm(GAG ~ Age)
# Need to put the grid in a named data frame so that predict.lm recognises it properly
age.grid <- data.frame(seq(min(Age), max(Age), 0.1))
colnames(age.grid) <- "Age"
plot(Age, GAG)
lines(age.grid$Age, predict(lm1, age.grid), col='red', lwd=2)
# We know that df=2. The following gives 312, i.e. the total number of d.o.f
# available is equal to the number of measurements, which is 314
lm1$df.residual

# For some reason sreg does not work on the GAGurine data, e.g.
# predict(sreg(Age, GAG))
# gives errors, even though it is the same syntax as the example on the sreg help page

# Regression spline with fixed knots and no smoothing - smooth.spline() with spar=0
# See how varying nknots affects the fit. We need nknots >= 4 to avoid errors, and the
# maximum nknots is the number of points in the data
library(fields)
plot(rat.diet$t, rat.diet$trt)
ss <- smooth.spline(rat.diet$t, rat.diet$trt, spar=0, nknots=20)
eval <- seq(min(rat.diet$t), max(rat.diet$t), 1)
# predict(ss, eval) gives both x and y, so when we plot we just do
lines(predict(ss, eval), col='blue')
# Directly overplot fits with different numbers of knots
lines(predict(smooth.spline(rat.diet$t, rat.diet$trt, spar=0, nknots=5), eval), col='blue')

# See how df varies with nknots
knot.values <- seq(4, 39, 1)
Nval <- length(knot.values)
dof <- vector(length=Nval)
for (k in 1:Nval) {
  dof[k] <- smooth.spline(rat.diet$t, rat.diet$trt, spar=0, nknots=knot.values[k])$df
}
plot(knot.values, dof)
# If you repeat this but with smoothing, i.e. with non-zero spar or non-zero df, you see
# some interesting things. But generally we don't want to have to specify the number of
# knots; we would rather use all the points as knots. See below!

# Smoothing spline - sreg
library(fields)
?sreg
plot(rat.diet$t, rat.diet$trt)
eval <- seq(min(rat.diet$t), max(rat.diet$t), 1)
# If we don't specify the dof then it is chosen by GCV
sreg1 <- sreg(rat.diet$t, rat.diet$trt)
# Note the effective d.o.f in the following
sreg1
lines(eval, predict(sreg1, eval), col='red', lwd=1)
# Now try various values of df and overplot in different colours.
# Try: 2, 3, 5, 20, 30, 39 (the last is the maximum)
sreg1 <- sreg(rat.diet$t, rat.diet$trt, df=2)
lines(eval, predict(sreg1, eval), col='blue')
# Look at the extremes: df=2 (maximum smoothing) gives a straight line.
# df=infinity, i.e. lambda=0 (no regularization, i.e. a regression spline), doesn't work with
# this function: we must use splint instead. Indeed, the maximum df is the number of points.
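
# Another way to get an unpenalized regression spline is to build the basis expansion
# explicitly and fit it with lm(). The following is a minimal sketch (an addition, not from
# the lecture) using ns() from the base splines package: it fits a natural cubic spline
# basis to the GAGurine data, with df=6 (interior knots at quantiles of Age) being an
# arbitrary illustrative choice rather than a recommended value.
library(splines)
library(MASS)    # for GAGurine (already loaded above)
ns.fit <- lm(GAG ~ ns(Age, df=6), data=GAGurine)
ns.grid <- seq(min(GAGurine$Age), max(GAGurine$Age), 0.1)
plot(GAGurine$Age, GAGurine$GAG, xlab='Age', ylab='GAG')
lines(ns.grid, predict(ns.fit, data.frame(Age=ns.grid)), col='green', lwd=2)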

# See how the RMSE on the total data set varies with the d.o.f (this is a dummy run to show
# that we actually have to do CV to find the parameter!)
rms <- function(x) {sqrt( (1/(length(x)-1)) * sum(x^2) )}
dof.values <- seq(2, 20, 0.1)
Nval <- length(dof.values)
rmse <- vector(length=Nval)
for (k in 1:Nval) {
  sreg1 <- sreg(rat.diet$t, rat.diet$trt, df=dof.values[k])
  resid <- predict(sreg1, rat.diet$t) - rat.diet$trt
  rmse[k] <- rms(resid)
}
plot(dof.values, rmse, xlab='effective degrees of freedom', ylab='RMS')
# Question: what's happening here? Why does the error keep going down?
# Answer: we're fitting all the data. There's no validation set, so we're not testing the
# generalization performance.

# Now carry out an exhaustive search for the optimal df, evaluating each fit on a held-out
# validation set (a simple form of cross-validation)
set.seed(10)
train <- sample(1:nrow(rat.diet), 20)
rms <- function(x) {sqrt( (1/(length(x)-1)) * sum(x^2) )}
dof.values <- seq(2, 20, 0.1)
Nval <- length(dof.values)
rmse2 <- vector(length=Nval)
for (k in 1:Nval) {
  sreg1 <- sreg(rat.diet$t[train], rat.diet$trt[train], df=dof.values[k])
  resid <- predict(sreg1, rat.diet$t[-train]) - rat.diet$trt[-train]
  rmse2[k] <- rms(resid)
}
plot(dof.values, rmse2, xlab='effective degrees of freedom', ylab='RMS', ylim=c(0,8))
lines(dof.values, rmse, col='red')
min(rmse2)
which.min(rmse2)
dof.values[which.min(rmse2)]
# = 3.9 with seed=100, = 7.3 with seed=50
# The minimum found by GCV was df=7.5
# With some training sets (i.e. some seeds) we see a second minimum near df=7.5.
# The result is rather sensitive to the training set selection, which is perhaps not
# surprising as there are only 39 points in the data set. The most common minimum is
# around 7, but it is very shallow.

# Plot these as a comparison
plot(rat.diet$t, rat.diet$trt)
lines(eval, predict(sreg(rat.diet$t, rat.diet$trt, df=3.9), eval), col='blue', lwd=2)
lines(eval, predict(sreg(rat.diet$t, rat.diet$trt, df=8), eval), col='red', lwd=2)

# As noted above, if sreg() is used without specifying the df then this is found by GCV.
# The results of this are retained in the sreg object and can be inspected by calling
# plot() on it (i.e. plot.sreg):
sreg1 <- sreg(rat.diet$t, rat.diet$trt)
plot(sreg1)
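
# The hold-out estimate above depends quite strongly on which 20 points land in the training
# set. A minimal sketch (an addition, not from the lecture) of one way to stabilise it:
# average the validation RMS over several random splits. The number of repeats (25), the
# split size (20) and the df grid are arbitrary choices here; this assumes the fields
# package is loaded and rms() is defined as above.
set.seed(10)
nrep <- 25
dof.values <- seq(2, 15, 0.5)
cv.rmse <- matrix(nrow=nrep, ncol=length(dof.values))
for (r in 1:nrep) {
  train <- sample(1:nrow(rat.diet), 20)
  for (k in seq_along(dof.values)) {
    fit <- sreg(rat.diet$t[train], rat.diet$trt[train], df=dof.values[k])
    resid <- predict(fit, rat.diet$t[-train]) - rat.diet$trt[-train]
    cv.rmse[r, k] <- rms(resid)
  }
}
# Average over the repeats and plot against the effective degrees of freedom
plot(dof.values, colMeans(cv.rmse), type='l',
     xlab='effective degrees of freedom', ylab='mean hold-out RMS')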