R scripts for the lecture course
Machine Learning, pattern recognition and statistical data modelling
Coryn A.L. Bailer-Jones, 2007

Data exploration
----------------

*** Density estimation

# Figure 5.8 p. 127 in MASS4
library(MASS)   # provides truehist, kde2d, width.SJ and the geyser data
geyser          # the Old Faithful data: waiting time and eruption duration
attach(geyser)
par(mfrow=c(2,3))
# histograms are sensitive to the placing of the bin centres
truehist(duration, h=0.5, x0=0.0, xlim=c(0, 6), ymax=0.7)
truehist(duration, h=0.5, x0=0.1, xlim=c(0, 6), ymax=0.7)
truehist(duration, h=0.5, x0=0.2, xlim=c(0, 6), ymax=0.7)
truehist(duration, h=0.5, x0=0.3, xlim=c(0, 6), ymax=0.7)
truehist(duration, h=0.5, x0=0.4, xlim=c(0, 6), ymax=0.7)
# Improve by averaging the five shifted histograms
breaks <- seq(0, 5.9, 0.1)
counts <- numeric(length(breaks))
for(i in (0:4)) counts[i+(1:55)] <- counts[i+(1:55)] +
    rep(hist(duration, breaks=0.1*i + seq(0, 5.5, 0.5), prob=TRUE,
             plot=FALSE)$density, rep(5,11))
plot(breaks+0.05, counts/5, type="l", xlab="duration", ylab="averaged",
     bty="n", xlim=c(0, 6), ylim=c(0, 0.7))
detach()

So use Gaussian kernel density estimation instead.

# Fig. 5.9 on p. 128 of MASS4
# nrd, SJ and SJ-dpi are different methods for choosing the fixed bandwidth;
# do ?bw.nrd for more details. The values used here are:
#   bw.nrd(geyser$duration)              = 0.389
#   bw.SJ(geyser$duration)               = 0.090
#   bw.SJ(geyser$duration, method="dpi") = 0.144
attach(geyser)
par(mfrow=c(1,2))
truehist(duration, nbins=15, xlim=c(0.5, 6), ymax=1.2)
lines(density(duration, width="nrd"))
truehist(duration, nbins=15, xlim=c(0.5, 6), ymax=1.2)
lines(density(duration, width="SJ", n=256), lty=3)   # dotted line
lines(density(duration, n=256, width="SJ-dpi"), lty=1)
detach()

*** Density estimation in 2D

# Fig. 5.11 on p. 131 of MASS4
# pair each waiting time with the duration of the previous eruption
geyser2 <- data.frame(as.data.frame(geyser)[-1, ],
                      pduration=geyser$duration[-299])
attach(geyser2)
par(mfrow=c(2, 2), mgp=c(1.8,0.5,0), mar=c(3,3,3,3))
plot(pduration, waiting, xlim=c(0.5, 6), ylim=c(40, 110),
     xlab="previous duration", ylab="waiting")
f1 <- kde2d(pduration, waiting, n=50, lims=c(0.5, 6, 40, 110))
# on a linear scale
image(f1, zlim=c(0, 0.075), xlab="previous duration", ylab="waiting")
# to get it on a log scale do:
# image(f1$x, f1$y, log(f1$z), zlim=c(-40, 0), xlab="previous duration",
#       ylab="waiting")
f2 <- kde2d(pduration, waiting, n=50, lims=c(0.5, 6, 40, 110),
            h=c(width.SJ(duration), width.SJ(waiting)))
image(f2, zlim=c(0, 0.075), xlab="previous duration", ylab="waiting")
persp(f2, phi=30, theta=20, d=5, xlab="previous duration",
      ylab="waiting", zlab="")
detach()

*** Classification via density modelling (and MLE)

# Generate two clusters of Gaussian data
set.seed(20)
x1 <- rnorm(100, mean=0, sd=0.50)
y1 <- rnorm(100, mean=0, sd=0.50)
x2 <- rnorm(100, mean=1, sd=0.70)
y2 <- rnorm(100, mean=1, sd=0.30)
plot(x1, y1, type='n', xlim=c(-1.5,3.0), ylim=c(-1.5,3.0), xlab='x', ylab='y')
points(x1, y1, pch=19, col=2)
points(x2, y2, pch=23, col=3, bg=3)
# infer the parameters of each class PDF (the MLEs of the means and standard
# deviations) on the assumption that x and y are independent
( mean1 <- c(mean(x1), mean(y1)) )
( mean2 <- c(mean(x2), mean(y2)) )
( sd1 <- c(sd(x1), sd(y1)) )
( sd2 <- c(sd(x2), sd(y2)) )
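These fitted parameters already define a classifier: assign a point to the
class with the larger class-conditional density. The following is a minimal
sketch, not part of the original script; it assumes equal class priors, and
lik1/lik2 are illustrative names.

# sketch: compare the two fitted class-conditional densities at a test point;
# under the independence assumption p(x,y|class) = p(x|class) * p(y|class)
lik1 <- function(x, y) dnorm(x, mean1[1], sd1[1]) * dnorm(y, mean1[2], sd1[2])
lik2 <- function(x, y) dnorm(x, mean2[1], sd2[1]) * dnorm(y, mean2[2], sd2[2])
# e.g. a point lying between the two cluster centres
lik1(0.5, 0.5)
lik2(0.5, 0.5)
ifelse(lik1(0.5, 0.5) > lik2(0.5, 0.5), "class 1", "class 2")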
# build covariance matrices and work out eigenvalues and eigenvectors,
# i.e. without assuming x and y are independent
covmat1 <- matrix( c(cov(x1,x1), cov(x1,y1), cov(y1,x1), cov(y1,y1)), nrow=2 )
covmat2 <- matrix( c(cov(x2,x2), cov(x2,y2), cov(y2,x2), cov(y2,y2)), nrow=2 )
eigen1 <- eigen(covmat1, symmetric=TRUE)
eigen2 <- eigen(covmat2, symmetric=TRUE)
# the standard deviations along the principal directions are
sqrt(eigen1$values)
sqrt(eigen2$values)
# note the principal directions (eigenvectors) for class 1: the underlying
# Gaussian is symmetric, so the principal directions could happily lie in
# any direction
# Alternatively use SVD, e.g.
svd(covmat2)
sqrt(svd(covmat2)$d)
# overplot the fitted Gaussians, using the fact that x and y were modelled
# as independent
xgrid <- seq(-1, 3, 0.1)
ygrid <- seq(-1, 3, 0.1)
contour( xgrid, ygrid, dnorm(xgrid, mean=mean(x1), sd=sd(x1)) %o%
           dnorm(ygrid, mean=mean(y1), sd=sd(y1)), col=2, add=TRUE )
contour( xgrid, ygrid, dnorm(xgrid, mean=mean(x2), sd=sd(x2)) %o%
           dnorm(ygrid, mean=mean(y2), sd=sd(y2)), col=3, add=TRUE )

For Gaussian class-conditional densities the decision boundaries are
quadratic in general. They are linear only if the two class covariance
matrices are equal.
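To see the shape of the boundary, one can evaluate the log ratio of the two
full-covariance Gaussian fits on a grid and overplot its zero contour on the
scatter plot above. This is a sketch, not part of the original script; it
assumes equal class priors, and logdens is an illustrative helper.

# sketch: decision boundary implied by the full-covariance Gaussian fits
# log N(z; mu, Sigma) = -(1/2) (z-mu)' Sigma^-1 (z-mu)
#                       - (1/2) log det(Sigma) + const
logdens <- function(x, y, mu, covmat) {
  z <- rbind(x - mu[1], y - mu[2])   # 2 x n matrix of offsets from the mean
  -0.5 * colSums(z * (solve(covmat) %*% z)) - 0.5 * log(det(covmat))
}
xseq <- seq(-1.5, 3, 0.05)
yseq <- seq(-1.5, 3, 0.05)
grid <- expand.grid(x=xseq, y=yseq)
dlog <- logdens(grid$x, grid$y, mean1, covmat1) -
          logdens(grid$x, grid$y, mean2, covmat2)
# the zero contour of the log density ratio is the decision boundary;
# it is curved (quadratic) because covmat1 and covmat2 differ
contour(xseq, yseq, matrix(dlog, nrow=length(xseq)), levels=0,
        drawlabels=FALSE, add=TRUE, lwd=2)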