R scripts for the lecture course
Machine Learning, pattern recognition and statistical data modelling
Coryn A.L. Bailer-Jones, 2007

Unsupervised learning
---------------------

Data

library(cluster)
# swiss.x contains 5 socio-economic indicators for each of 47 Swiss
# provinces from 1888 (the first column, Fertility, is dropped)
swiss.x <- as.matrix(swiss[,-1])

K-means clustering

# select Nclust cluster centres as a random selection of Nclust vectors from the data
Nclust <- 3
Imax <- 5
set.seed(100)
init <- sample(dim(swiss.x)[1], Nclust) # these are the initial Nclust prototypes
km <- kmeans(swiss.x, centers=swiss.x[init,], iter.max=Imax)
# plot the data using these two columns. This is just a projection, so the clusters may not
# be at all obvious, as is the case with i <- 2 ; j <- 5
i <- 1 ; j <- 2
plot(swiss.x[,i], swiss.x[,j], pch=20)
title(main=paste(Nclust, 'clusters with', Imax, 'iterations'), cex.main=0.75)
points(km$centers[,i], km$centers[,j], pch=3, cex=1.4, col=c('blue', 'green', 'red'))
sel <- which(km$cluster==1) ; points(swiss.x[sel,i], swiss.x[sel,j], pch=20, col='blue')
sel <- which(km$cluster==2) ; points(swiss.x[sel,i], swiss.x[sel,j], pch=20, col='green')
sel <- which(km$cluster==3) ; points(swiss.x[sel,i], swiss.x[sel,j], pch=20, col='red')

# Use the following to produce plots for different Imax
par(mfrow=c(2, 2), mgp=c(1.8,0.5,0), mar=c(3,3,3,3))
i <- 1 ; j <- 2
plot(swiss.x[,i], swiss.x[,j], pch=20)
title(main=paste(Nclust, 'initial cluster centres'), cex.main=1.0)
points(swiss.x[init,i], swiss.x[init,j], pch=3, cex=1.4)
Nclust <- 3
for (Imax in 1:3) {
  set.seed(45)
  init <- sample(dim(swiss.x)[1], Nclust)
  km <- kmeans(swiss.x, centers=swiss.x[init,], iter.max=Imax)
  plot(swiss.x[,i], swiss.x[,j], pch=20)
  title(main=paste(Nclust, 'clusters with', Imax, 'iterations'), cex.main=1.0)
  points(km$centers[,i], km$centers[,j], pch=3, cex=1.8, col=c('blue', 'green', 'red'))
  sel <- which(km$cluster==1) ; points(swiss.x[sel,i], swiss.x[sel,j], pch=20, col='blue')
  sel <- which(km$cluster==2) ; points(swiss.x[sel,i], swiss.x[sel,j], pch=20, col='green')
  sel <- which(km$cluster==3) ; points(swiss.x[sel,i], swiss.x[sel,j], pch=20, col='red')
}
# Now repeat using different initial cluster centres. With seeds 100 and 45 we have quite
# different starting conditions, yet converge quickly to the same solution. The final
# cluster labels (colours) are, of course, arbitrary

K-medoids clustering

library(cluster)
Nclust <- 3
km <- pam(swiss.x, k=Nclust)
# plot the data using these two columns. This is just a projection, so the clusters may not
# be at all obvious, as is the case with i <- 2 ; j <- 5
i <- 1 ; j <- 2
#par(mfrow=c(1,1))
plot(swiss.x[,i], swiss.x[,j], pch=20)
title(main=paste(Nclust, 'clusters with k-medoids'), cex.main=0.75)
points(km$medoids[,i], km$medoids[,j], pch=3, cex=1.4, col=c('blue', 'green', 'red'))
sel <- which(km$clustering==1) ; points(swiss.x[sel,i], swiss.x[sel,j], pch=20, col='blue')
sel <- which(km$clustering==2) ; points(swiss.x[sel,i], swiss.x[sel,j], pch=20, col='green')
sel <- which(km$clustering==3) ; points(swiss.x[sel,i], swiss.x[sel,j], pch=20, col='red')
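
# The single scatterplot above is only one 2D projection of the 5-dimensional data, so a
# cluster that is compact in the full space can be smeared out here. As a minimal sketch
# (assuming the pam result km from above is still in the workspace), base R's pairs() shows
# all pairwise projections at once, colouring each province by its k-medoids cluster:
pairs(swiss.x, pch=20, col=c('blue', 'green', 'red')[km$clustering])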
Thus "height" is a non-decreasing set of values plot(h) # cuttree cuts the three at the level where we get the specified number of groups and # returns cluster number which each vector (province) lies in cutree(h, 3) # Compare two agglomeration methods h_single <- hclust(dist(swiss.x), method = "single") h_complete <- hclust(dist(swiss.x), method = "complete") par(mfrow=c(1,2), cex=0.75) plot(h_single); plot(h_complete) # Euclidean is the 2-norm. Manhattan is the 1-norm. Compare: h_2norm <- hclust(dist(swiss.x, method='euclidean'), method = "single") h_1norm <- hclust(dist(swiss.x, method='manhattan'), method = "single") par(mfrow=c(1,2), cex=0.75) plot(h_2norm); plot(h_1norm) # Divisive clustering. I specify a dissimilarity vector rather than give the data d <- diana(dist(swiss.x)) par(mfrow=c(1,2), cex=0.75) plot(d, which.plots=2) # height is the diameter of the clusters prior to splitting plot(h_1norm) # for comparison Hierarchical clustering: hybrid approach library(hybridHclust) mc <- mutualCluster(swiss.x) # hhc is in hclust format, so we can use tools in package {cluster} hhc <- hybridHclust(swiss.x, themc=mc, trace=TRUE) plot(hhc) Self-organizing maps # must pre-define grid size and topology sgrid <- somgrid(xdim=5, ydim=3, topo='hexagonal') som.swiss <- batchSOM(swiss.x, grid=sgrid, 1) plot(som.swiss) # this uses the 'stars' representation: each vector from the centre represents one of the # original data dimensions. It's length is proportional to the (mean ?) of the vectors # assigned to that node/prototype MDS library(MASS) swiss.mds <- isoMDS(dist(swiss.x)) plot(swiss.mds$points, type = "n") text(swiss.mds$points, labels = as.character(1:nrow(swiss.x)))