This code covers chapter 8 of “Introduction to Data Mining” by Pang-Ning Tan, Michael Steinbach and Vipin Kumar. See table of contents for code examples for other chapters.

CC This work is licensed under the Creative Commons Attribution 4.0 International License. For questions please contact Michael Hahsler.

The ruspini data set is in package cluster. It is a very simple data set with well separated clusters.

# Load the ruspini data set from the cluster package into the workspace.
data("ruspini", package = "cluster")

Shuffle rows

# Shuffle the rows by indexing with a random permutation of the row numbers.
# seq_len() is used instead of 1:nrow() so that an empty data set produces an
# empty index rather than c(1, 0).
ruspini <- ruspini[sample(seq_len(nrow(ruspini))), ]
plot(ruspini)

Scale each column in the data to zero mean and unit standard deviation (z-scores). This prevents one attribute with a large range from dominating the others in the distance calculation.

# Standardise every column to z-scores. Centering and scaling are the
# defaults of scale(); they are spelled out here for clarity.
ruspini_scaled <- scale(ruspini, center = TRUE, scale = TRUE)
plot(ruspini_scaled)

Clustering methods

k-means Clustering

Assumes Euclidean distances. We use k=4 clusters and run the algorithm 10 times with randomly initialized centroids. The best result is returned.

# Fit k-means with 4 centers on the scaled data; nstart = 10 requests
# 10 random starts. Then print the fitted object.
km <- kmeans(x = ruspini_scaled, centers = 4, nstart = 10)
print(km)
## K-means clustering with 4 clusters of sizes 15, 23, 20, 17
## 
## Cluster means:
##            x          y
## 1  0.4607268 -1.4912271
## 2 -0.3595425  1.1091151
## 3 -1.1385941 -0.5559591
## 4  1.4194387  0.4692907
## 
## Clustering vector:
## 63  2 34 10 60 71 36 12 75 55 26 65  6 51 42 46 74 19 30 20 15 67 32 70 66 
##  1  3  2  3  4  1  2  3  1  4  2  1  3  4  2  4  1  3  2  3  3  1  2  1  1 
## 68 47 13 57 33 62 44 27  1  3 43 54 35 16 69 11 21  9 14 37 41 17 28 72  5 
##  1  4  3  4  2  1  4  2  3  3  2  4  2  3  1  3  2  3  3  2  2  3  2  1  3 
## 52 61 22 73 58 25 53 56 48 23 31 38  4 59  8 64 18  7 50 45 24 29 39 40 49 
##  4  1  2  1  4  2  4  4  4  2  2  2  3  4  3  1  3  3  4  4  2  2  2  2  4 
## 
## Within cluster sum of squares by cluster:
## [1] 1.082373 2.658679 2.705477 3.641276
##  (between_SS / total_SS =  93.2 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
# Scatter plot of the scaled data, coloured by cluster assignment.
plot(ruspini_scaled, col = km$cluster)
# Add the centroids as large crosses.
points(km$centers, pch = 3, cex = 2)
# Add the cluster IDs next to the centroids. The IDs are derived from the
# number of centroids rather than hard-coded, so the labels stay correct if
# the number of clusters used for `km` is changed.
text(km$centers, labels = seq_len(nrow(km$centers)), pos = 2)

Alternative plot from package cluster (uses principal components analysis for >2 dimensions)

library(cluster)
## 
## Attaching package: 'cluster'
## The following object is masked _by_ '.GlobalEnv':
## 
##     ruspini
# Draw the cluster plot for the k-means assignment of the scaled data.
clusplot(x = ruspini_scaled, clus = km$cluster)

Inspect the centroids (cluster profiles)

# Print the centroid matrix (one row per cluster, one column per attribute).
print(km[["centers"]])
##            x          y
## 1  0.4607268 -1.4912271
## 2 -0.3595425  1.1091151
## 3 -1.1385941 -0.5559591
## 4  1.4194387  0.4692907
# Save the current graphics parameters so they can be restored afterwards.
def.par <- par(no.readonly = TRUE)
# Lay out one panel per centroid in a single row. The panel count is taken
# from km$centers instead of a hard-coded 4, so this keeps working when the
# number of clusters changes.
layout(t(seq_len(nrow(km$centers))))
# One bar plot per cluster showing that centroid's value on each attribute;
# the fixed y-range makes the panels directly comparable.
for (i in seq_len(nrow(km$centers))) {
  barplot(km$centers[i, ], ylim = c(-2, 2), main = paste("Cluster", i))
}

# Restore the saved graphics parameters.
par(def.par)

Find data for a single cluster

All you need is to select the rows corresponding to the cluster. The next example plots all data points of cluster 1

# Select the rows assigned to cluster 1. drop = FALSE keeps the result a
# matrix even when the cluster has a single member; without it the subset
# would collapse to a plain vector and no longer have x/y columns.
cluster1 <- ruspini_scaled[km$cluster == 1, , drop = FALSE]
head(cluster1)
##            x         y
## 63 0.9218907 -1.458375
## 71 0.3645599 -1.417309
## 75 0.2989916 -1.273580
## 65 0.4629124 -1.581571
## 74 0.5612649 -1.253047
## 67 0.3645599 -1.519973
# Plot only the points of cluster 1, with both axes fixed to [-2, 2].
plot(x = cluster1, ylim = c(-2, 2), xlim = c(-2, 2))

Try 10 clusters

# Cluster into 10 groups and colour the points by cluster. The default
# palette() has only 8 colours, so using the integer cluster IDs directly
# would give clusters 9 and 10 the same colours as clusters 1 and 2. Index
# into a 10-colour palette instead so every cluster gets its own colour.
plot(
  ruspini_scaled,
  col = rainbow(10)[kmeans(ruspini_scaled, centers = 10)$cluster]
)