This code covers chapter 4 of “Introduction to Data Mining” by Pang-Ning Tan, Michael Steinbach and Vipin Kumar. See table of contents for code examples for other chapters.

CC This work is licensed under the Creative Commons Attribution 4.0 International License. For questions please contact Michael Hahsler.

Prepare Zoo Data Set

data(Zoo, package="mlbench")
head(Zoo)
##           hair feathers  eggs  milk airborne aquatic predator toothed
## aardvark  TRUE    FALSE FALSE  TRUE    FALSE   FALSE     TRUE    TRUE
## antelope  TRUE    FALSE FALSE  TRUE    FALSE   FALSE    FALSE    TRUE
## bass     FALSE    FALSE  TRUE FALSE    FALSE    TRUE     TRUE    TRUE
## bear      TRUE    FALSE FALSE  TRUE    FALSE   FALSE     TRUE    TRUE
## boar      TRUE    FALSE FALSE  TRUE    FALSE   FALSE     TRUE    TRUE
## buffalo   TRUE    FALSE FALSE  TRUE    FALSE   FALSE    FALSE    TRUE
##          backbone breathes venomous  fins legs  tail domestic catsize
## aardvark     TRUE     TRUE    FALSE FALSE    4 FALSE    FALSE    TRUE
## antelope     TRUE     TRUE    FALSE FALSE    4  TRUE    FALSE    TRUE
## bass         TRUE    FALSE    FALSE  TRUE    0  TRUE    FALSE   FALSE
## bear         TRUE     TRUE    FALSE FALSE    4 FALSE    FALSE    TRUE
## boar         TRUE     TRUE    FALSE FALSE    4  TRUE    FALSE    TRUE
## buffalo      TRUE     TRUE    FALSE FALSE    4  TRUE    FALSE    TRUE
##            type
## aardvark mammal
## antelope mammal
## bass       fish
## bear     mammal
## boar     mammal
## buffalo  mammal

Get summary statistics

summary(Zoo)
##     hair          feathers          eggs            milk        
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:58        FALSE:81        FALSE:42        FALSE:60       
##  TRUE :43        TRUE :20        TRUE :59        TRUE :41       
##  NA's :0         NA's :0         NA's :0         NA's :0        
##                                                                 
##                                                                 
##                                                                 
##   airborne        aquatic         predator        toothed       
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:77        FALSE:65        FALSE:45        FALSE:40       
##  TRUE :24        TRUE :36        TRUE :56        TRUE :61       
##  NA's :0         NA's :0         NA's :0         NA's :0        
##                                                                 
##                                                                 
##                                                                 
##   backbone        breathes        venomous          fins        
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:18        FALSE:21        FALSE:93        FALSE:84       
##  TRUE :83        TRUE :80        TRUE :8         TRUE :17       
##  NA's :0         NA's :0         NA's :0         NA's :0        
##                                                                 
##                                                                 
##                                                                 
##       legs          tail          domestic        catsize       
##  Min.   :0.000   Mode :logical   Mode :logical   Mode :logical  
##  1st Qu.:2.000   FALSE:26        FALSE:88        FALSE:57       
##  Median :4.000   TRUE :75        TRUE :13        TRUE :44       
##  Mean   :2.842   NA's :0         NA's :0         NA's :0        
##  3rd Qu.:4.000                                                  
##  Max.   :8.000                                                  
##                                                                 
##             type   
##  mammal       :41  
##  bird         :20  
##  reptile      : 5  
##  fish         :13  
##  amphibian    : 4  
##  insect       : 8  
##  mollusc.et.al:10

Translate all the TRUE/FALSE values into factors (nominal) for the tree algorithm.

# Convert the logical columns into factors (nominal attributes).
# Columns 13 ("legs", numeric) and 17 ("type", already a factor) are skipped.
logical_cols <- c(1:12, 14:16)
Zoo[logical_cols] <- lapply(Zoo[logical_cols], as.factor)

summary(Zoo)
##     hair     feathers     eggs       milk     airborne   aquatic  
##  FALSE:58   FALSE:81   FALSE:42   FALSE:60   FALSE:77   FALSE:65  
##  TRUE :43   TRUE :20   TRUE :59   TRUE :41   TRUE :24   TRUE :36  
##                                                                   
##                                                                   
##                                                                   
##                                                                   
##                                                                   
##   predator   toothed    backbone   breathes   venomous     fins   
##  FALSE:45   FALSE:40   FALSE:18   FALSE:21   FALSE:93   FALSE:84  
##  TRUE :56   TRUE :61   TRUE :83   TRUE :80   TRUE : 8   TRUE :17  
##                                                                   
##                                                                   
##                                                                   
##                                                                   
##                                                                   
##       legs          tail     domestic   catsize              type   
##  Min.   :0.000   FALSE:26   FALSE:88   FALSE:57   mammal       :41  
##  1st Qu.:2.000   TRUE :75   TRUE :13   TRUE :44   bird         :20  
##  Median :4.000                                    reptile      : 5  
##  Mean   :2.842                                    fish         :13  
##  3rd Qu.:4.000                                    amphibian    : 4  
##  Max.   :8.000                                    insect       : 8  
##                                                   mollusc.et.al:10

A First Decision Tree

Recursive Partitioning (similar to CART) uses the Gini index to make splitting decisions and early stopping (pre-pruning).

library(rpart)

Create Tree With Default Settings (uses pre-pruning)

tree1 <- rpart(type ~ ., data=Zoo)
tree1
## n= 101 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 101 60 mammal (0.41 0.2 0.05 0.13 0.04 0.079 0.099)  
##    2) milk=TRUE 41  0 mammal (1 0 0 0 0 0 0) *
##    3) milk=FALSE 60 40 bird (0 0.33 0.083 0.22 0.067 0.13 0.17)  
##      6) feathers=TRUE 20  0 bird (0 1 0 0 0 0 0) *
##      7) feathers=FALSE 40 27 fish (0 0 0.12 0.33 0.1 0.2 0.25)  
##       14) fins=TRUE 13  0 fish (0 0 0 1 0 0 0) *
##       15) fins=FALSE 27 17 mollusc.et.al (0 0 0.19 0 0.15 0.3 0.37)  
##         30) backbone=TRUE 9  4 reptile (0 0 0.56 0 0.44 0 0) *
##         31) backbone=FALSE 18  8 mollusc.et.al (0 0 0 0 0 0.44 0.56) *

Note: the class variable needs to be a factor (nominal) or rpart will create a regression tree instead of a decision tree. Use as.factor() if necessary.

Plotting

library(rpart.plot)
rpart.plot(tree1, extra = 2, under = TRUE, varlen=0, faclen=0)

Note: extra=2 prints for each leaf node the number of correctly classified objects from data and the total number of objects from the training data falling into that node (correct/total).

Create a Full Tree

To create a full tree, we set the complexity parameter cp to 0 (split even if it does not improve the tree) and we set the minimum number of observations in a node needed to split to the smallest value of 2 (see: ?rpart.control). Note: full trees overfit the training data!

tree2 <- rpart(type ~., data=Zoo, control=rpart.control(minsplit=2, cp=0))
rpart.plot(tree2, extra = 2, under = TRUE,  varlen=0, faclen=0)
## Warning: All boxes will be white (the box.palette argument will be ignored) because
## the number of classes predicted by the model 7 is greater than length(box.palette) 6.
## To make this warning go away use box.palette=0 or trace=-1.

tree2
## n= 101 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##   1) root 101 60 mammal (0.41 0.2 0.05 0.13 0.04 0.079 0.099)  
##     2) milk=TRUE 41  0 mammal (1 0 0 0 0 0 0) *
##     3) milk=FALSE 60 40 bird (0 0.33 0.083 0.22 0.067 0.13 0.17)  
##       6) feathers=TRUE 20  0 bird (0 1 0 0 0 0 0) *
##       7) feathers=FALSE 40 27 fish (0 0 0.12 0.33 0.1 0.2 0.25)  
##        14) fins=TRUE 13  0 fish (0 0 0 1 0 0 0) *
##        15) fins=FALSE 27 17 mollusc.et.al (0 0 0.19 0 0.15 0.3 0.37)  
##          30) backbone=TRUE 9  4 reptile (0 0 0.56 0 0.44 0 0)  
##            60) aquatic=FALSE 4  0 reptile (0 0 1 0 0 0 0) *
##            61) aquatic=TRUE 5  1 amphibian (0 0 0.2 0 0.8 0 0)  
##             122) eggs=FALSE 1  0 reptile (0 0 1 0 0 0 0) *
##             123) eggs=TRUE 4  0 amphibian (0 0 0 0 1 0 0) *
##          31) backbone=FALSE 18  8 mollusc.et.al (0 0 0 0 0 0.44 0.56)  
##            62) airborne=TRUE 6  0 insect (0 0 0 0 0 1 0) *
##            63) airborne=FALSE 12  2 mollusc.et.al (0 0 0 0 0 0.17 0.83)  
##             126) predator=FALSE 4  2 insect (0 0 0 0 0 0.5 0.5)  
##               252) legs>=3 2  0 insect (0 0 0 0 0 1 0) *
##               253) legs< 3 2  0 mollusc.et.al (0 0 0 0 0 0 1) *
##             127) predator=TRUE 8  0 mollusc.et.al (0 0 0 0 0 0 1) *

Training error on tree with pre-pruning

head(predict(tree1, Zoo))
##          mammal bird reptile fish amphibian insect mollusc.et.al
## aardvark      1    0       0    0         0      0             0
## antelope      1    0       0    0         0      0             0
## bass          0    0       0    1         0      0             0
## bear          1    0       0    0         0      0             0
## boar          1    0       0    0         0      0             0
## buffalo       1    0       0    0         0      0             0
pred <- predict(tree1, Zoo, type="class")
head(pred)
## aardvark antelope     bass     bear     boar  buffalo 
##   mammal   mammal     fish   mammal   mammal   mammal 
## Levels: mammal bird reptile fish amphibian insect mollusc.et.al
confusion_table <- table(Zoo$type, pred)
confusion_table
##                pred
##                 mammal bird reptile fish amphibian insect mollusc.et.al
##   mammal            41    0       0    0         0      0             0
##   bird               0   20       0    0         0      0             0
##   reptile            0    0       5    0         0      0             0
##   fish               0    0       0   13         0      0             0
##   amphibian          0    0       4    0         0      0             0
##   insect             0    0       0    0         0      0             8
##   mollusc.et.al      0    0       0    0         0      0            10
correct <- sum(diag(confusion_table))
correct
## [1] 89
error <- sum(confusion_table)-correct
error
## [1] 12
accuracy <- correct / (correct+error)
accuracy
## [1] 0.8811881

Use a function for accuracy

# Compute classification accuracy: the proportion of predictions that
# agree with the ground truth.
#
# truth      : vector of true class labels.
# prediction : vector of predicted class labels (same length as truth).
# Returns a single number in [0, 1].
accuracy <- function(truth, prediction) {
    confusion <- table(truth, prediction)
    correct <- sum(diag(confusion))
    correct / sum(confusion)
}

accuracy(Zoo$type, pred)
## [1] 0.8811881

Training error of the full tree

accuracy(Zoo$type, predict(tree2, Zoo, type="class"))
## [1] 1

Get a confusion table with more statistics (using caret)

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
confusionMatrix(data = pred, reference = Zoo$type)
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      mammal bird reptile fish amphibian insect mollusc.et.al
##   mammal            41    0       0    0         0      0             0
##   bird               0   20       0    0         0      0             0
##   reptile            0    0       5    0         4      0             0
##   fish               0    0       0   13         0      0             0
##   amphibian          0    0       0    0         0      0             0
##   insect             0    0       0    0         0      0             0
##   mollusc.et.al      0    0       0    0         0      8            10
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8812          
##                  95% CI : (0.8017, 0.9371)
##     No Information Rate : 0.4059          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8431          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: mammal Class: bird Class: reptile Class: fish
## Sensitivity                 1.0000       1.000        1.00000      1.0000
## Specificity                 1.0000       1.000        0.95833      1.0000
## Pos Pred Value              1.0000       1.000        0.55556      1.0000
## Neg Pred Value              1.0000       1.000        1.00000      1.0000
## Prevalence                  0.4059       0.198        0.04950      0.1287
## Detection Rate              0.4059       0.198        0.04950      0.1287
## Detection Prevalence        0.4059       0.198        0.08911      0.1287
## Balanced Accuracy           1.0000       1.000        0.97917      1.0000
##                      Class: amphibian Class: insect Class: mollusc.et.al
## Sensitivity                    0.0000       0.00000              1.00000
## Specificity                    1.0000       1.00000              0.91209
## Pos Pred Value                    NaN           NaN              0.55556
## Neg Pred Value                 0.9604       0.92079              1.00000
## Prevalence                     0.0396       0.07921              0.09901
## Detection Rate                 0.0000       0.00000              0.09901
## Detection Prevalence           0.0000       0.00000              0.17822
## Balanced Accuracy              0.5000       0.50000              0.95604

Model Evaluation

Pure R Implementation

Holdout Sample

Use a simple split into 2/3 training and 1/3 testing data. Find the size of the training set.

n_train <- as.integer(nrow(Zoo)*.66)
n_train
## [1] 66

Randomly choose the rows of the training examples.

# Draw n_train row indices without replacement. sample(n, k) samples
# from 1:n directly; the 1:nrow(x) idiom is avoided because it expands
# to c(1, 0) when the data has zero rows. Call set.seed() beforehand
# for a reproducible split.
train_id <- sample(nrow(Zoo), n_train)
head(train_id)
## [1] 99 70 58 64 52 51

Split the data

train <- Zoo[train_id,]
test <- Zoo[-train_id, colnames(Zoo) != "type"]
test_type <- Zoo[-train_id, "type"]

tree1 <- rpart(type ~., data=train,control=rpart.control(minsplit=2))

Training error

accuracy(train$type, predict(tree1, train, type="class"))
## [1] 1

Generalization error

accuracy(test_type, predict(tree1, test, type="class"))
## [1] 0.9428571

10-Fold Cross Validation

# Shuffle the row indices, then assign each row to one of 10 folds.
index <- sample(seq_len(nrow(Zoo)))

# Bug fix: rep(1:10, each = nrow(Zoo)/10) truncates `each` to 10 and
# produces only 100 labels for the 101 rows; subscripting to length 101
# padded the vector with an NA, and split() then silently dropped that
# observation from every fold. rep_len() recycles the fold labels so
# every row is assigned to exactly one fold.
fold <- rep_len(1:10, nrow(Zoo))

folds <- split(index, fold) ### create list with indices for each fold

Do each fold

# For each fold: train a tree on all other folds, then measure accuracy
# on the held-out fold. vapply() preallocates the result and guarantees
# a numeric vector (the original grew accs element by element), and
# seq_along() is safe even for an empty fold list, where
# 1:length(folds) would yield c(1, 0).
accs <- vapply(seq_along(folds), function(i) {
    test_rows <- folds[[i]]
    tree <- rpart(type ~ ., data = Zoo[-test_rows, ],
        control = rpart.control(minsplit = 2))
    accuracy(Zoo[test_rows, ]$type,
        predict(tree, Zoo[test_rows, ], type = "class"))
}, numeric(1))
accs
##  [1] 0.7 0.9 1.0 0.9 0.9 1.0 1.0 1.0 0.9 1.0

Report the average

mean(accs)
## [1] 0.93

Caret: For Easier Model Building and Evaluation

see http://cran.r-project.org/web/packages/caret/vignettes/caret.pdf

Enable multi-core

library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
registerDoParallel()

k-fold Cross Validation

caret packages training and testing into a single function called train(). It internally splits the data into training and testing sets and thus will provide you with generalization error estimates. trainControl is used to choose how testing is performed.

Train also tries to tune extra parameters by trying different values. For rpart, train tries to tune the cp parameter (tree complexity) using accuracy to choose the best model. I set minsplit to 2 since we do not have much data. Note: Parameters used for tuning (in this case cp) need to be set using a data.frame in the argument tuneGrid! Setting it in control will be ignored.

library(caret)
fit <- train(type ~ ., data = Zoo , method = "rpart",
    control=rpart.control(minsplit=2),
    trControl = trainControl(method = "cv", number = 10),
    tuneLength=5)
fit
## CART 
## 
## 101 samples
##  16 predictors
##   7 classes: 'mammal', 'bird', 'reptile', 'fish', 'amphibian', 'insect', 'mollusc.et.al' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 90, 90, 92, 90, 92, 89, ... 
## Resampling results across tuning parameters:
## 
##   cp          Accuracy   Kappa    
##   0.00000000  0.9475758  0.9292659
##   0.08333333  0.8580303  0.8135818
##   0.16666667  0.7633838  0.6822969
##   0.21666667  0.6820202  0.5727742
##   0.33333333  0.4623232  0.1148352
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was cp = 0.

Note: Train has built 10 trees. Accuracy and kappa for each tree/test fold can be obtained.

fit$resample
##     Accuracy     Kappa Resample
## 1  0.8888889 0.8474576   Fold09
## 2  0.9090909 0.8854167   Fold04
## 3  1.0000000 1.0000000   Fold06
## 4  1.0000000 1.0000000   Fold01
## 5  1.0000000 1.0000000   Fold02
## 6  0.8888889 0.8448276   Fold10
## 7  0.8888889 0.8448276   Fold05
## 8  1.0000000 1.0000000   Fold07
## 9  0.9000000 0.8701299   Fold08
## 10 1.0000000 1.0000000   Fold03

A model using the best tuning parameters and using all the data is available as fit$finalModel.

rpart.plot(fit$finalModel, extra = 2, under = TRUE,  varlen=0, faclen=0)
## Warning: All boxes will be white (the box.palette argument will be ignored) because
## the number of classes predicted by the model 7 is greater than length(box.palette) 6.
## To make this warning go away use box.palette=0 or trace=-1.

Note: For many models, caret converts factors into dummy coding, i.e., a single 0-1 variable for each factor level. This is why you see split nodes like milkTRUE>=0.5.

caret also computes variable importance. By default it uses competing splits (splits which would be runners up, but do not get chosen by the tree) for rpart models (see ? varImp). Toothed comes out to be the runner-up a lot, but never gets chosen!

varImp(fit)
## rpart variable importance
## 
##               Overall
## toothedTRUE  100.0000
## feathersTRUE  84.2553
## milkTRUE      66.4661
## eggsTRUE      64.1041
## breathesTRUE  63.6169
## backboneTRUE  58.6556
## finsTRUE      54.3868
## hairTRUE      53.9568
## legs          31.0115
## tailTRUE      25.8914
## airborneTRUE  24.4090
## aquaticTRUE   15.8718
## predatorTRUE  11.9787
## venomousTRUE   1.6471
## catsizeTRUE    0.9982
## domesticTRUE   0.0000

Here is the variable importance without competing splits.

varImp(fit, compete = FALSE)
## rpart variable importance
## 
##              Overall
## milkTRUE     100.000
## feathersTRUE  55.587
## finsTRUE      37.991
## backboneTRUE  20.525
## airborneTRUE  18.773
## aquaticTRUE    9.612
## legs           6.758
## eggsTRUE       5.407
## predatorTRUE   4.506
## domesticTRUE   0.000
## tailTRUE       0.000
## hairTRUE       0.000
## breathesTRUE   0.000
## venomousTRUE   0.000
## toothedTRUE    0.000
## catsizeTRUE    0.000
dotPlot(varImp(fit, compete=FALSE))

Note: Not all models provide a variable importance function. In this case caret might calculate varImp by itself and ignore the model (see ? varImp)!

Repeated Bootstrap Sampling

An alternative to CV is repeated bootstrap sampling. It will give you very similar estimates.

fit <- train(type ~ ., data = Zoo, method = "rpart",
    control=rpart.control(minsplit=2),
    trControl = trainControl(method = "boot", number = 10),
    tuneLength=5)
fit
## CART 
## 
## 101 samples
##  16 predictors
##   7 classes: 'mammal', 'bird', 'reptile', 'fish', 'amphibian', 'insect', 'mollusc.et.al' 
## 
## No pre-processing
## Resampling: Bootstrapped (10 reps) 
## Summary of sample sizes: 101, 101, 101, 101, 101, 101, ... 
## Resampling results across tuning parameters:
## 
##   cp          Accuracy   Kappa    
##   0.00000000  0.9324564  0.9129097
##   0.08333333  0.8505674  0.8072392
##   0.16666667  0.7126258  0.6234359
##   0.21666667  0.6179493  0.4956660
##   0.33333333  0.5201291  0.2946243
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was cp = 0.

Holdout Sample

Partition data 66%/34%. Note: CV and repeated bootstrap sampling is typically preferred.

inTrain <- createDataPartition(y=Zoo$type, p = .66, list=FALSE)
training <- Zoo[ inTrain,]
testing <- Zoo[-inTrain,]

Find best model (trying more values for tuning using tuneLength).

fit <- train(type ~ ., data = training, method = "rpart",
    control=rpart.control(minsplit=2),
    trControl = trainControl(method = "cv", number = 10),
    tuneLength=20)
fit
## CART 
## 
## 71 samples
## 16 predictors
##  7 classes: 'mammal', 'bird', 'reptile', 'fish', 'amphibian', 'insect', 'mollusc.et.al' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 63, 65, 64, 63, 65, 62, ... 
## Resampling results across tuning parameters:
## 
##   cp          Accuracy   Kappa    
##   0.00000000  0.9472222  0.9342531
##   0.01713586  0.9472222  0.9342531
##   0.03427173  0.9347222  0.9185668
##   0.05140759  0.9347222  0.9185668
##   0.06854345  0.9347222  0.9185668
##   0.08567931  0.9093254  0.8849325
##   0.10281518  0.9093254  0.8849325
##   0.11995104  0.9093254  0.8843285
##   0.13708690  0.8303571  0.7819399
##   0.15422277  0.7827381  0.7160669
##   0.17135863  0.7341270  0.6543792
##   0.18849449  0.7341270  0.6517442
##   0.20563035  0.6579365  0.5432650
##   0.22276622  0.6093254  0.4731158
##   0.23990208  0.6093254  0.4731158
##   0.25703794  0.6093254  0.4731158
##   0.27417381  0.6093254  0.4731158
##   0.29130967  0.6093254  0.4731158
##   0.30844553  0.5407540  0.3168658
##   0.32558140  0.4740873  0.1868658
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was cp = 0.01713586.
plot(fit)