This code covers Chapter 5 of “Introduction to Data Mining” by Pang-Ning Tan, Michael Steinbach and Vipin Kumar. See the table of contents for code examples for other chapters.

This work is licensed under the Creative Commons Attribution 4.0 International License. For questions please contact Michael Hahsler.

Show fewer digits

options(digits=3)

Install keras and tensorflow

# one-time setup: install the R package and the TensorFlow/Keras backend
install.packages("keras")
library(keras)
install_tensorflow()
install_keras()

library(keras)

Load and prepare the data set

data(Zoo, package="mlbench")
head(Zoo)
##           hair feathers  eggs  milk airborne aquatic predator toothed
## aardvark  TRUE    FALSE FALSE  TRUE    FALSE   FALSE     TRUE    TRUE
## antelope  TRUE    FALSE FALSE  TRUE    FALSE   FALSE    FALSE    TRUE
## bass     FALSE    FALSE  TRUE FALSE    FALSE    TRUE     TRUE    TRUE
## bear      TRUE    FALSE FALSE  TRUE    FALSE   FALSE     TRUE    TRUE
## boar      TRUE    FALSE FALSE  TRUE    FALSE   FALSE     TRUE    TRUE
## buffalo   TRUE    FALSE FALSE  TRUE    FALSE   FALSE    FALSE    TRUE
##          backbone breathes venomous  fins legs  tail domestic catsize
## aardvark     TRUE     TRUE    FALSE FALSE    4 FALSE    FALSE    TRUE
## antelope     TRUE     TRUE    FALSE FALSE    4  TRUE    FALSE    TRUE
## bass         TRUE    FALSE    FALSE  TRUE    0  TRUE    FALSE   FALSE
## bear         TRUE     TRUE    FALSE FALSE    4 FALSE    FALSE    TRUE
## boar         TRUE     TRUE    FALSE FALSE    4  TRUE    FALSE    TRUE
## buffalo      TRUE     TRUE    FALSE FALSE    4  TRUE    FALSE    TRUE
##            type
## aardvark mammal
## antelope mammal
## bass       fish
## bear     mammal
## boar     mammal
## buffalo  mammal
Zoo_predictors <- Zoo[,-ncol(Zoo)]
Zoo_class <- Zoo[, ncol(Zoo)]

Create a matrix and normalize the data using keras' normalize() function. If you have nominal variables (factors), you need to use keras' to_categorical() function to create a one-hot encoding (see the sketch after the output below).

Zoo_predictors <- normalize(as.matrix(Zoo_predictors))
head(Zoo_predictors)
##       [,1] [,2]  [,3]  [,4] [,5]  [,6]  [,7]  [,8]  [,9] [,10] [,11] [,12]
## [1,] 0.209    0 0.000 0.209    0 0.000 0.209 0.209 0.209 0.209     0 0.000
## [2,] 0.209    0 0.000 0.209    0 0.000 0.000 0.209 0.209 0.209     0 0.000
## [3,] 0.000    0 0.378 0.000    0 0.378 0.378 0.378 0.378 0.000     0 0.378
## [4,] 0.209    0 0.000 0.209    0 0.000 0.209 0.209 0.209 0.209     0 0.000
## [5,] 0.204    0 0.000 0.204    0 0.000 0.204 0.204 0.204 0.204     0 0.000
## [6,] 0.209    0 0.000 0.209    0 0.000 0.000 0.209 0.209 0.209     0 0.000
##      [,13] [,14] [,15] [,16]
## [1,] 0.834 0.000     0 0.209
## [2,] 0.834 0.209     0 0.209
## [3,] 0.000 0.378     0 0.000
## [4,] 0.834 0.000     0 0.209
## [5,] 0.816 0.204     0 0.204
## [6,] 0.834 0.209     0 0.209
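
The Zoo predictors are all logical or numeric columns, so normalize() is all that is needed here. As a minimal sketch, a nominal predictor would be one-hot encoded with to_categorical() like this (the color factor below is made up and not part of the Zoo data):

# Sketch only: 'color' is a made-up factor, not part of the Zoo data.
color <- factor(c("red", "green", "blue", "green"))
# Subtract 1 so the integer codes start at 0, giving exactly one column per level.
color_onehot <- to_categorical(as.integer(color) - 1L, num_classes = nlevels(color))
# The resulting matrix could then be cbind()ed to the other predictors.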

One-hot encode the class variable. Note that as.integer() codes the factor levels starting at 1, so to_categorical() adds an unused column for index 0 and the result has 8 columns for the 7 classes.

Zoo_class <- to_categorical(as.integer(Zoo_class))
head(Zoo_class)
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,]    0    1    0    0    0    0    0    0
## [2,]    0    1    0    0    0    0    0    0
## [3,]    0    0    0    0    1    0    0    0
## [4,]    0    1    0    0    0    0    0    0
## [5,]    0    1    0    0    0    0    0    0
## [6,]    0    1    0    0    0    0    0    0

Construct the model structure

model <- keras_model_sequential()
model
## Model
## ___________________________________________________________________________
## Layer (type)                     Output Shape                  Param #     
## ===========================================================================
## Total params: 0
## Trainable params: 0
## Non-trainable params: 0
## ___________________________________________________________________________
model %>%
  layer_dense(units = 8, activation = 'relu', input_shape = c(ncol(Zoo_predictors))) %>%
  layer_dense(units = ncol(Zoo_class), activation = 'softmax')
model
## Model
## ___________________________________________________________________________
## Layer (type)                     Output Shape                  Param #     
## ===========================================================================
## dense_1 (Dense)                  (None, 8)                     136         
## ___________________________________________________________________________
## dense_2 (Dense)                  (None, 8)                     72          
## ===========================================================================
## Total params: 208
## Trainable params: 208
## Non-trainable params: 0
## ___________________________________________________________________________

See ?layer_dense to learn more about creating the model structure.
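
As a minimal sketch, a deeper variant could add more layers, for example a dropout layer for regularization using layer_dropout(). The unit count and dropout rate below are arbitrary illustration values and this model is not used in the rest of this example.

# Sketch only: a deeper network with dropout regularization (not used below).
deeper_model <- keras_model_sequential() %>%
  layer_dense(units = 16, activation = 'relu', input_shape = c(ncol(Zoo_predictors))) %>%
  layer_dropout(rate = 0.2) %>%
  layer_dense(units = ncol(Zoo_class), activation = 'softmax')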

Compile the model

model %>% compile(
  loss = 'categorical_crossentropy',
  optimizer = 'adam',
  metrics = 'accuracy'
)

Fit the model

Hold out about 20% of the data for validation and train on the rest.

train <- sample(c(TRUE, FALSE), size = nrow(Zoo), prob = c(0.8, 0.2), replace = TRUE)

history <- model %>% fit(
  Zoo_predictors[train,],
  Zoo_class[train,],
  validation_data = list(Zoo_predictors[!train,], Zoo_class[!train,]),
  epochs = 200,
  batch_size = 5
)

history
## Trained on 72 samples, validated on 29 samples (batch_size=5, epochs=200)
## Final epoch (plot to see history):
##      acc: 0.9583
##     loss: 0.1755
##  val_acc: 0.8621
## val_loss: 0.3377
plot(history)

val_acc is the accuracy on the test (validation) set.
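
The same accuracy can also be computed directly on the holdout data with keras' evaluate() function. A minimal sketch (not run here):

# Sketch only: evaluate loss and accuracy on the holdout (validation) set.
model %>% evaluate(Zoo_predictors[!train, ], Zoo_class[!train, ])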

Make predictions on the test set

# predict_classes() returns the 0-based index of the largest output unit. Since
# column 0 of the one-hot encoding above is unused, these indices match the
# factor levels 1-7 of Zoo$type.
classes <- model %>% predict_classes(Zoo_predictors[!train,], batch_size = 128)

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
confusionMatrix(data = factor(classes, levels = 1:length(levels(Zoo$type)), labels = levels(Zoo$type)),
  reference = Zoo$type[!train])
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      mammal bird reptile fish amphibian insect mollusc.et.al
##   mammal            13    0       0    0         0      0             0
##   bird               0    6       0    0         0      0             0
##   reptile            0    0       0    0         0      0             0
##   fish               0    0       1    3         0      0             0
##   amphibian          0    0       0    0         0      0             0
##   insect             0    0       0    0         0      2             0
##   mollusc.et.al      0    0       0    0         3      0             1
## 
## Overall Statistics
##                                         
##                Accuracy : 0.862         
##                  95% CI : (0.683, 0.961)
##     No Information Rate : 0.448         
##     P-Value [Acc > NIR] : 4.86e-06      
##                                         
##                   Kappa : 0.812         
##  Mcnemar's Test P-Value : NA            
## 
## Statistics by Class:
## 
##                      Class: mammal Class: bird Class: reptile Class: fish
## Sensitivity                  1.000       1.000         0.0000       1.000
## Specificity                  1.000       1.000         1.0000       0.962
## Pos Pred Value               1.000       1.000            NaN       0.750
## Neg Pred Value               1.000       1.000         0.9655       1.000
## Prevalence                   0.448       0.207         0.0345       0.103
## Detection Rate               0.448       0.207         0.0000       0.103
## Detection Prevalence         0.448       0.207         0.0000       0.138
## Balanced Accuracy            1.000       1.000         0.5000       0.981
##                      Class: amphibian Class: insect Class: mollusc.et.al
## Sensitivity                     0.000         1.000               1.0000
## Specificity                     1.000         1.000               0.8929
## Pos Pred Value                    NaN         1.000               0.2500
## Neg Pred Value                  0.897         1.000               1.0000
## Prevalence                      0.103         0.069               0.0345
## Detection Rate                  0.000         0.069               0.0345
## Detection Prevalence            0.000         0.069               0.1379
## Balanced Accuracy               0.500         1.000               0.9464