The plyr package

Nick Kennedy
Clinical Research Fellow
GI Unit, IGMM
University of Edinburgh

16/09/2015

Loops in R

lapply

set.seed(123)
my_list <- list(a = rnorm(100), b = rnorm(50), c = runif(20))
lapply(my_list, mean)
## $a
## [1] 0.09040591
## 
## $b
## [1] -0.2539004
## 
## $c
## [1] 0.4793934

Split-Apply-Combine

Image copyright Hadley Wickham

The plyr package

  l a d _
l llply laply ldply l_ply
a alply aaply adply a_ply
d dlply daply ddply d_ply
r rlply raply rdply r_ply
m mlply maply mdply m_ply

Meaning of the first two characters of the -ply functions

List input examples: llply

llply(my_list, mean)
## $a
## [1] 0.09040591
## 
## $b
## [1] -0.2539004
## 
## $c
## [1] 0.4793934

List input examples: laply

# laply() applies the function to each list element and combines the results
# into a vector/array instead of a list, so every result must have the same
# dimensions.
laply(my_list, mean)
## [1]  0.09040591 -0.25390043  0.47939338
# laply(my_list, I)
# Will return an error: the elements have lengths 100, 50 and 20, so the
# results cannot be combined into a single array.

List input examples: ldply

ldply(my_list, mean)
##   .id          V1
## 1   a  0.09040591
## 2   b -0.25390043
## 3   c  0.47939338
ldply(my_list, function(x) data.frame(mean = mean(x), sd = sd(x)))
##   .id        mean        sd
## 1   a  0.09040591 0.9128159
## 2   b -0.25390043 0.9893339
## 3   c  0.47939338 0.2830888

List input examples: l_ply

layout(1:3)
l_ply(my_list, hist)

layout(1)

Array input examples: aaply

x <- array(1:24, c(2, 3, 4))
all(aaply(x, 2, .fun = identity) == aperm(x, c(2, 1, 3)))
## [1] TRUE

Array input examples: aaply

set.seed(913)
x <- matrix(rnorm(100), 10, 10)
aaply(x, 1, sd)
##         1         2         3         4         5         6         7 
## 0.9194631 0.8210030 0.5969373 0.8796405 0.9995688 1.2863265 0.8085736 
##         8         9        10 
## 0.8429865 0.9946271 1.1209652

Using aaply on a data.frame

# Add the Sepal.Length and Petal.Length components of `r` element-wise.
my_fun <- function(r) {
  sepal <- r$Sepal.Length
  petal <- r$Petal.Length
  sepal + petal
}
iris_data <- iris[, c("Sepal.Length", "Petal.Length")]
aaply(iris_data, 1, my_fun)[1:10, 1:10]
##             Petal.Length
## Sepal.Length   1 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.9   3
##          4.3  NA 5.4  NA  NA  NA  NA  NA  NA  NA  NA
##          4.4  NA  NA  NA 5.7 5.8  NA  NA  NA  NA  NA
##          4.5  NA  NA  NA 5.8  NA  NA  NA  NA  NA  NA
##          4.6 5.6  NA  NA  NA 6.0 6.1  NA  NA  NA  NA
##          4.7  NA  NA  NA 6.0  NA  NA 6.3  NA  NA  NA
##          4.8  NA  NA  NA  NA 6.2  NA 6.4  NA 6.7  NA
##          4.9  NA  NA  NA  NA 6.3 6.4  NA  NA  NA  NA
##          5    NA  NA 6.2 6.3 6.4 6.5 6.6  NA  NA  NA
##          5.1  NA  NA  NA  NA 6.5 6.6 6.7 6.8 7.0 8.1
##          5.2  NA  NA  NA  NA 6.6 6.7  NA  NA  NA  NA

Using aaply on a data.frame

aaply(iris_data, 1, my_fun, .expand = FALSE)[1:20]
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
## 6.5 6.3 6.0 6.1 6.4 7.1 6.0 6.5 5.8 6.4 6.9 6.4 6.2 5.4 7.0 7.2 6.7 6.5 
##  19  20 
## 7.4 6.6

Array input examples: adply

x <- array(1:24, c(2, 3, 4))
sum(x[1, , ])
## [1] 144
adply(x, 1, sum)
##   X1  V1
## 1  1 144
## 2  2 156

Array input examples: adply

# Increment every element of `a` by one; the shape of `a` is kept.
add_one <- function(a) {
  a + 1
}
add_one(x[1, 1, ])
## [1]  2  8 14 20
adply(x, 1:2, add_one)
##   X1 X2 V1 V2 V3 V4
## 1  1  1  2  8 14 20
## 2  2  1  3  9 15 21
## 3  1  2  4 10 16 22
## 4  2  2  5 11 17 23
## 5  1  3  6 12 18 24
## 6  2  3  7 13 19 25

Array input examples: adply

add_one(x[1, , ])
##      [,1] [,2] [,3] [,4]
## [1,]    2    8   14   20
## [2,]    4   10   16   22
## [3,]    6   12   18   24
adply(x, 1, add_one)
##   X1 1  2  3  4
## 1  1 2  8 14 20
## 2  1 4 10 16 22
## 3  1 6 12 18 24
## 4  2 3  9 15 21
## 5  2 5 11 17 23
## 6  2 7 13 19 25

adply on a data.frame with .expand

adply(iris_data, 1, my_fun, .expand = TRUE)[1:10, ]
##    Sepal.Length Petal.Length  V1
## 1           5.1          1.4 6.5
## 2           4.9          1.4 6.3
## 3           4.7          1.3 6.0
## 4           4.6          1.5 6.1
## 5           5.0          1.4 6.4
## 6           5.4          1.7 7.1
## 7           4.6          1.4 6.0
## 8           5.0          1.5 6.5
## 9           4.4          1.4 5.8
## 10          4.9          1.5 6.4
adply(iris_data, 1, my_fun, .expand = FALSE)[1:10, ]
##    X1  V1
## 1   1 6.5
## 2   2 6.3
## 3   3 6.0
## 4   4 6.1
## 5   5 6.4
## 6   6 7.1
## 7   7 6.0
## 8   8 6.5
## 9   9 5.8
## 10 10 6.4

Other a*ply functions

Data.frame input examples

daply(iris, .(Species), function(r) mean(r$Petal.Length))
##     setosa versicolor  virginica 
##      1.462      4.260      5.552
ddply(iris, .(Species), function(r) mean(r$Petal.Length))
##      Species    V1
## 1     setosa 1.462
## 2 versicolor 4.260
## 3  virginica 5.552

summarise with ddply functions

ddply(iris, .(Species, round(Sepal.Length, 0)), summarise,
      Mean.Petal.Length = mean(Petal.Length),
      SD.Petal.Length = sd(Petal.Length))
##       Species round(Sepal.Length, 0) Mean.Petal.Length SD.Petal.Length
## 1      setosa                      4          1.280000       0.1095445
## 2      setosa                      5          1.490000       0.1661016
## 3      setosa                      6          1.420000       0.1923538
## 4  versicolor                      5          3.583333       0.5382069
## 5  versicolor                      6          4.277778       0.3711843
## 6  versicolor                      7          4.687500       0.2167124
## 7   virginica                      5          4.500000              NA
## 8   virginica                      6          5.255556       0.3238391
## 9   virginica                      7          5.737500       0.3263434
## 10  virginica                      8          6.566667       0.2804758

Replication using r*ply functions

Replication using r*ply functions example

result <- raply(100, mean(runif(1000)))
sum(result)
## [1] 49.96337
hist(result)

Progress bars with plyr

# full.names = TRUE keeps the "data/" prefix on every path, so process_file()
# can open the files whatever the current working directory is.
file_list <- list.files("data", pattern = "\\.csv$", full.names = TRUE)
# Placeholder for a slow per-file computation; receives one file path.
process_file <- function(file_name) {
  # Do something rather slow on a file and return a one row data.frame
}
# .progress = "text" shows a text progress bar in the console while ldply()
# works through the files and combines the results into one data.frame.
processed_data <- ldply(file_list, process_file, .progress = "text")
  |================                                    |  21%

Parallelisation with plyr

Parallelisation with plyr (doMC)

library("doMC")
# Register a multicore foreach backend with 4 workers; plyr uses the
# registered backend when .parallel = TRUE.
registerDoMC(4)
# Toy workload: ignore the input and sleep for 2 seconds. Defined here so the
# example runs without relying on a later slide.
sleepy_time <- function(x) Sys.sleep(2)
system.time(llply(1:4, sleepy_time, .parallel = TRUE))

Parallelisation with plyr (doParallel)

library("doParallel")
# Start a cluster of 4 worker processes and register it as the foreach
# backend that plyr uses when .parallel = TRUE.
cl <- makeCluster(4)
registerDoParallel(cl)
# Toy workload: ignore the input and sleep for 2 seconds.
sleepy_time <- function(x) Sys.sleep(2)
# Sequential run: the four 2-second calls happen one after another.
system.time(llply(1:4, sleepy_time, .parallel = FALSE))
##    user  system elapsed 
##    0.00    0.00    8.02
# Parallel run: the calls are spread across the registered workers.
system.time(llply(1:4, sleepy_time, .parallel = TRUE))
##    user  system elapsed 
##    0.03    0.00    3.00
# Shut the worker processes down once finished.
stopCluster(cl)

Parallelisation with plyr (doParallel 2)

library("pROC")
my_data <- data.frame(resp = sample(1:2, 1000, TRUE), V1 = rnorm(1000), V2 = rnorm(1000))
library("doParallel")
cl <- makeCluster(4)
registerDoParallel(cl)
# The workers are separate R sessions: they have neither pROC attached nor
# my_data defined. .paropts is passed on to foreach, which loads the listed
# packages and exports the listed objects to the workers.
llply(c("V1", "V2"), function(var) auc(my_data$resp, my_data[, var]),
  .parallel = TRUE, .paropts = list(.packages = "pROC", .export = "my_data"))
## [[1]]
## Area under the curve: 0.4919
## 
## [[2]]
## Area under the curve: 0.5129
# Without .paropts the same call fails, because the workers cannot find auc().
# The cluster must still be running for this call to show that error.
llply(c("V1", "V2"), function(var) auc(my_data$resp, my_data[, var]), .parallel = TRUE)
## Error in do.ply(i) : task 1 failed - "could not find function "auc""
# Stop the workers only after the last parallel call.
stopCluster(cl)

Parallelisation with plyr (doParallel 3)

library("pROC")
my_data <- data.frame(resp = sample(1:2, 1000, TRUE), V1 = rnorm(1000), V2 = rnorm(1000))
library("doParallel")
cl <- makeCluster(4)
registerDoParallel(cl)
# Alternative to .paropts: prepare the workers once, up front.
# Copy my_data to every worker ...
clusterExport(cl, "my_data")
# ... and attach pROC on every worker; invisible() hides the per-worker
# return values of library().
invisible(clusterEvalQ(cl, library("pROC")))
# No .paropts needed now: the workers already have auc() and my_data.
llply(c("V1", "V2"), function(var) auc(my_data$resp, my_data[, var]), .parallel = TRUE)
## [[1]]
## Area under the curve: 0.538
## 
## [[2]]
## Area under the curve: 0.5007
# Shut the worker processes down once finished.
stopCluster(cl)

Limitations of plyr

# plyr and dplyr both export functions with the same names (e.g. summarise).
# With dplyr attached after plyr, a bare summarise() resolves to dplyr's.
library("plyr")
library("dplyr")
# Qualify the call with plyr:: to get plyr's version explicitly
# (x and y are placeholders here).
plyr::summarise(x, mean = mean(y))

Conclusions