This report was automatically generated with the R package knitr (version 1.12.3).
# The purpose of this script is to create a data object (dto)
# (dto) which will hold all data and metadata from each candidate study of the exercise
# Run the line below to stitch a basic html output. For elaborated report, run the corresponding .Rmd file
# knitr::stitch_rmd(script="./manipulation/0-ellis-island.R", output="./manipulation/stitched-output/0-ellis-island.md")
# These first few lines run only when the file is run in RStudio, !!NOT when an Rmd/Rnw file calls it!!
rm(list=ls(all=TRUE)) #Clear the variables from previous runs.
cat("\f") # clear console
# Call `base::source()` on any repo file that defines functions needed below. Ideally, no real operations are performed.
source("./scripts/common-functions.R")
# Attach these packages so their functions don't need to be qualified: http://r-pkgs.had.co.nz/namespace.html#search-path
library(magrittr) #Pipes
# Verify these packages are available on the machine, but their functions need to be qualified: http://r-pkgs.had.co.nz/namespace.html#search-path
requireNamespace("ggplot2")
requireNamespace("tidyr")
requireNamespace("dplyr") #Avoid attaching dplyr, b/c its function names conflict with a lot of packages (esp base, stats, and plyr).
requireNamespace("testit") #For asserting conditions meet expected patterns.
#
# There will be a total of (4) elements in (dto): (1) studyName, (2) filePath, (3) unitDat, and (4) metaData
dto <- list() # creates empty list object to populate with script to follow
#
### dto (1) : names of candidate studies
#
# inspect what files there are
(listFiles <- list.files("./data/unshared/raw", full.names = T, pattern = ".sav", recursive = F))
## [1] "./data/unshared/raw/ALSA-Wave1.Final.sav"
## [2] "./data/unshared/raw/LBSL-Panel2-Wave1.Final.sav"
## [3] "./data/unshared/raw/SATSA-Q3.Final.sav"
## [4] "./data/unshared/raw/SHARE-Israel-Wave1.Final.sav"
## [5] "./data/unshared/raw/TILDA-Wave1.Final.sav"
# list the names of the studies to be used in subsequent code
studyNames <- c("alsa", "lbsl", "satsa", "share", "tilda")
dto[["studyName"]] <- studyNames
#
### dto (2) : file paths to corresponding data files
#
# manually declare the file paths to enforce the order and prevent mismatching
alsa_path_input <- "./data/unshared/raw/ALSA-Wave1.Final.sav"
lbsl_path_input <- "./data/unshared/raw/LBSL-Panel2-Wave1.Final.sav"
satsa_path_input <- "./data/unshared/raw/SATSA-Q3.Final.sav"
share_path_input <- "./data/unshared/raw/SHARE-Israel-Wave1.Final.sav"
tilda_path_input <- "./data/unshared/raw/TILDA-Wave1.Final.sav"
# combine file paths into a single object
filePaths <- c(alsa_path_input, lbsl_path_input, satsa_path_input, share_path_input, tilda_path_input )
dto[["filePath"]] <- filePaths
# declare where the derived data object should be placed
path_output_folder <- "./data/unshared/derived/"
figure_path <- 'manipulation/stitched-output/'
#
### dto (3) : datasets with raw source data from each study
#
# at this point the object `dto` contains components:
names(dto)
## [1] "studyName" "filePath"
# next, we will add another element to this list `dto` and call it "unitData"
# it will be a list object in itself, storing datasets from studies as seperate elements
# no we will reach to the file paths in `dto[["filePath"]][[i]] and input raw data sets
# where `i` is iteratively each study in `dto[["studyName"]][[i]]
data_list <- list() # declare a list to populate
for(i in seq_along(dto[["studyName"]])){
# i <- 1
# input the 5 SPSS files in .SAV extension provided with the exercise
data_list[[i]] <- Hmisc::spss.get(dto[["filePath"]][i], use.value.labels = TRUE)
}
## Warning in read.spss(file, use.value.labels = use.value.labels,
## to.data.frame = to.data.frame, : ./data/unshared/raw/ALSA-Wave1.Final.sav:
## Unrecognized record type 7, subtype 18 encountered in system file
## Warning in read.spss(file, use.value.labels = use.value.labels,
## to.data.frame = to.data.frame, : ./data/unshared/raw/LBSL-Panel2-
## Wave1.Final.sav: Unrecognized record type 7, subtype 18 encountered in
## system file
## Warning in read.spss(file, use.value.labels = use.value.labels,
## to.data.frame = to.data.frame, : ./data/unshared/raw/SATSA-Q3.Final.sav:
## Unrecognized record type 7, subtype 18 encountered in system file
## Warning in read.spss(file, use.value.labels = use.value.labels,
## to.data.frame = to.data.frame, : ./data/unshared/raw/SHARE-Israel-
## Wave1.Final.sav: Unrecognized record type 7, subtype 18 encountered in
## system file
## Warning in read.spss(file, use.value.labels = use.value.labels,
## to.data.frame = to.data.frame, : ./data/unshared/raw/TILDA-Wave1.Final.sav:
## Unrecognized record type 7, subtype 18 encountered in system file
names(data_list) <- studyNames # name the elements of the data list
dto[["unitData"]] <- data_list # include data list into the main list as another element
names(dto) # elements in the main list object
## [1] "studyName" "filePath" "unitData"
names(dto[["unitData"]]) # elements in the subelement
## [1] "alsa" "lbsl" "satsa" "share" "tilda"
# at this point the object `dto` contains components:
names(dto)
## [1] "studyName" "filePath" "unitData"
# dto contains: "studyName" , "filePath", "unitData"
# we have just added the (3rd) element, a list of datasets:
names(dto[["unitData"]])
## [1] "alsa" "lbsl" "satsa" "share" "tilda"
#
### dto (4) : collect metadata
#
# inspect the variable names and their labels in the raw data files
# names_labels(dto[["unitData"]][["alsa"]])
# names_labels(dto[["unitData"]][["lbsl"]])
# names_labels(dto[["unitData"]][["satsa"]])
# names_labels(dto[["unitData"]][["share"]])
# names_labels(dto[["unitData"]][["tilda"]])
# rename "MAR4" because it can be confused by machines for March-4
dto[["unitData"]][["tilda"]] <- plyr::rename(dto[["unitData"]][["tilda"]], replace = c("MAR4"= "MAR_4"))
# rename personal identifiers for consistency
dto[["unitData"]][["alsa"]] <- plyr::rename(dto[["unitData"]][["alsa"]], replace = c("SEQNUM"= "id"))
dto[["unitData"]][["lbsl"]] <- plyr::rename(dto[["unitData"]][["lbsl"]], replace = c("ID"= "id"))
dto[["unitData"]][["satsa"]] <- plyr::rename(dto[["unitData"]][["satsa"]], replace = c("ID"= "id"))
dto[["unitData"]][["share"]] <- plyr::rename(dto[["unitData"]][["share"]], replace = c("SAMPID.rec"= "id"))
dto[["unitData"]][["tilda"]] <- plyr::rename(dto[["unitData"]][["tilda"]], replace = c("ID"= "id"))
# to prepare for the final step in which we add metadata to the dto
# we begin by extracting the names and (hopefuly their) labels of variables from each dataset
# and combine them in a single rectanguar object, long/stacked with respect to study names
for(i in studyNames){
save_csv <- names_labels(dto[["unitData"]][[i]])
write.csv(save_csv, paste0("./data/meta/names-labels-live/nl-",i,".csv"),
row.names = T)
}
# these 5 individual .cvs contain the original variable names and labels
# now we combine these files to create the starter for our metadata object
dum <- list()
for(i in studyNames){
dum[[i]] <- read.csv(paste0("./data/meta/names-labels-live/nl-",i,".csv"),
header = T, stringsAsFactors = F )
}
mdsraw <- plyr::ldply(dum, data.frame,.id = "study_name") # convert list of ds into a single ds
mdsraw["X"] <- NULL # remove native counter variable, not needed
write.csv(mdsraw, "./data/meta/names-labels-live/names-labels-live.csv", row.names = T)
# after the final version of the data files used in the excerside have been obtained
# we made a dead copy of `./data/shared/derived/meta-raw-live.csv` and named it `./data/shared/meta-data-map.csv`
# decisions on variables' renaming and classification is encoded in this map
# reproduce ellis-island script every time you make changes to `meta-data-map.csv`
dsm <- read.csv("./data/meta/meta-data-map.csv")
dsm["X"] <- NULL # remove native counter variable, not needed
dsm["X.1"] <- NULL # remove native counter variable, not needed
# dsm$url <- if(is.na(dsm$url){paste0("[link](", dsm$url,")")
dsm$url <- as.character(dsm$url)
for(i in seq_along(dsm$url)){ # i <- 20
if(!dsm[i,"url"]==""){
dsm[i,"url"] <- paste0("[link](",dsm[i,"url"],")")
}
}
# attach metadata object as the 4th element of the dto
dto[["metaData"]] <- dsm
# testit::assert("`model_name` should be a unique value", sum(duplicated(ds$model_name))==0L)
# testit::assert("`miles_per_gallon` should be a positive value.", all(ds$miles_per_gallon>0))
# testit::assert("`weight_gear_z` should be a positive or missing value.", all(is.na(ds$miles_per_gallon) | (ds$miles_per_gallon>0)))
# Save as a compress, binary R dataset. It's no longer readable with a text editor, but it saves metadata (eg, factor information).
saveRDS(dto, file="./data/unshared/derived/dto.rds", compress="xz")
# the production of the dto object is now complete
# we verify its structure and content:
dto <- readRDS("./data/unshared/derived/dto.rds")
# each element this list is another list:
names(dto)
## [1] "studyName" "filePath" "unitData" "metaData"
# 1st element - names of the studies as character vector
dto[["studyName"]]
## [1] "alsa" "lbsl" "satsa" "share" "tilda"
# 2nd element - file paths of the data files for each study
dto[["filePath"]]
## [1] "./data/unshared/raw/ALSA-Wave1.Final.sav"
## [2] "./data/unshared/raw/LBSL-Panel2-Wave1.Final.sav"
## [3] "./data/unshared/raw/SATSA-Q3.Final.sav"
## [4] "./data/unshared/raw/SHARE-Israel-Wave1.Final.sav"
## [5] "./data/unshared/raw/TILDA-Wave1.Final.sav"
# 3rd element - list objects with
names(dto[["unitData"]])
## [1] "alsa" "lbsl" "satsa" "share" "tilda"
dplyr::tbl_df(dto[["unitData"]][["alsa"]])
## Source: local data frame [2,087 x 26]
##
## id EXRTHOUS HWMNWK2W LSVEXC2W LSVIGEXC TMHVYEXR TMVEXC2W VIGEXC2W
## (int) (fctr) (int) (int) (fctr) (int) (int) (int)
## 1 41 No 14 NA No NA NA NA
## 2 42 No 14 4 Yes NA NA NA
## 3 61 No NA NA No NA NA NA
## 4 71 No 14 NA No NA NA NA
## 5 91 No 28 NA No NA NA NA
## 6 121 No NA NA No NA NA NA
## 7 181 No NA NA No NA NA NA
## 8 201 No NA NA No NA NA NA
## 9 221 No NA NA No NA NA NA
## 10 261 No NA NA No NA NA NA
## .. ... ... ... ... ... ... ... ...
## Variables not shown: VIGEXCS (fctr), WALK2WKS (fctr), BTSM12MN (fctr),
## HLTHBTSM (fctr), HLTHLIFE (fctr), AGE (int), SEX (fctr), MARITST (fctr),
## SCHOOL (fctr), TYPQUAL (fctr), RETIRED (fctr), SMOKER (fctr), FR6ORMOR
## (fctr), NOSTDRNK (fctr), FREQALCH (fctr), WEIGHT (dbl), PIPCIGAR (fctr),
## CURRWORK (fctr)
# 4th element - dataset with augmented names and labels for variables from all involved studies
dplyr::tbl_df(dto[["metaData"]])
## Source: local data frame [150 x 10]
##
## study_name name label_short item construct
## (fctr) (fctr) (fctr) (fctr) (fctr)
## 1 alsa SEQNUM Sequence Number id id
## 2 alsa EXRTHOUS Exertion around house physact
## 3 alsa HWMNWK2W Times walked in past two weeks physact
## 4 alsa LSVEXC2W Less vigor sessions last 2 weeks physact
## 5 alsa LSVIGEXC Less vigor past 2 weeks physact
## 6 alsa TMHVYEXR Time heavy physical exertion physact
## 7 alsa TMVEXC2W Vigor Time past 2 weeks physact
## 8 alsa VIGEXC2W Vigor Sessions in past 2 weeks physact
## 9 alsa VIGEXCS Vigorous exercise physact
## 10 alsa WALK2WKS Walking past 2 weeks physact
## .. ... ... ... ... ...
## Variables not shown: type (fctr), categories (int), label (fctr), url
## (chr), notes (fctr)
The R session information (including the OS info, R version and all packages used):
sessionInfo()
## R version 3.2.5 (2016-04-14)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows >= 8 x64 (build 9200)
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggplot2_2.1.0 knitr_1.12.3 magrittr_1.5
##
## loaded via a namespace (and not attached):
## [1] splines_3.2.5 lattice_0.20-33 colorspace_1.2-6
## [4] htmltools_0.3.5 mgcv_1.8-12 yaml_2.1.13
## [7] chron_2.3-47 survival_2.38-3 nloptr_1.0.4
## [10] foreign_0.8-66 DBI_0.4-1 RColorBrewer_1.1-2
## [13] plyr_1.8.3 stringr_1.0.0 MatrixModels_0.4-1
## [16] munsell_0.4.3 gtable_0.2.0 htmlwidgets_0.6
## [19] evaluate_0.9 labeling_0.3 latticeExtra_0.6-28
## [22] SparseM_1.7 extrafont_0.17 quantreg_5.21
## [25] pbkrtest_0.4-6 parallel_3.2.5 markdown_0.7.7
## [28] highr_0.5.1 Rttf2pt1_1.3.3 Rcpp_0.12.5
## [31] readr_0.2.2 acepack_1.3-3.3 scales_0.4.0
## [34] DT_0.1.40 formatR_1.3 Hmisc_3.17-4
## [37] jsonlite_0.9.20 lme4_1.1-12 gridExtra_2.2.1
## [40] testit_0.5 digest_0.6.9 stringi_1.0-1
## [43] dplyr_0.4.3 grid_3.2.5 tools_3.2.5
## [46] lazyeval_0.1.10 dichromat_2.0-0 Formula_1.2-1
## [49] cluster_2.0.3 tidyr_0.4.1 extrafontdb_1.0
## [52] car_2.1-2 MASS_7.3-45 Matrix_1.2-4
## [55] rsconnect_0.4.2.1 data.table_1.9.6 assertthat_0.1
## [58] minqa_1.2.4 rmarkdown_0.9.6 R6_2.1.2
## [61] rpart_4.1-10 nnet_7.3-12 nlme_3.1-126
Sys.time()
## [1] "2016-06-06 17:04:35 PDT"