# The purpose of this script is to create a data object (dto)
# which will hold all data and metadata from each candidate study of the exercise
# Run the line below to stitch a basic html output. For elaborated report, run the corresponding .Rmd file
# knitr::stitch_rmd(script="./manipulation/0-ellis-island.R", output="./manipulation/stitched-output/")
# These first few lines run only when the file is run in RStudio, !!NOT when an Rmd/Rnw file calls it!!
# Start from a clean slate: drop every object (hidden ones included) left over from previous runs.
rm(list = ls(all.names = TRUE))
# Print a form-feed character to clear the console.
cat("\f")
# Call `base::source()` on any repo file that defines functions needed below. Ideally, no real operations are performed.
# Attach these packages so their functions don't need to be qualified:
library(magrittr) #Pipes
# Verify these packages are available on the machine, but their functions need to be qualified:
requireNamespace("dplyr") #Avoid attaching dplyr, b/c its function names conflict with a lot of packages (esp base, stats, and plyr).
requireNamespace("testit") #For asserting conditions meet expected patterns.
# There will be a total of (4) elements in (dto):
# (1) studyName, (2) filePath, (3) unitData, and (4) metaData
dto <- list() # creates empty list object to populate with script to follow

### dto (1) : names of candidate studies
# inspect what files there are (the outer parentheses print the result).
# `pattern` is a regular expression: the dot is escaped and the match is
# anchored at the end so that only names ending in ".sav" are listed
# (a bare ".sav" matches any character followed by "sav" anywhere in the name).
(listFiles <- list.files(
  "./data/unshared/raw",
  full.names = TRUE,
  pattern = "\\.sav$",
  recursive = FALSE
))
## [1] "./data/unshared/raw/ALSA-Wave1.Final.sav"
## [2] "./data/unshared/raw/LBSL-Panel2-Wave1.Final.sav"
## [3] "./data/unshared/raw/SATSA-Q3.Final.sav"
## [4] "./data/unshared/raw/SHARE-Israel-Wave1.Final.sav"
## [5] "./data/unshared/raw/TILDA-Wave1.Final.sav"

# list the names of the studies to be used in subsequent code
studyNames <- c("alsa", "lbsl", "satsa", "share", "tilda")
dto[["studyName"]] <- studyNames
### dto (2) : file paths to corresponding data files
# destinations for derived artifacts: the derived data object and stitched figures
path_output_folder <- "./data/unshared/derived/"
figure_path <- "manipulation/stitched-output/"

# Each input path is written out by hand (rather than taken from a directory
# listing) so that the order is fixed and cannot drift out of step with the
# study names.
alsa_path_input  <- "./data/unshared/raw/ALSA-Wave1.Final.sav"
lbsl_path_input  <- "./data/unshared/raw/LBSL-Panel2-Wave1.Final.sav"
satsa_path_input <- "./data/unshared/raw/SATSA-Q3.Final.sav"
share_path_input <- "./data/unshared/raw/SHARE-Israel-Wave1.Final.sav"
tilda_path_input <- "./data/unshared/raw/TILDA-Wave1.Final.sav"

# gather the individual paths into one character vector, in study order
filePaths <- c(
  alsa_path_input,
  lbsl_path_input,
  satsa_path_input,
  share_path_input,
  tilda_path_input
)
dto[["filePath"]] <- filePaths
### dto (3) : datasets with raw source data from each study
# at this point the object `dto` contains components:
## [1] "studyName" "filePath"
# next, we will add another element to this list `dto` and call it "unitData"
# it will be a list object in itself, storing datasets from studies as separate elements
# now we will reach into the file paths in `dto[["filePath"]][[i]]` and input the raw data sets
# where `i` is iteratively each study in `dto[["studyName"]][[i]]`
data_list <- list() # declare a list to populate
for (i in seq_along(dto[["studyName"]])) {
  # i <- 1
  # input the 5 SPSS files in .SAV extension provided with the exercise
  data_list[[i]] <- Hmisc::spss.get(dto[["filePath"]][[i]], use.value.labels = TRUE)
} # closes the import loop; everything below runs once, after all files are read
## Warning in read.spss(file, use.value.labels = use.value.labels,
## =, : ./data/unshared/raw/ALSA-Wave1.Final.sav:
## Unrecognized record type 7, subtype 18 encountered in system file
## Warning in read.spss(file, use.value.labels = use.value.labels,
## =, : ./data/unshared/raw/LBSL-Panel2-
## Wave1.Final.sav: Unrecognized record type 7, subtype 18 encountered in
## system file
## Warning in read.spss(file, use.value.labels = use.value.labels,
## =, : ./data/unshared/raw/SATSA-Q3.Final.sav:
## Unrecognized record type 7, subtype 18 encountered in system file
## Warning in read.spss(file, use.value.labels = use.value.labels,
## =, : ./data/unshared/raw/SHARE-Israel-
## Wave1.Final.sav: Unrecognized record type 7, subtype 18 encountered in
## system file
## Warning in read.spss(file, use.value.labels = use.value.labels,
## =, : ./data/unshared/raw/TILDA-Wave1.Final.sav:
## Unrecognized record type 7, subtype 18 encountered in system file
names(data_list) <- studyNames # name the elements of the data list
dto[["unitData"]] <- data_list # include data list into the main list as another element
names(dto) # elements in the main list object
## [1] "studyName" "filePath" "unitData"
names(dto[["unitData"]]) # elements in the subelement
## [1] "alsa" "lbsl" "satsa" "share" "tilda"
# at this point the object `dto` contains components:
## [1] "studyName" "filePath" "unitData"
# dto contains: "studyName" , "filePath", "unitData"
# we have just added the (3rd) element, a list of datasets:
## [1] "alsa" "lbsl" "satsa" "share" "tilda"
### dto (4) : collect metadata
# inspect the variable names and their labels in the raw data files
# names_labels(dto[["unitData"]][["alsa"]])
# names_labels(dto[["unitData"]][["lbsl"]])
# names_labels(dto[["unitData"]][["satsa"]])
# names_labels(dto[["unitData"]][["share"]])
# names_labels(dto[["unitData"]][["tilda"]])

# "MAR4" can be mistaken by machines for the date March-4, so give it an unambiguous name
dto[["unitData"]][["tilda"]] <- plyr::rename(
  dto[["unitData"]][["tilda"]],
  replace = c("MAR4" = "MAR_4")
)

# Harmonize the personal identifier: each study's own id column becomes "id".
# The vector maps study name -> original identifier column in that study.
id_columns <- c(
  alsa  = "SEQNUM",
  lbsl  = "ID",
  satsa = "ID",
  share = "SAMPID.rec",
  tilda = "ID"
)
for (study in names(id_columns)) {
  dto[["unitData"]][[study]] <- plyr::rename(
    dto[["unitData"]][[study]],
    # builds the named vector c(<original column> = "id") expected by `replace`
    replace = stats::setNames("id", id_columns[[study]])
  )
}
# to prepare for the final step in which we add metadata to the dto
# we begin by extracting the names and (hopefully their) labels of variables from each dataset
# and combine them in a single rectangular object, long/stacked with respect to study names
# NOTE(review): `names_labels()` is not defined in this part of the file; it is
# presumably sourced from a repo file and returns a names/labels table - confirm.
for (i in studyNames) {
  save_csv <- names_labels(dto[["unitData"]][[i]])
  write.csv(
    save_csv,
    paste0("./data/meta/names-labels-live/nl-", i, ".csv"),
    row.names = TRUE
  )
}
# these 5 individual .csv files contain the original variable names and labels
# now we combine these files to create the starter for our metadata object
dum <- list()
for (i in studyNames) {
  dum[[i]] <- read.csv(
    paste0("./data/meta/names-labels-live/nl-", i, ".csv"),
    header = TRUE,
    stringsAsFactors = FALSE
  )
}
# stack the per-study tables; the list names (study names) go into `study_name`
mdsraw <- plyr::ldply(dum, data.frame, .id = "study_name") # convert list of ds into a single ds
mdsraw["X"] <- NULL # remove native counter variable, not needed
write.csv(mdsraw, "./data/meta/names-labels-live/names-labels-live.csv", row.names = TRUE)
# after the final version of the data files used in the exercise has been obtained
# we made a dead copy of `./data/shared/derived/meta-raw-live.csv` and named it `./data/shared/meta-data-map.csv`
# decisions on variables' renaming and classification are encoded in this map
# reproduce ellis-island script every time you make changes to `meta-data-map.csv`
# NOTE(review): the paths quoted in the comment above differ from the path
# actually read below (`./data/meta/meta-data-map.csv`) - confirm which is current.
dsm <- read.csv("./data/meta/meta-data-map.csv")
dsm["X"] <- NULL # remove native counter variable, not needed
dsm["X.1"] <- NULL # remove native counter variable, not needed
# dsm$url <- if($url){paste0("[link](", dsm$url,")")
dsm$url <- as.character(dsm$url)
# Wrap every url in markdown link syntax in one vectorized call.
# sprintf() returns a zero-length result for zero-length input, so an empty
# map is handled without special casing.
dsm$url <- sprintf("[link](%s)", dsm$url)
# attach metadata object as the 4th element of the dto
dto[["metaData"]] <- dsm
# testit::assert("`model_name` should be a unique value", sum(duplicated(ds$model_name))==0L)
# testit::assert("`miles_per_gallon` should be a positive value.", all(ds$miles_per_gallon>0))
# testit::assert("`weight_gear_z` should be a positive or missing value.", all($miles_per_gallon) | (ds$miles_per_gallon>0)))
# Save as a compressed, binary R dataset. It's no longer readable with a text editor, but it saves metadata (eg, factor information).
# The destination is built from `path_output_folder` (declared earlier in this
# script) so the output location is defined in exactly one place.
path_dto <- paste0(path_output_folder, "dto.rds")
saveRDS(dto, file = path_dto, compress = "xz")
# the production of the dto object is now complete
# we verify its structure and content by reading the saved file back in:
dto <- readRDS(path_dto)
# each element of this list is another list:
## [1] "studyName" "filePath" "unitData" "metaData"
# 1st element - names of the studies as character vector
## [1] "alsa" "lbsl" "satsa" "share" "tilda"
# 2nd element - file paths of the data files for each study
## [1] "./data/unshared/raw/ALSA-Wave1.Final.sav"
## [2] "./data/unshared/raw/LBSL-Panel2-Wave1.Final.sav"
## [3] "./data/unshared/raw/SATSA-Q3.Final.sav"
## [4] "./data/unshared/raw/SHARE-Israel-Wave1.Final.sav"
## [5] "./data/unshared/raw/TILDA-Wave1.Final.sav"
# 3rd element - list of datasets, one per study
## [1] "alsa" "lbsl" "satsa" "share" "tilda"
## Source: local data frame [2,087 x 26]
## (int) (fctr) (int) (int) (fctr) (int) (int) (int)
## 1 41 No 14 NA No NA NA NA
## 2 42 No 14 4 Yes NA NA NA
## 3 61 No NA NA No NA NA NA
## 4 71 No 14 NA No NA NA NA
## 5 91 No 28 NA No NA NA NA
## 6 121 No NA NA No NA NA NA
## 7 181 No NA NA No NA NA NA
## 8 201 No NA NA No NA NA NA
## 9 221 No NA NA No NA NA NA
## 10 261 No NA NA No NA NA NA
## .. ... ... ... ... ... ... ... ...
## Variables not shown: VIGEXCS (fctr), WALK2WKS (fctr), BTSM12MN (fctr),
## HLTHBTSM (fctr), HLTHLIFE (fctr), AGE (int), SEX (fctr), MARITST (fctr),
## SCHOOL (fctr), TYPQUAL (fctr), RETIRED (fctr), SMOKER (fctr), FR6ORMOR
## (fctr), NOSTDRNK (fctr), FREQALCH (fctr), WEIGHT (dbl), PIPCIGAR (fctr),
## CURRWORK (fctr)
# 4th element - dataset with augmented names and labels for variables from all involved studies
## Source: local data frame [150 x 10]
## study_name name label_short item construct
## (fctr) (fctr) (fctr) (fctr) (fctr)
## 1 alsa SEQNUM Sequence Number id id
## 2 alsa EXRTHOUS Exertion around house physact
## 3 alsa HWMNWK2W Times walked in past two weeks physact
## 4 alsa LSVEXC2W Less vigor sessions last 2 weeks physact
## 5 alsa LSVIGEXC Less vigor past 2 weeks physact
## 6 alsa TMHVYEXR Time heavy physical exertion physact
## 7 alsa TMVEXC2W Vigor Time past 2 weeks physact
## 8 alsa VIGEXC2W Vigor Sessions in past 2 weeks physact
## 9 alsa VIGEXCS Vigorous exercise physact
## 10 alsa WALK2WKS Walking past 2 weeks physact
## .. ... ... ... ... ...
## Variables not shown: type (fctr), categories (int), label (fctr), url
## (chr), notes (fctr)
