Solutions to exercise: tidying and transforming data

Part I – tidying data

Read in the simulated clinical dataset, clinical-data.txt.

library(tidyverse)

## ── Attaching packages ──────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──

## ✔ ggplot2 3.1.0       ✔ purrr   0.3.1  
## ✔ tibble  2.0.1       ✔ dplyr   0.8.0.1
## ✔ tidyr   0.8.3       ✔ stringr 1.4.0  
## ✔ readr   1.3.1       ✔ forcats 0.4.0

## ── Conflicts ─────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

clinical_data <- read_tsv("clinical-data.txt")

## Parsed with column specification:
## cols(
##   Subject = col_character(),
##   Placebo.1 = col_double(),
##   Placebo.2 = col_double(),
##   Drug1.1 = col_double(),
##   Drug1.2 = col_double(),
##   Drug2.1 = col_double(),
##   Drug2.2 = col_double()
## )

Take a look at the data and answer the following questions:

What variables and observations do we have?
What might a ‘tidy’ version of this dataset look like?

Transform the data into a tidy form using the gather function from the tidyr package.

clinical_data <- gather(clinical_data, key = "Treatment", value = "Value", -Subject)

Display the range of values for each drug and placebo treatment as a box plot

ggplot(clinical_data, mapping = aes(x = Treatment, y = Value)) +
  geom_boxplot()

Part II – selecting columns

Read in the patients dataset

patients <- read_tsv("patient-data.txt")

## Parsed with column specification:
## cols(
##   ID = col_character(),
##   Name = col_character(),
##   Sex = col_character(),
##   Smokes = col_character(),
##   Height = col_character(),
##   Weight = col_character(),
##   Birth = col_date(format = ""),
##   State = col_character(),
##   Grade_Level = col_double(),
##   Died = col_logical(),
##   Count = col_double(),
##   Date.Entered.Study = col_date(format = "")
## )

Select all the columns between Height and Grade_Level

select(patients, Height:Grade_Level)

## # A tibble: 100 x 5
##    Height   Weight  Birth      State      Grade_Level
##    <chr>    <chr>   <date>     <chr>            <dbl>
##  1 182.87cm 76.57kg 1972-02-06 Georgia              2
##  2 179.12cm 80.43kg 1972-06-15 Colorado             2
##  3 169.15cm 75.48kg 1972-07-09 New Jersey           2
##  4 175.66cm 94.54kg 1972-08-17 Colorado             1
##  5 164.47cm 71.78kg 1973-06-12 Georgia              2
##  6 158.27cm 69.9kg  1973-07-01 Indiana              2
##  7 161.69cm 68.85kg 1972-03-26 New York             1
##  8 165.84cm 70.44kg 1973-05-11 Colorado             1
##  9 181.32cm 76.9kg  1971-12-31 Georgia              1
## 10 167.37cm 79.06kg 1973-07-19 New Jersey           2
## # … with 90 more rows

Select all the columns between Height and Grade_Level, but not State

select(patients, Height:Grade_Level, -State)

## # A tibble: 100 x 4
##    Height   Weight  Birth      Grade_Level
##    <chr>    <chr>   <date>           <dbl>
##  1 182.87cm 76.57kg 1972-02-06           2
##  2 179.12cm 80.43kg 1972-06-15           2
##  3 169.15cm 75.48kg 1972-07-09           2
##  4 175.66cm 94.54kg 1972-08-17           1
##  5 164.47cm 71.78kg 1973-06-12           2
##  6 158.27cm 69.9kg  1973-07-01           2
##  7 161.69cm 68.85kg 1972-03-26           1
##  8 165.84cm 70.44kg 1973-05-11           1
##  9 181.32cm 76.9kg  1971-12-31           1
## 10 167.37cm 79.06kg 1973-07-19           2
## # … with 90 more rows

Select all columns ending in ‘eight’, i.e. Height and Weight

select(patients, ends_with("eight"))

## # A tibble: 100 x 2
##    Height   Weight 
##    <chr>    <chr>  
##  1 182.87cm 76.57kg
##  2 179.12cm 80.43kg
##  3 169.15cm 75.48kg
##  4 175.66cm 94.54kg
##  5 164.47cm 71.78kg
##  6 158.27cm 69.9kg 
##  7 161.69cm 68.85kg
##  8 165.84cm 70.44kg
##  9 181.32cm 76.9kg 
## 10 167.37cm 79.06kg
## # … with 90 more rows

Select all columns except Birth, State and Grade_Level

select(patients, -Birth, -State, -Grade_Level)

## # A tibble: 100 x 9
##    ID      Name    Sex   Smokes Height Weight Died   Count Date.Entered.St…
##    <chr>   <chr>   <chr> <chr>  <chr>  <chr>  <lgl>  <dbl> <date>          
##  1 AC/AH/… Michael Male  FALSE  182.8… 76.57… FALSE  0.01  2015-12-01      
##  2 AC/AH/… Derek   Male  FALSE  179.1… 80.43… FALSE -1.31  NA              
##  3 AC/AH/… Todd    Male  FALSE  169.1… 75.48… FALSE -0.17  NA              
##  4 AC/AH/… Ronald  Male  FALSE  175.6… 94.54… FALSE -1.1   NA              
##  5 AC/AH/… Christ… Fema… FALSE  164.4… 71.78… TRUE   1.42  NA              
##  6 AC/AH/… Dana    Fema… TRUE   158.2… 69.9kg FALSE  0.290 NA              
##  7 AC/AH/… Erin    Fema… FALSE  161.6… 68.85… FALSE  0.16  NA              
##  8 AC/AH/… Rachel  Fema… No     165.8… 70.44… FALSE -0.07  NA              
##  9 AC/AH/… Ronald  Male  FALSE  181.3… 76.9kg FALSE -1.43  NA              
## 10 AC/AH/… Bryan   Male  FALSE  167.3… 79.06… FALSE  0.54  2015-12-31      
## # … with 90 more rows

select(patients, -c(Birth, State, Grade_Level))

## # A tibble: 100 x 9
##    ID      Name    Sex   Smokes Height Weight Died   Count Date.Entered.St…
##    <chr>   <chr>   <chr> <chr>  <chr>  <chr>  <lgl>  <dbl> <date>          
##  1 AC/AH/… Michael Male  FALSE  182.8… 76.57… FALSE  0.01  2015-12-01      
##  2 AC/AH/… Derek   Male  FALSE  179.1… 80.43… FALSE -1.31  NA              
##  3 AC/AH/… Todd    Male  FALSE  169.1… 75.48… FALSE -0.17  NA              
##  4 AC/AH/… Ronald  Male  FALSE  175.6… 94.54… FALSE -1.1   NA              
##  5 AC/AH/… Christ… Fema… FALSE  164.4… 71.78… TRUE   1.42  NA              
##  6 AC/AH/… Dana    Fema… TRUE   158.2… 69.9kg FALSE  0.290 NA              
##  7 AC/AH/… Erin    Fema… FALSE  161.6… 68.85… FALSE  0.16  NA              
##  8 AC/AH/… Rachel  Fema… No     165.8… 70.44… FALSE -0.07  NA              
##  9 AC/AH/… Ronald  Male  FALSE  181.3… 76.9kg FALSE -1.43  NA              
## 10 AC/AH/… Bryan   Male  FALSE  167.3… 79.06… FALSE  0.54  2015-12-31      
## # … with 90 more rows

select(patients, -(Birth:Grade_Level))

## # A tibble: 100 x 9
##    ID      Name    Sex   Smokes Height Weight Died   Count Date.Entered.St…
##    <chr>   <chr>   <chr> <chr>  <chr>  <chr>  <lgl>  <dbl> <date>          
##  1 AC/AH/… Michael Male  FALSE  182.8… 76.57… FALSE  0.01  2015-12-01      
##  2 AC/AH/… Derek   Male  FALSE  179.1… 80.43… FALSE -1.31  NA              
##  3 AC/AH/… Todd    Male  FALSE  169.1… 75.48… FALSE -0.17  NA              
##  4 AC/AH/… Ronald  Male  FALSE  175.6… 94.54… FALSE -1.1   NA              
##  5 AC/AH/… Christ… Fema… FALSE  164.4… 71.78… TRUE   1.42  NA              
##  6 AC/AH/… Dana    Fema… TRUE   158.2… 69.9kg FALSE  0.290 NA              
##  7 AC/AH/… Erin    Fema… FALSE  161.6… 68.85… FALSE  0.16  NA              
##  8 AC/AH/… Rachel  Fema… No     165.8… 70.44… FALSE -0.07  NA              
##  9 AC/AH/… Ronald  Male  FALSE  181.3… 76.9kg FALSE -1.43  NA              
## 10 AC/AH/… Bryan   Male  FALSE  167.3… 79.06… FALSE  0.54  2015-12-31      
## # … with 90 more rows

Part III - data transformations

The study that collected the patient data is interested in the relationship between body mass index (BMI) and smoking.

Which the relevant variables (columns) in our patients dataset and what problems can you see with these columns as they currently stand?

Clean the Height and Weight columns to remove the ‘cm’ and ‘kg’ units respectively.

patients <- mutate(patients, Height = as.numeric(str_remove(Height, pattern = "cm$")))
patients <- mutate(patients, Weight = as.numeric(str_remove(Weight, pattern = "kg$")))
patients

## # A tibble: 100 x 12
##    ID    Name  Sex   Smokes Height Weight Birth      State Grade_Level
##    <chr> <chr> <chr> <chr>   <dbl>  <dbl> <date>     <chr>       <dbl>
##  1 AC/A… Mich… Male  FALSE    183.   76.6 1972-02-06 Geor…           2
##  2 AC/A… Derek Male  FALSE    179.   80.4 1972-06-15 Colo…           2
##  3 AC/A… Todd  Male  FALSE    169.   75.5 1972-07-09 New …           2
##  4 AC/A… Rona… Male  FALSE    176.   94.5 1972-08-17 Colo…           1
##  5 AC/A… Chri… Fema… FALSE    164.   71.8 1973-06-12 Geor…           2
##  6 AC/A… Dana  Fema… TRUE     158.   69.9 1973-07-01 Indi…           2
##  7 AC/A… Erin  Fema… FALSE    162.   68.8 1972-03-26 New …           1
##  8 AC/A… Rach… Fema… No       166.   70.4 1973-05-11 Colo…           1
##  9 AC/A… Rona… Male  FALSE    181.   76.9 1971-12-31 Geor…           1
## 10 AC/A… Bryan Male  FALSE    167.   79.1 1973-07-19 New …           2
## # … with 90 more rows, and 3 more variables: Died <lgl>, Count <dbl>,
## #   Date.Entered.Study <date>

Clean the Smokes column so that it contains just TRUE and FALSE values.

Hint: use the %in% operator, e.g. c("Matt", "George") %in% c("John", "Paul", "George", "Ringo")

unique(patients$Smokes)

## [1] "FALSE" "TRUE"  "No"    "Yes"

patients <- mutate(patients, Smokes = Smokes %in% c("TRUE", "Yes"))
patients

## # A tibble: 100 x 12
##    ID    Name  Sex   Smokes Height Weight Birth      State Grade_Level
##    <chr> <chr> <chr> <lgl>   <dbl>  <dbl> <date>     <chr>       <dbl>
##  1 AC/A… Mich… Male  FALSE    183.   76.6 1972-02-06 Geor…           2
##  2 AC/A… Derek Male  FALSE    179.   80.4 1972-06-15 Colo…           2
##  3 AC/A… Todd  Male  FALSE    169.   75.5 1972-07-09 New …           2
##  4 AC/A… Rona… Male  FALSE    176.   94.5 1972-08-17 Colo…           1
##  5 AC/A… Chri… Fema… FALSE    164.   71.8 1973-06-12 Geor…           2
##  6 AC/A… Dana  Fema… TRUE     158.   69.9 1973-07-01 Indi…           2
##  7 AC/A… Erin  Fema… FALSE    162.   68.8 1972-03-26 New …           1
##  8 AC/A… Rach… Fema… FALSE    166.   70.4 1973-05-11 Colo…           1
##  9 AC/A… Rona… Male  FALSE    181.   76.9 1971-12-31 Geor…           1
## 10 AC/A… Bryan Male  FALSE    167.   79.1 1973-07-19 New …           2
## # … with 90 more rows, and 3 more variables: Died <lgl>, Count <dbl>,
## #   Date.Entered.Study <date>

Calculate the body mass index (BMI) for each of our patients using the formula \(BMI = Weight / (Height)^2\), where the weight is measured in kilograms and the height is in metres.

patients <- mutate(patients, BMI = Weight / (Height / 100) ** 2)
select(patients, ID, Smokes, Height, Weight, BMI)

## # A tibble: 100 x 5
##    ID        Smokes Height Weight   BMI
##    <chr>     <lgl>   <dbl>  <dbl> <dbl>
##  1 AC/AH/001 FALSE    183.   76.6  22.9
##  2 AC/AH/017 FALSE    179.   80.4  25.1
##  3 AC/AH/020 FALSE    169.   75.5  26.4
##  4 AC/AH/022 FALSE    176.   94.5  30.6
##  5 AC/AH/029 FALSE    164.   71.8  26.5
##  6 AC/AH/033 TRUE     158.   69.9  27.9
##  7 AC/AH/037 FALSE    162.   68.8  26.3
##  8 AC/AH/044 FALSE    166.   70.4  25.6
##  9 AC/AH/045 FALSE    181.   76.9  23.4
## 10 AC/AH/048 FALSE    167.   79.1  28.2
## # … with 90 more rows

Create a new variable to indicate which individuals are overweight, i.e. those with a BMI of above 25.

patients <- mutate(patients, Overweight = BMI > 25)
select(patients, ID, Smokes, Height, Weight, BMI, Overweight)

## # A tibble: 100 x 6
##    ID        Smokes Height Weight   BMI Overweight
##    <chr>     <lgl>   <dbl>  <dbl> <dbl> <lgl>     
##  1 AC/AH/001 FALSE    183.   76.6  22.9 FALSE     
##  2 AC/AH/017 FALSE    179.   80.4  25.1 TRUE      
##  3 AC/AH/020 FALSE    169.   75.5  26.4 TRUE      
##  4 AC/AH/022 FALSE    176.   94.5  30.6 TRUE      
##  5 AC/AH/029 FALSE    164.   71.8  26.5 TRUE      
##  6 AC/AH/033 TRUE     158.   69.9  27.9 TRUE      
##  7 AC/AH/037 FALSE    162.   68.8  26.3 TRUE      
##  8 AC/AH/044 FALSE    166.   70.4  25.6 TRUE      
##  9 AC/AH/045 FALSE    181.   76.9  23.4 FALSE     
## 10 AC/AH/048 FALSE    167.   79.1  28.2 TRUE      
## # … with 90 more rows

What other problems can you find in the patients dataset?

Solutions to exercise: tidying and transforming data

Mark Dunning and Matt Eldridge

Last modified: 12 Mar 2019

Part I – tidying data

Part II – selecting columns

Part III - data transformations