Part I – Workflows using pipes

  1. Read in the patients dataset and rewrite the following cleaning steps as a workflow using the %>% operator.
# Step-by-step cleaning of the patients data; each statement overwrites
# `patients` with the result of one transformation.
library(tidyverse)
# Read the tab-separated patient file into a tibble.
patients <- read_tsv("patient-data.txt")
# Recode Smokes as logical: TRUE when the raw value is "TRUE" or "Yes",
# FALSE for anything else (%in% never yields NA).
patients <- mutate(patients, Smokes = Smokes %in% c("TRUE", "Yes"))
# Drop the trailing "cm" unit from Height and convert to numeric.
patients <- mutate(patients, Height = as.numeric(str_remove(Height, pattern = "cm$")))
# Drop the trailing "kg" unit from Weight and convert to numeric.
patients <- mutate(patients, Weight = as.numeric(str_remove(Weight, pattern = "kg$")))
# BMI = weight / height^2, with Height divided by 100 first
# (cm -> m, going by the unit suffixes stripped above).
patients <- mutate(patients, BMI = Weight / (Height / 100) ** 2)
# Flag patients whose BMI exceeds 25.
patients <- mutate(patients, Overweight = BMI > 25)
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0       ✔ purrr   0.3.1  
## ✔ tibble  2.0.1       ✔ dplyr   0.8.0.1
## ✔ tidyr   0.8.3       ✔ stringr 1.4.0  
## ✔ readr   1.3.1       ✔ forcats 0.4.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
# Read the raw file and clean it in one piped workflow.
patients <- read_tsv("patient-data.txt") %>%
  # Cleaning step: recode Smokes as logical and strip the unit suffixes
  # from Height and Weight before converting them to numbers.
  mutate(
    Smokes = Smokes %in% c("TRUE", "Yes"),
    Height = as.numeric(str_remove(Height, pattern = "cm$")),
    Weight = as.numeric(str_remove(Weight, pattern = "kg$"))
  ) %>%
  # Derived columns: BMI from the cleaned measurements, then the
  # Overweight flag from BMI (mutate evaluates its arguments in order).
  mutate(
    BMI = Weight / (Height / 100)^2,
    Overweight = BMI > 25
  )
## Parsed with column specification:
## cols(
##   ID = col_character(),
##   Name = col_character(),
##   Sex = col_character(),
##   Smokes = col_character(),
##   Height = col_character(),
##   Weight = col_character(),
##   Birth = col_date(format = ""),
##   State = col_character(),
##   Grade_Level = col_double(),
##   Died = col_logical(),
##   Count = col_double(),
##   Date.Entered.Study = col_date(format = "")
## )
# Display the cleaned tibble.
print(patients)
## # A tibble: 100 x 14
##    ID    Name  Sex   Smokes Height Weight Birth      State Grade_Level
##    <chr> <chr> <chr> <lgl>   <dbl>  <dbl> <date>     <chr>       <dbl>
##  1 AC/A… Mich… Male  FALSE    183.   76.6 1972-02-06 Geor…           2
##  2 AC/A… Derek Male  FALSE    179.   80.4 1972-06-15 Colo…           2
##  3 AC/A… Todd  Male  FALSE    169.   75.5 1972-07-09 New …           2
##  4 AC/A… Rona… Male  FALSE    176.   94.5 1972-08-17 Colo…           1
##  5 AC/A… Chri… Fema… FALSE    164.   71.8 1973-06-12 Geor…           2
##  6 AC/A… Dana  Fema… TRUE     158.   69.9 1973-07-01 Indi…           2
##  7 AC/A… Erin  Fema… FALSE    162.   68.8 1972-03-26 New …           1
##  8 AC/A… Rach… Fema… FALSE    166.   70.4 1973-05-11 Colo…           1
##  9 AC/A… Rona… Male  FALSE    181.   76.9 1971-12-31 Geor…           1
## 10 AC/A… Bryan Male  FALSE    167.   79.1 1973-07-19 New …           2
## # … with 90 more rows, and 5 more variables: Died <lgl>, Count <dbl>,
## #   Date.Entered.Study <date>, BMI <dbl>, Overweight <lgl>
  2. Add a step to the workflow to round the Height, Weight and BMI to 1 decimal place.
# Read the raw file, clean it, and round the numeric measurements.
patients <- read_tsv("patient-data.txt") %>%
  # Cleaning step: recode Smokes as logical and strip the unit suffixes
  # from Height and Weight before converting them to numbers.
  mutate(
    Smokes = Smokes %in% c("TRUE", "Yes"),
    Height = as.numeric(str_remove(Height, pattern = "cm$")),
    Weight = as.numeric(str_remove(Weight, pattern = "kg$"))
  ) %>%
  # Derived columns are computed from the unrounded measurements.
  mutate(
    BMI = Weight / (Height / 100)^2,
    Overweight = BMI > 25
  ) %>%
  # Final step: round the three numeric columns to 1 decimal place.
  mutate(
    Height = round(Height, digits = 1),
    Weight = round(Weight, digits = 1),
    BMI = round(BMI, digits = 1)
  )
## Parsed with column specification:
## cols(
##   ID = col_character(),
##   Name = col_character(),
##   Sex = col_character(),
##   Smokes = col_character(),
##   Height = col_character(),
##   Weight = col_character(),
##   Birth = col_date(format = ""),
##   State = col_character(),
##   Grade_Level = col_double(),
##   Died = col_logical(),
##   Count = col_double(),
##   Date.Entered.Study = col_date(format = "")
## )
# Display the cleaned and rounded tibble.
print(patients)
## # A tibble: 100 x 14
##    ID    Name  Sex   Smokes Height Weight Birth      State Grade_Level
##    <chr> <chr> <chr> <lgl>   <dbl>  <dbl> <date>     <chr>       <dbl>
##  1 AC/A… Mich… Male  FALSE    183.   76.6 1972-02-06 Geor…           2
##  2 AC/A… Derek Male  FALSE    179.   80.4 1972-06-15 Colo…           2
##  3 AC/A… Todd  Male  FALSE    169.   75.5 1972-07-09 New …           2
##  4 AC/A… Rona… Male  FALSE    176.   94.5 1972-08-17 Colo…           1
##  5 AC/A… Chri… Fema… FALSE    164.   71.8 1973-06-12 Geor…           2
##  6 AC/A… Dana  Fema… TRUE     158.   69.9 1973-07-01 Indi…           2
##  7 AC/A… Erin  Fema… FALSE    162.   68.8 1972-03-26 New …           1
##  8 AC/A… Rach… Fema… FALSE    166.   70.4 1973-05-11 Colo…           1
##  9 AC/A… Rona… Male  FALSE    181.   76.9 1971-12-31 Geor…           1
## 10 AC/A… Bryan Male  FALSE    167.   79.1 1973-07-19 New …           2
## # … with 90 more rows, and 5 more variables: Died <lgl>, Count <dbl>,
## #   Date.Entered.Study <date>, BMI <dbl>, Overweight <lgl>

Part II - Filtering rows

  1. Filter for female patients from New York or New Jersey.
# Female patients from New York or New Jersey, written as an explicit
# OR of two equality tests on State.
patients %>%
  filter(Sex == "Female", State == "New York" | State == "New Jersey")
## # A tibble: 20 x 14
##    ID    Name  Sex   Smokes Height Weight Birth      State Grade_Level
##    <chr> <chr> <chr> <lgl>   <dbl>  <dbl> <date>     <chr>       <dbl>
##  1 AC/A… Erin  Fema… FALSE    162.   68.8 1972-03-26 New …           1
##  2 AC/A… Pame… Fema… FALSE    166.   67.3 1971-11-14 New …           1
##  3 AC/A… Eliz… Fema… TRUE     164.   65.8 1972-01-12 New …           3
##  4 AC/A… Paula Fema… FALSE    161.   63.5 1973-07-02 New …          99
##  5 AC/A… Julie Fema… FALSE    160    64.3 1973-09-05 New …           1
##  6 AC/A… Kris… Fema… FALSE    160.   71.9 1973-09-28 New …           2
##  7 AC/A… Tiff… Fema… FALSE    161.   64.8 1973-02-24 New …           1
##  8 AC/A… Bran… Fema… FALSE    160.   68   1972-12-08 New …           2
##  9 AC/A… Alic… Fema… TRUE     170    66.7 1972-09-03 New …          99
## 10 AC/A… Tanya Fema… TRUE     165.   73   1972-01-31 New …           1
## 11 AC/A… Wendy Fema… FALSE    164    66.7 1971-12-29 New …           3
## 12 AC/A… Barb… Fema… FALSE    160.   65.9 1973-01-31 New …          99
## 13 AC/A… Holly Fema… TRUE     160.   68.3 1972-04-29 New …           2
## 14 AC/S… Barb… Fema… TRUE     162.   66.5 1972-02-21 New …           3
## 15 AC/S… Carr… Fema… FALSE    164.   71.5 1973-03-18 New …           3
## 16 AC/S… Laura Fema… FALSE    158.   69.7 1972-06-08 New …           3
## 17 AC/S… Holly Fema… FALSE    159.   70.7 1973-06-27 New …           3
## 18 AC/S… Jill  Fema… FALSE    159.   66.2 1972-11-14 New …           1
## 19 AC/S… Rhon… Fema… FALSE    164.   70.7 1972-06-23 New …           3
## 20 AC/S… Shar… Fema… FALSE    161.   71.5 1972-11-17 New …           1
## # … with 5 more variables: Died <lgl>, Count <dbl>,
## #   Date.Entered.Study <date>, BMI <dbl>, Overweight <lgl>
# Same selection, written as a set-membership test on State.
patients %>%
  filter(Sex == "Female", State %in% c("New York", "New Jersey"))
## # A tibble: 20 x 14
##    ID    Name  Sex   Smokes Height Weight Birth      State Grade_Level
##    <chr> <chr> <chr> <lgl>   <dbl>  <dbl> <date>     <chr>       <dbl>
##  1 AC/A… Erin  Fema… FALSE    162.   68.8 1972-03-26 New …           1
##  2 AC/A… Pame… Fema… FALSE    166.   67.3 1971-11-14 New …           1
##  3 AC/A… Eliz… Fema… TRUE     164.   65.8 1972-01-12 New …           3
##  4 AC/A… Paula Fema… FALSE    161.   63.5 1973-07-02 New …          99
##  5 AC/A… Julie Fema… FALSE    160    64.3 1973-09-05 New …           1
##  6 AC/A… Kris… Fema… FALSE    160.   71.9 1973-09-28 New …           2
##  7 AC/A… Tiff… Fema… FALSE    161.   64.8 1973-02-24 New …           1
##  8 AC/A… Bran… Fema… FALSE    160.   68   1972-12-08 New …           2
##  9 AC/A… Alic… Fema… TRUE     170    66.7 1972-09-03 New …          99
## 10 AC/A… Tanya Fema… TRUE     165.   73   1972-01-31 New …           1
## 11 AC/A… Wendy Fema… FALSE    164    66.7 1971-12-29 New …           3
## 12 AC/A… Barb… Fema… FALSE    160.   65.9 1973-01-31 New …          99
## 13 AC/A… Holly Fema… TRUE     160.   68.3 1972-04-29 New …           2
## 14 AC/S… Barb… Fema… TRUE     162.   66.5 1972-02-21 New …           3
## 15 AC/S… Carr… Fema… FALSE    164.   71.5 1973-03-18 New …           3
## 16 AC/S… Laura Fema… FALSE    158.   69.7 1972-06-08 New …           3
## 17 AC/S… Holly Fema… FALSE    159.   70.7 1973-06-27 New …           3
## 18 AC/S… Jill  Fema… FALSE    159.   66.2 1972-11-14 New …           1
## 19 AC/S… Rhon… Fema… FALSE    164.   70.7 1972-06-23 New …           3
## 20 AC/S… Shar… Fema… FALSE    161.   71.5 1972-11-17 New …           1
## # … with 5 more variables: Died <lgl>, Count <dbl>,
## #   Date.Entered.Study <date>, BMI <dbl>, Overweight <lgl>
# Pattern-matching variant: keep female patients whose State starts with
# "New York" or "New Jersey". A bare "New " prefix would also match other
# states such as New Mexico or New Hampshire, so the pattern names both.
filter(patients, Sex == "Female", str_starts(State, "New (York|Jersey)"))
## # A tibble: 20 x 14
##    ID    Name  Sex   Smokes Height Weight Birth      State Grade_Level
##    <chr> <chr> <chr> <lgl>   <dbl>  <dbl> <date>     <chr>       <dbl>
##  1 AC/A… Erin  Fema… FALSE    162.   68.8 1972-03-26 New …           1
##  2 AC/A… Pame… Fema… FALSE    166.   67.3 1971-11-14 New …           1
##  3 AC/A… Eliz… Fema… TRUE     164.   65.8 1972-01-12 New …           3
##  4 AC/A… Paula Fema… FALSE    161.   63.5 1973-07-02 New …          99
##  5 AC/A… Julie Fema… FALSE    160    64.3 1973-09-05 New …           1
##  6 AC/A… Kris… Fema… FALSE    160.   71.9 1973-09-28 New …           2
##  7 AC/A… Tiff… Fema… FALSE    161.   64.8 1973-02-24 New …           1
##  8 AC/A… Bran… Fema… FALSE    160.   68   1972-12-08 New …           2
##  9 AC/A… Alic… Fema… TRUE     170    66.7 1972-09-03 New …          99
## 10 AC/A… Tanya Fema… TRUE     165.   73   1972-01-31 New …           1
## 11 AC/A… Wendy Fema… FALSE    164    66.7 1971-12-29 New …           3
## 12 AC/A… Barb… Fema… FALSE    160.   65.9 1973-01-31 New …          99
## 13 AC/A… Holly Fema… TRUE     160.   68.3 1972-04-29 New …           2
## 14 AC/S… Barb… Fema… TRUE     162.   66.5 1972-02-21 New …           3
## 15 AC/S… Carr… Fema… FALSE    164.   71.5 1973-03-18 New …           3
## 16 AC/S… Laura Fema… FALSE    158.   69.7 1972-06-08 New …           3
## 17 AC/S… Holly Fema… FALSE    159.   70.7 1973-06-27 New …           3
## 18 AC/S… Jill  Fema… FALSE    159.   66.2 1972-11-14 New …           1
## 19 AC/S… Rhon… Fema… FALSE    164.   70.7 1972-06-23 New …           3
## 20 AC/S… Shar… Fema… FALSE    161.   71.5 1972-11-17 New …           1
## # … with 5 more variables: Died <lgl>, Count <dbl>,
## #   Date.Entered.Study <date>, BMI <dbl>, Overweight <lgl>
  2. Filter for overweight smokers who are still alive.
# Keep patients flagged as overweight who smoke and are not marked as died.
patients %>%
  filter(Overweight, Smokes, !Died)
## # A tibble: 6 x 14
##   ID    Name  Sex   Smokes Height Weight Birth      State Grade_Level Died 
##   <chr> <chr> <chr> <lgl>   <dbl>  <dbl> <date>     <chr>       <dbl> <lgl>
## 1 AC/A… Dana  Fema… TRUE     158.   69.9 1973-07-01 Indi…           2 FALSE
## 2 AC/A… Kenn… Male  TRUE     175.   92.2 1972-03-22 Colo…           3 FALSE
## 3 AC/A… Stacy Fema… TRUE     165.   75.7 1971-11-22 Colo…           1 FALSE
## 4 AC/A… Tanya Fema… TRUE     165.   73   1972-01-31 New …           1 FALSE
## 5 AC/S… Stacy Fema… TRUE     162.   67.9 1973-07-09 indi…           3 FALSE
## 6 AC/S… Barb… Fema… TRUE     162.   66.5 1972-02-21 New …           3 FALSE
## # … with 4 more variables: Count <dbl>, Date.Entered.Study <date>,
## #   BMI <dbl>, Overweight <lgl>