clinical-data.txt
.library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.3.1
## ✔ tibble 2.0.1 ✔ dplyr 0.8.0.1
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
clinical_data <- read_tsv("clinical-data.txt")
## Parsed with column specification:
## cols(
## Subject = col_character(),
## Placebo.1 = col_double(),
## Placebo.2 = col_double(),
## Drug1.1 = col_double(),
## Drug1.2 = col_double(),
## Drug2.1 = col_double(),
## Drug2.2 = col_double()
## )
Take a look at the data and answer the following questions:
gather
function from the tidyr
package.clinical_data <- gather(clinical_data, key = "Treatment", value = "Value", -Subject)
ggplot(clinical_data, mapping = aes(x = Treatment, y = Value)) +
geom_boxplot()
patients <- read_tsv("patient-data.txt")
## Parsed with column specification:
## cols(
## ID = col_character(),
## Name = col_character(),
## Sex = col_character(),
## Smokes = col_character(),
## Height = col_character(),
## Weight = col_character(),
## Birth = col_date(format = ""),
## State = col_character(),
## Grade_Level = col_double(),
## Died = col_logical(),
## Count = col_double(),
## Date.Entered.Study = col_date(format = "")
## )
Height
and Grade_Level
select(patients, Height:Grade_Level)
## # A tibble: 100 x 5
## Height Weight Birth State Grade_Level
## <chr> <chr> <date> <chr> <dbl>
## 1 182.87cm 76.57kg 1972-02-06 Georgia 2
## 2 179.12cm 80.43kg 1972-06-15 Colorado 2
## 3 169.15cm 75.48kg 1972-07-09 New Jersey 2
## 4 175.66cm 94.54kg 1972-08-17 Colorado 1
## 5 164.47cm 71.78kg 1973-06-12 Georgia 2
## 6 158.27cm 69.9kg 1973-07-01 Indiana 2
## 7 161.69cm 68.85kg 1972-03-26 New York 1
## 8 165.84cm 70.44kg 1973-05-11 Colorado 1
## 9 181.32cm 76.9kg 1971-12-31 Georgia 1
## 10 167.37cm 79.06kg 1973-07-19 New Jersey 2
## # … with 90 more rows
Height
and Grade_Level
, but not State
select(patients, Height:Grade_Level, -State)
## # A tibble: 100 x 4
## Height Weight Birth Grade_Level
## <chr> <chr> <date> <dbl>
## 1 182.87cm 76.57kg 1972-02-06 2
## 2 179.12cm 80.43kg 1972-06-15 2
## 3 169.15cm 75.48kg 1972-07-09 2
## 4 175.66cm 94.54kg 1972-08-17 1
## 5 164.47cm 71.78kg 1973-06-12 2
## 6 158.27cm 69.9kg 1973-07-01 2
## 7 161.69cm 68.85kg 1972-03-26 1
## 8 165.84cm 70.44kg 1973-05-11 1
## 9 181.32cm 76.9kg 1971-12-31 1
## 10 167.37cm 79.06kg 1973-07-19 2
## # … with 90 more rows
Height
and Weight
select(patients, ends_with("eight"))
## # A tibble: 100 x 2
## Height Weight
## <chr> <chr>
## 1 182.87cm 76.57kg
## 2 179.12cm 80.43kg
## 3 169.15cm 75.48kg
## 4 175.66cm 94.54kg
## 5 164.47cm 71.78kg
## 6 158.27cm 69.9kg
## 7 161.69cm 68.85kg
## 8 165.84cm 70.44kg
## 9 181.32cm 76.9kg
## 10 167.37cm 79.06kg
## # … with 90 more rows
Birth
, State
and Grade_Level
select(patients, -Birth, -State, -Grade_Level)
## # A tibble: 100 x 9
## ID Name Sex Smokes Height Weight Died Count Date.Entered.St…
## <chr> <chr> <chr> <chr> <chr> <chr> <lgl> <dbl> <date>
## 1 AC/AH/… Michael Male FALSE 182.8… 76.57… FALSE 0.01 2015-12-01
## 2 AC/AH/… Derek Male FALSE 179.1… 80.43… FALSE -1.31 NA
## 3 AC/AH/… Todd Male FALSE 169.1… 75.48… FALSE -0.17 NA
## 4 AC/AH/… Ronald Male FALSE 175.6… 94.54… FALSE -1.1 NA
## 5 AC/AH/… Christ… Fema… FALSE 164.4… 71.78… TRUE 1.42 NA
## 6 AC/AH/… Dana Fema… TRUE 158.2… 69.9kg FALSE 0.290 NA
## 7 AC/AH/… Erin Fema… FALSE 161.6… 68.85… FALSE 0.16 NA
## 8 AC/AH/… Rachel Fema… No 165.8… 70.44… FALSE -0.07 NA
## 9 AC/AH/… Ronald Male FALSE 181.3… 76.9kg FALSE -1.43 NA
## 10 AC/AH/… Bryan Male FALSE 167.3… 79.06… FALSE 0.54 2015-12-31
## # … with 90 more rows
select(patients, -c(Birth, State, Grade_Level))
## # A tibble: 100 x 9
## ID Name Sex Smokes Height Weight Died Count Date.Entered.St…
## <chr> <chr> <chr> <chr> <chr> <chr> <lgl> <dbl> <date>
## 1 AC/AH/… Michael Male FALSE 182.8… 76.57… FALSE 0.01 2015-12-01
## 2 AC/AH/… Derek Male FALSE 179.1… 80.43… FALSE -1.31 NA
## 3 AC/AH/… Todd Male FALSE 169.1… 75.48… FALSE -0.17 NA
## 4 AC/AH/… Ronald Male FALSE 175.6… 94.54… FALSE -1.1 NA
## 5 AC/AH/… Christ… Fema… FALSE 164.4… 71.78… TRUE 1.42 NA
## 6 AC/AH/… Dana Fema… TRUE 158.2… 69.9kg FALSE 0.290 NA
## 7 AC/AH/… Erin Fema… FALSE 161.6… 68.85… FALSE 0.16 NA
## 8 AC/AH/… Rachel Fema… No 165.8… 70.44… FALSE -0.07 NA
## 9 AC/AH/… Ronald Male FALSE 181.3… 76.9kg FALSE -1.43 NA
## 10 AC/AH/… Bryan Male FALSE 167.3… 79.06… FALSE 0.54 2015-12-31
## # … with 90 more rows
select(patients, -(Birth:Grade_Level))
## # A tibble: 100 x 9
## ID Name Sex Smokes Height Weight Died Count Date.Entered.St…
## <chr> <chr> <chr> <chr> <chr> <chr> <lgl> <dbl> <date>
## 1 AC/AH/… Michael Male FALSE 182.8… 76.57… FALSE 0.01 2015-12-01
## 2 AC/AH/… Derek Male FALSE 179.1… 80.43… FALSE -1.31 NA
## 3 AC/AH/… Todd Male FALSE 169.1… 75.48… FALSE -0.17 NA
## 4 AC/AH/… Ronald Male FALSE 175.6… 94.54… FALSE -1.1 NA
## 5 AC/AH/… Christ… Fema… FALSE 164.4… 71.78… TRUE 1.42 NA
## 6 AC/AH/… Dana Fema… TRUE 158.2… 69.9kg FALSE 0.290 NA
## 7 AC/AH/… Erin Fema… FALSE 161.6… 68.85… FALSE 0.16 NA
## 8 AC/AH/… Rachel Fema… No 165.8… 70.44… FALSE -0.07 NA
## 9 AC/AH/… Ronald Male FALSE 181.3… 76.9kg FALSE -1.43 NA
## 10 AC/AH/… Bryan Male FALSE 167.3… 79.06… FALSE 0.54 2015-12-31
## # … with 90 more rows
The study that collected the patient data is interested in the relationship between body mass index (BMI) and smoking.
Which the relevant variables (columns) in our patients dataset and what problems can you see with these columns as they currently stand?
Height
and Weight
columns to remove the ‘cm’ and ‘kg’ units respectively.patients <- mutate(patients, Height = as.numeric(str_remove(Height, pattern = "cm$")))
patients <- mutate(patients, Weight = as.numeric(str_remove(Weight, pattern = "kg$")))
patients
## # A tibble: 100 x 12
## ID Name Sex Smokes Height Weight Birth State Grade_Level
## <chr> <chr> <chr> <chr> <dbl> <dbl> <date> <chr> <dbl>
## 1 AC/A… Mich… Male FALSE 183. 76.6 1972-02-06 Geor… 2
## 2 AC/A… Derek Male FALSE 179. 80.4 1972-06-15 Colo… 2
## 3 AC/A… Todd Male FALSE 169. 75.5 1972-07-09 New … 2
## 4 AC/A… Rona… Male FALSE 176. 94.5 1972-08-17 Colo… 1
## 5 AC/A… Chri… Fema… FALSE 164. 71.8 1973-06-12 Geor… 2
## 6 AC/A… Dana Fema… TRUE 158. 69.9 1973-07-01 Indi… 2
## 7 AC/A… Erin Fema… FALSE 162. 68.8 1972-03-26 New … 1
## 8 AC/A… Rach… Fema… No 166. 70.4 1973-05-11 Colo… 1
## 9 AC/A… Rona… Male FALSE 181. 76.9 1971-12-31 Geor… 1
## 10 AC/A… Bryan Male FALSE 167. 79.1 1973-07-19 New … 2
## # … with 90 more rows, and 3 more variables: Died <lgl>, Count <dbl>,
## # Date.Entered.Study <date>
Smokes
column so that it contains just TRUE
and FALSE
values.Hint: use the %in%
operator, e.g. c("Matt", "George") %in% c("John", "Paul", "George", "Ringo")
unique(patients$Smokes)
## [1] "FALSE" "TRUE" "No" "Yes"
patients <- mutate(patients, Smokes = Smokes %in% c("TRUE", "Yes"))
patients
## # A tibble: 100 x 12
## ID Name Sex Smokes Height Weight Birth State Grade_Level
## <chr> <chr> <chr> <lgl> <dbl> <dbl> <date> <chr> <dbl>
## 1 AC/A… Mich… Male FALSE 183. 76.6 1972-02-06 Geor… 2
## 2 AC/A… Derek Male FALSE 179. 80.4 1972-06-15 Colo… 2
## 3 AC/A… Todd Male FALSE 169. 75.5 1972-07-09 New … 2
## 4 AC/A… Rona… Male FALSE 176. 94.5 1972-08-17 Colo… 1
## 5 AC/A… Chri… Fema… FALSE 164. 71.8 1973-06-12 Geor… 2
## 6 AC/A… Dana Fema… TRUE 158. 69.9 1973-07-01 Indi… 2
## 7 AC/A… Erin Fema… FALSE 162. 68.8 1972-03-26 New … 1
## 8 AC/A… Rach… Fema… FALSE 166. 70.4 1973-05-11 Colo… 1
## 9 AC/A… Rona… Male FALSE 181. 76.9 1971-12-31 Geor… 1
## 10 AC/A… Bryan Male FALSE 167. 79.1 1973-07-19 New … 2
## # … with 90 more rows, and 3 more variables: Died <lgl>, Count <dbl>,
## # Date.Entered.Study <date>
patients <- mutate(patients, BMI = Weight / (Height / 100) ** 2)
select(patients, ID, Smokes, Height, Weight, BMI)
## # A tibble: 100 x 5
## ID Smokes Height Weight BMI
## <chr> <lgl> <dbl> <dbl> <dbl>
## 1 AC/AH/001 FALSE 183. 76.6 22.9
## 2 AC/AH/017 FALSE 179. 80.4 25.1
## 3 AC/AH/020 FALSE 169. 75.5 26.4
## 4 AC/AH/022 FALSE 176. 94.5 30.6
## 5 AC/AH/029 FALSE 164. 71.8 26.5
## 6 AC/AH/033 TRUE 158. 69.9 27.9
## 7 AC/AH/037 FALSE 162. 68.8 26.3
## 8 AC/AH/044 FALSE 166. 70.4 25.6
## 9 AC/AH/045 FALSE 181. 76.9 23.4
## 10 AC/AH/048 FALSE 167. 79.1 28.2
## # … with 90 more rows
patients <- mutate(patients, Overweight = BMI > 25)
select(patients, ID, Smokes, Height, Weight, BMI, Overweight)
## # A tibble: 100 x 6
## ID Smokes Height Weight BMI Overweight
## <chr> <lgl> <dbl> <dbl> <dbl> <lgl>
## 1 AC/AH/001 FALSE 183. 76.6 22.9 FALSE
## 2 AC/AH/017 FALSE 179. 80.4 25.1 TRUE
## 3 AC/AH/020 FALSE 169. 75.5 26.4 TRUE
## 4 AC/AH/022 FALSE 176. 94.5 30.6 TRUE
## 5 AC/AH/029 FALSE 164. 71.8 26.5 TRUE
## 6 AC/AH/033 TRUE 158. 69.9 27.9 TRUE
## 7 AC/AH/037 FALSE 162. 68.8 26.3 TRUE
## 8 AC/AH/044 FALSE 166. 70.4 25.6 TRUE
## 9 AC/AH/045 FALSE 181. 76.9 23.4 FALSE
## 10 AC/AH/048 FALSE 167. 79.1 28.2 TRUE
## # … with 90 more rows
What other problems can you find in the patients dataset?