---
title: "Supervised learning examples"
author: "Nicholas Horton (nhorton@amherst.edu)"
date: "July 26, 2017"
output:
  html_document:
    fig_height: 5
    fig_width: 7
    toc: true
    toc_depth: 2
  pdf_document:
    fig_height: 5
    fig_width: 7
  word_document:
    fig_height: 3
    fig_width: 5
---

```{r, setup, include=FALSE}
library(mdsr)   # Load additional packages here

# Some customization. You can alter or delete as desired (if you know what you are doing).
trellis.par.set(theme = theme.mosaic())  # change default color scheme for lattice
knitr::opts_chunk$set(
  tidy = FALSE,   # display code as typed
  size = "small") # slightly smaller font for code
```

## Introduction

This example builds on the statistical learning chapter from **Modern Data Science with R**: http://mdsr-book.github.io/.

## Data ingestion and processing

```{r eval=TRUE}
library(mdsr)
census <- read.csv(
  "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
  header = FALSE)
names(census) <- c("age", "workclass", "fnlwgt", "education",
  "education.num", "marital.status", "occupation", "relationship",
  "race", "sex", "capital.gain", "capital.loss", "hours.per.week",
  "native.country", "income")
glimpse(census)
```

```{r}
set.seed(364)
n <- nrow(census)
test_idx <- sample.int(n, size = round(0.2 * n))
train <- census[-test_idx, ]
nrow(train)
test <- census[test_idx, ]
nrow(test)
pi_bar <- tally(~ income, data = train, format = "percent")[2]
pi_bar
tally(~ income, data = train, format = "percent")
```

```{r}
library(rpart)
dtree <- rpart(income ~ capital.gain, data = train)
split_val <- as.data.frame(dtree$splits)$index
dtree_frame <- dtree$frame %>%
  select(var, n, dev, yval) %>%
  mutate(pct = n / nrow(train), right = (n - dev) / n)
rpart(income ~ capital.gain, data = train)
```

```{r}
split <- 5095.5
train <- train %>% mutate(hi_cap_gains = capital.gain >= split)
ggplot(data = train, aes(x = capital.gain, y = income)) +
  geom_count(aes(color = hi_cap_gains),
    position = position_jitter(width = 0, height = 0.1), alpha = 0.5) +
  geom_vline(xintercept = split, color = "dodgerblue", lty = 2) +
  scale_x_log10(labels = scales::dollar)
form <- as.formula("income ~ age + workclass + education + marital.status +
  occupation + relationship + race + sex + capital.gain + capital.loss +
  hours.per.week")
mod_tree <- rpart(form, data = train)
mod_tree
```

```{r}
plot(mod_tree)
text(mod_tree, use.n = TRUE, all = TRUE, cex = 0.7)
```

```{r}
library(partykit)
plot(as.party(mod_tree))
```

```{r}
options(digits = 5)
mod_tree_frame <- mod_tree$frame %>%
  select(var, n, dev, yval) %>%
  mutate(pct = n / nrow(train), right = (n - dev) / n)
```

`r round(100 * mod_tree_frame[7, "right"], 1)`% of those with capital gains below the threshold made less than \$50k.
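
As a sanity check, we can reproduce the accuracy of the single capital gains split by hand. The following sketch is an addition (not from the book) and assumes the `train` data frame and the `split` threshold defined above:

```{r}
# Hand-rolled accuracy of the one-rule classifier: predict " >50K"
# whenever capital gains exceed the rpart split point
rule_pred <- ifelse(train$capital.gain >= split, " >50K", " <=50K")
sum(rule_pred == train$income) / nrow(train)
```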
```{r}
train <- train %>%
  mutate(husband_or_wife = relationship %in% c(" Husband", " Wife"),
         college_degree = husband_or_wife & education %in%
           c(" Bachelors", " Doctorate", " Masters", " Prof-school"),
         income_dtree = predict(mod_tree, type = "class"))
cg_splits <- data.frame(husband_or_wife = c(TRUE, FALSE),
                        vals = c(5095.5, 7073.5))
ggplot(data = train, aes(x = capital.gain, y = income)) +
  geom_count(aes(color = income_dtree, shape = college_degree),
    position = position_jitter(width = 0, height = 0.1), alpha = 0.5) +
  facet_wrap(~ husband_or_wife) +
  geom_vline(data = cg_splits, aes(xintercept = vals),
    color = "dodgerblue", lty = 2) +
  scale_x_log10()
```

```{r}
printcp(mod_tree)
# plotcp(mod_tree)
```

```{r}
train <- train %>%
  mutate(income_dtree = predict(mod_tree, type = "class"))
confusion <- tally(income_dtree ~ income, data = train, format = "count")
confusion
sum(diag(confusion)) / nrow(train)
```

In this case, the accuracy of the decision tree classifier is now `r round(100 * (sum(diag(confusion)) / nrow(train)), 1)`%, a considerable improvement over the null model.

```{r}
mod_tree2 <- rpart(form, data = train, control = rpart.control(cp = 0.002))
# accuracy of the more complex tree on the training data
income_dtree2 <- predict(mod_tree2, type = "class")
sum(income_dtree2 == train$income) / nrow(train)
```

```{r message=FALSE}
library(randomForest)
mod_forest <- randomForest(form, data = train, ntree = 201, mtry = 3)
mod_forest
sum(diag(mod_forest$confusion)) / nrow(train)
```

```{r}
library(tibble)
importance(mod_forest) %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  arrange(desc(MeanDecreaseGini))
```

```{r}
library(class)
# distance metric only works with quantitative variables
train_q <- train %>%
  select(age, education.num, capital.gain, capital.loss, hours.per.week)
income_knn <- knn(train_q, test = train_q, cl = train$income, k = 10)
confusion <- tally(income_knn ~ income, data = train, format = "count")
confusion
sum(diag(confusion)) / nrow(train)
```

```{r}
knn_error_rate <- function(x, y, numNeighbors, z = x) {
  y_hat <- knn(train = x, test = z, cl = y, k = numNeighbors)
  return(sum(y_hat != y) / nrow(x))
}
ks <- c(1:15, 20, 30, 40, 50)
train_rates <- sapply(ks, FUN = knn_error_rate, x = train_q, y = train$income)
knn_error_rates <- data.frame(k = ks, train_rate = train_rates)
ggplot(data = knn_error_rates, aes(x = k, y = train_rate)) +
  geom_point() + geom_line() +
  ylab("Misclassification Rate")
```

```{r}
library(e1071)
mod_nb <- naiveBayes(form, data = train)
income_nb <- predict(mod_nb, newdata = train)
confusion <- tally(income_nb ~ income, data = train, format = "count")
confusion
sum(diag(confusion)) / nrow(train)
```

```{r nnet}
library(nnet)
mod_nn <- nnet(form, data = train, size = 5)
income_nn <- predict(mod_nn, newdata = train, type = "class")
confusion <- tally(income_nn ~ income, data = train, format = "count")
confusion
sum(diag(confusion)) / nrow(train)
```

```{r}
income_ensemble <- ifelse((income_knn == " >50K") +
  (income_nb == " >50K") +
  (income_nn == " >50K") >= 2, " >50K", " <=50K")
confusion <- tally(income_ensemble ~ income, data = train, format = "count")
confusion
sum(diag(confusion)) / nrow(train)
```

```{r}
income_probs <- mod_nb %>%
  predict(newdata = train, type = "raw") %>%
  as.data.frame()
head(income_probs, 3)
```

```{r}
names(income_probs)
tally(~ ` >50K` > 0.5, data = income_probs, format = "percent")
tally(~ ` >50K` > 0.24, data = income_probs, format = "percent")
```

```{r}
pred <- ROCR::prediction(income_probs[, 2], train$income)
perf <- ROCR::performance(pred, 'tpr', 'fpr')
class(perf)
# can also plot(perf)
```
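
ROCR can also report the area under the ROC curve as a single summary. A minimal sketch (not in the original), reusing the `pred` object just created:

```{r}
# area under the ROC curve for the naive Bayes classifier
auc <- ROCR::performance(pred, "auc")
auc@y.values[[1]]
```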
```{r}
perf_df <- data.frame(perf@x.values, perf@y.values)
names(perf_df) <- c("fpr", "tpr")
roc <- ggplot(data = perf_df, aes(x = fpr, y = tpr)) +
  geom_line(color = "blue") +
  geom_abline(intercept = 0, slope = 1, lty = 3) +
  ylab(perf@y.name) + xlab(perf@x.name)
```

```{r}
confusion <- tally(income_nb ~ income, data = train, format = "count")
confusion
sum(diag(confusion)) / nrow(train)
tpr <- confusion[" >50K", " >50K"] / sum(confusion[, " >50K"])
fpr <- confusion[" >50K", " <=50K"] / sum(confusion[, " <=50K"])
roc + geom_point(x = fpr, y = tpr, size = 3)
```

```{r message=FALSE, warning=TRUE}
test_q <- test %>%
  select(age, education.num, capital.gain, capital.loss, hours.per.week)
test_rates <- sapply(ks, FUN = knn_error_rate,
  x = train_q, y = train$income, z = test_q)
knn_error_rates <- knn_error_rates %>% mutate(test_rate = test_rates)
library(tidyr)
knn_error_rates_tidy <- knn_error_rates %>%
  gather(key = "type", value = "error_rate", -k)
ggplot(data = knn_error_rates_tidy, aes(x = k, y = error_rate)) +
  geom_point(aes(color = type)) + geom_line(aes(color = type)) +
  ylab("Misclassification Rate")
```

```{r}
favstats(~ capital.gain, data = train)
favstats(~ capital.gain, data = test)
mod_null <- glm(income ~ 1, data = train, family = binomial)
mods <- list(mod_null, mod_tree, mod_forest, mod_nn, mod_nb)
lapply(mods, class)
predict_methods <- methods("predict")
predict_methods[grepl(pattern = "(glm|rpart|randomForest|nnet|naive)",
  predict_methods)]
```

```{r}
predictions_train <- data.frame(
  y = as.character(train$income), type = "train",
  mod_null = predict(mod_null, type = "response"),
  mod_tree = predict(mod_tree, type = "class"),
  mod_forest = predict(mod_forest, type = "class"),
  mod_nn = predict(mod_nn, type = "class"),
  mod_nb = predict(mod_nb, newdata = train, type = "class"))
predictions_test <- data.frame(
  y = as.character(test$income), type = "test",
  mod_null = predict(mod_null, newdata = test, type = "response"),
  mod_tree = predict(mod_tree, newdata = test, type = "class"),
  mod_forest = predict(mod_forest, newdata = test, type = "class"),
  mod_nn = predict(mod_nn, newdata = test, type = "class"),
  mod_nb = predict(mod_nb, newdata = test, type = "class"))
predictions <- bind_rows(predictions_train, predictions_test)
glimpse(predictions)
```

```{r}
predictions_tidy <- predictions %>%
  mutate(mod_null = ifelse(mod_null < 0.5, " <=50K", " >50K")) %>%
  gather(key = "model", value = "y_hat", -type, -y)
glimpse(predictions_tidy)
```

```{r}
predictions_summary <- predictions_tidy %>%
  group_by(model, type) %>%
  summarize(N = n(), correct = sum(y == y_hat),
            positives = sum(y == " >50K"),
            true_pos = sum(y_hat == " >50K" & y == y_hat),
            false_pos = sum(y_hat == " >50K" & y != y_hat)) %>%
  mutate(accuracy = correct / N,
         tpr = true_pos / positives,
         fpr = false_pos / (N - positives)) %>%
  ungroup() %>%
  gather(val_type, val, -model, -type) %>%
  unite(temp1, type, val_type, sep = "_") %>%  # glue variables
  spread(temp1, val) %>%
  arrange(desc(test_accuracy)) %>%
  select(model, train_accuracy, test_accuracy, test_tpr, test_fpr)
predictions_summary
```

```{r}
outputs <- c("response", "prob", "prob", "raw", "raw")
roc_test <- mapply(predict, mods, type = outputs,
    MoreArgs = list(newdata = test)) %>%
  as.data.frame() %>%
  select(1, 3, 5, 6, 8)
names(roc_test) <- c("mod_null", "mod_tree", "mod_forest", "mod_nn", "mod_nb")
glimpse(roc_test)
get_roc <- function(x, y) {
  pred <- ROCR::prediction(x$y_hat, y)
  perf <- ROCR::performance(pred, 'tpr', 'fpr')
  perf_df <- data.frame(perf@x.values, perf@y.values)
  names(perf_df) <- c("fpr", "tpr")
  return(perf_df)
}
```
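
Before mapping `get_roc()` over all five models, a quick spot check on a single column confirms the shape of its output. This sketch (an addition, not from the book) wraps the naive Bayes probabilities in the one-column data frame the helper expects:

```{r}
# spot check: ROC coordinates for the naive Bayes model alone
nb_roc <- get_roc(data.frame(y_hat = roc_test$mod_nb), y = test$income)
head(nb_roc, 3)
```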
```{r}
roc_tidy <- roc_test %>%
  gather(key = "model", value = "y_hat") %>%
  group_by(model) %>%
  dplyr::do(get_roc(., y = test$income))
```

```{r}
ggplot(data = roc_tidy, aes(x = fpr, y = tpr)) +
  geom_line(aes(color = model)) +
  geom_abline(intercept = 0, slope = 1, lty = 3) +
  ylab(perf@y.name) + xlab(perf@x.name) +
  geom_point(data = predictions_summary, size = 3,
    aes(x = test_fpr, y = test_tpr, color = model))
```
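
Each curve above can be reduced to a single number by approximating the area beneath it. This rough sketch (not part of the original analysis) assumes the points in `roc_tidy` trace each curve in order of increasing `fpr`:

```{r}
# approximate AUC for each model via trapezoidal integration of its ROC curve
roc_tidy %>%
  arrange(model, fpr) %>%
  group_by(model) %>%
  summarize(auc = sum(diff(fpr) * (head(tpr, -1) + tail(tpr, -1)) / 2)) %>%
  arrange(desc(auc))
```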