Code header

# Course: ECON 5300L Applied Econometric
# Title: EDA of Mortgage Loan Decisions 
# Purpose: Explore Mortgage Loan Decisions outcome
# Data: MLD Data file.csv
# Date: Mar 26, 2018
# Author: Afsar Ali

Exploratory Data Analysis (EDA)

rm(list=ls(all=TRUE))  # Clear all data in environment

#load Packages 
library(tidyverse)
library(GGally)
library(plotly)
library(car)
library(ggplot2)
library(stargazer)
library(sandwich)
library(lmtest)   # waldtest; see also coeftest.
library(psych)
library(aod)
library(Rcpp)
library(Hmisc)
library(pastecs)
library(popbio)
library(rms)
library(Hmisc)
library(kableExtra)

# Load data 
datd <- read.csv("MLD Data File.csv", header = TRUE)
#Review selected data
glimpse(datd)
## Observations: 1,989
## Variables: 8
## $ MARRIED <fct> 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ GDLIN   <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, ...
## $ OBRAT   <dbl> 34.5, 34.1, 26.0, 37.0, 32.1, 33.0, 36.0, 37.0, 30.7, ...
## $ BLACK   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...
## $ HISPAN  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ MALE    <fct> ., 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ APPROVE <int> 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, ...
## $ LOANPRC <dbl> 0.7542373, 0.8000000, 0.8951049, 0.6000000, 0.8955224,...
kable(summary(datd))
MARRIED
 GDLIN </th>
 OBRAT </th>
 BLACK </th>
 HISPAN </th>
MALE
APPROVE </th>
LOANPRC </th>
.: 3 Min. : 0.000 Min. : 0.00 Min. :0.00000 Min. :0.00000 .: 15 Min. :0.0000 Min. :0.02105
0: 678 1st Qu.: 1.000 1st Qu.:28.00 1st Qu.:0.00000 1st Qu.:0.00000 0: 369 1st Qu.:1.0000 1st Qu.:0.70000
1:1308 Median : 1.000 Median :33.00 Median :0.00000 Median :0.00000 1:1605 Median :1.0000 Median :0.80000
NA Mean : 1.583 Mean :32.39 Mean :0.09904 Mean :0.05581 NA Mean :0.8773 Mean :0.77064
NA 3rd Qu.: 1.000 3rd Qu.:37.00 3rd Qu.:0.00000 3rd Qu.:0.00000 NA 3rd Qu.:1.0000 3rd Qu.:0.89894
NA Max. :666.000 Max. :95.00 Max. :1.00000 Max. :1.00000 NA Max. :1.0000 Max. :2.57143

Data Discription

In light of the relatively small number of mortgage loan applications made by minorities, these extra variables were collected for all applications by blacks and Hispanics and for a random sample. The data set includes the following variables.

  • APPROVE = 1 if mortgage loan was approved, = 0 otherwise
  • GDLIN = 1 if credit history meets guidelines, = 0 otherwise
  • LOANPRC = loan amount/purchase price
  • OBRAT = other obligations as a percent of total income
  • MALE = 1 if male, = 0 otherwise
  • MARRIED = 1 if married, = 0 otherwise
  • BLACK = 1 if black, = 0 otherwise
  • HISPAN = 1 if Hispanic, = 0 otherwise

Remove 20 na’s obs based on above summary

#filter data 
dats <-
  datd %>%
  filter(GDLIN != 666) %>%
  filter(MALE != ".") %>%
  filter(MARRIED != ".") %>%
  mutate(LOANPRC = LOANPRC*100)

# Change these to factors
# dats$MARRIED<- as.factor(as.character(dats$MARRIED))
# dats$APPROVE<- as.factor(as.character(dats$APPROVE))
# dats$MALE<- as.factor(as.character(dats$MALE))
# dats$GDLIN<-  as.factor(dats$GDLIN)
# dats$BLACK<-  as.factor(dats$BLACK)
# dats$HISPAN<-  as.factor(dats$HISPAN)

#Removed 20 ob
#Changed it to integer

dats$MARRIED<- as.integer(as.character(dats$MARRIED))
dats$MALE<- as.integer(as.character(dats$MALE))

#dats$GDLIN<-  as.factor(dats$GDLIN)
#attach the file for use
attach(dats)

#Review selected data
glimpse(dats)
## Observations: 1,969
## Variables: 8
## $ MARRIED <int> 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ GDLIN   <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, ...
## $ OBRAT   <dbl> 34.1, 26.0, 37.0, 32.1, 33.0, 36.0, 37.0, 30.7, 49.0, ...
## $ BLACK   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...
## $ HISPAN  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ MALE    <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ APPROVE <int> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, ...
## $ LOANPRC <dbl> 80.00000, 89.51049, 60.00000, 89.55224, 80.43478, 89.8...
kable(summary(dats))
MARRIED </th>
 GDLIN </th>
 OBRAT </th>
 BLACK </th>
 HISPAN </th>
  MALE </th>
APPROVE </th>
LOANPRC </th>
Min. :0.0000 Min. :0.0000 Min. : 0.00 Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. : 2.105
1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:28.00 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:1.0000 1st Qu.:1.0000 1st Qu.: 70.000
Median :1.0000 Median :1.0000 Median :33.00 Median :0.00000 Median :0.00000 Median :1.0000 Median :1.0000 Median : 80.000
Mean :0.6592 Mean :0.9132 Mean :32.39 Mean :0.09903 Mean :0.05485 Mean :0.8131 Mean :0.8761 Mean : 77.032
3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:37.00 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.: 89.888
Max. :1.0000 Max. :1.0000 Max. :95.00 Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :257.143

Review GGpair Matrix

ggpairs(dats,lower = list(continuous="smooth"))

Review Stats of each variables

##The describe() function which is part of the Hmisc package displays the following additional statistics:
#Number of rows,#Standard deviation,#Trimmed mean,#Mean absolute deviation,#Skewness,#Kurtosis,#Standard error
describe(dats)
## dats 
## 
##  8  Variables      1969  Observations
## ---------------------------------------------------------------------------
## MARRIED 
##        n  missing distinct     Info      Sum     Mean      Gmd 
##     1969        0        2    0.674     1298   0.6592   0.4495 
## 
## ---------------------------------------------------------------------------
## GDLIN 
##        n  missing distinct     Info      Sum     Mean      Gmd 
##     1969        0        2    0.238     1798   0.9132   0.1587 
## 
## ---------------------------------------------------------------------------
## OBRAT 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1969        0      288    0.999    32.39    8.772    18.16    22.00 
##      .25      .50      .75      .90      .95 
##    28.00    33.00    37.00    40.52    44.00 
## 
## lowest :  0.00  4.00  5.60  6.99  7.00, highest: 73.00 75.00 78.00 83.00 95.00
## ---------------------------------------------------------------------------
## BLACK 
##        n  missing distinct     Info      Sum     Mean      Gmd 
##     1969        0        2    0.268      195  0.09904   0.1785 
## 
## ---------------------------------------------------------------------------
## HISPAN 
##        n  missing distinct     Info      Sum     Mean      Gmd 
##     1969        0        2    0.156      108  0.05485   0.1037 
## 
## ---------------------------------------------------------------------------
## MALE 
##        n  missing distinct     Info      Sum     Mean      Gmd 
##     1969        0        2    0.456     1601   0.8131   0.3041 
## 
## ---------------------------------------------------------------------------
## APPROVE 
##        n  missing distinct     Info      Sum     Mean      Gmd 
##     1969        0        2    0.326     1725   0.8761   0.2172 
## 
## ---------------------------------------------------------------------------
## LOANPRC 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1969        0     1096    0.998    77.03    18.91    40.00    52.38 
##      .25      .50      .75      .90      .95 
##    70.00    80.00    89.89    94.45    95.45 
## 
## lowest :   2.10526   8.77193   9.09091   9.33032  11.01322
## highest: 164.70590 183.33330 220.00000 255.52480 257.14290
## ---------------------------------------------------------------------------

Review Variance for anything that may jump out

##The stat.desc() function which is part of the pastecs package displays the following additional statistics:
#Variance,#Coefficient of variation,#Confidence interval for mean
kable(stat.desc(dats))
MARRIED GDLIN OBRAT BLACK HISPAN MALE APPROVE LOANPRC
nbr.val 1969.0000000 1969.0000000 1.969000e+03 1969.0000000 1969.0000000 1969.0000000 1969.0000000 1.969000e+03
nbr.null 671.0000000 171.0000000 1.000000e+00 1774.0000000 1861.0000000 368.0000000 244.0000000 0.000000e+00
nbr.na 0.0000000 0.0000000 0.000000e+00 0.0000000 0.0000000 0.0000000 0.0000000 0.000000e+00
min 0.0000000 0.0000000 0.000000e+00 0.0000000 0.0000000 0.0000000 0.0000000 2.105260e+00
max 1.0000000 1.0000000 9.500000e+01 1.0000000 1.0000000 1.0000000 1.0000000 2.571429e+02
range 1.0000000 1.0000000 9.500000e+01 1.0000000 1.0000000 1.0000000 1.0000000 2.550376e+02
sum 1298.0000000 1798.0000000 6.377929e+04 195.0000000 108.0000000 1601.0000000 1725.0000000 1.516761e+05
median 1.0000000 1.0000000 3.300000e+01 0.0000000 0.0000000 1.0000000 1.0000000 8.000000e+01
mean 0.6592179 0.9131539 3.239172e+01 0.0990350 0.0548502 0.8131031 0.8760792 7.703204e+01
SE.mean 0.0106842 0.0063480 1.865833e-01 0.0067334 0.0051325 0.0087874 0.0074273 4.270414e-01
CI.mean.0.95 0.0209535 0.0124494 3.659215e-01 0.0132054 0.0100657 0.0172336 0.0145662 8.375009e-01
var 0.2247638 0.0793442 6.854741e+01 0.0892724 0.0518680 0.1520437 0.1086196 3.590755e+02
std.dev 0.4740926 0.2816810 8.279336e+00 0.2987849 0.2277454 0.3899278 0.3295748 1.894929e+01
coef.var 0.7191744 0.3084704 2.556004e-01 3.0169618 4.1521365 0.4795551 0.3761930 2.459923e-01

Histogtam of the Variables

#Lot more married obs 
plot_ly(x = dats$MARRIED,
             type = "histogram")
#Small amount of obs that desn't meet credit guideline
plot_ly(x = dats$GDLIN,
             type = "histogram")
#Right Skewed with 36 in the middle 
plot_ly(x = dats$OBRAT,
             type = "histogram")
#Small amount of Black obs
plot_ly(x = dats$BLACK,
             type = "histogram")
#Small amount of Hispanic obs
plot_ly(x = dats$HISPAN,
             type = "histogram")
#Mostly Males
plot_ly(x = dats$MALE,
             type = "histogram")
#Mostly Approved
plot_ly(x = dats$APPROVE,
             type = "histogram")
#LOANPRC = loan amount/purchase price
#.80 is 20% of the data, .9 is 10% of the data
plot_ly(x = dats$LOANPRC,
             type = "histogram") 

Quantile-Quantile (QQ) Plots

# after 2 its not normal 
ggplot(dats, aes(sample = LOANPRC)) + 
  geom_qq() +
  ggtitle("Q-Q Plot for LOANPRC")

#looks normally distrubuted
ggplot(dats, aes(sample = OBRAT)) + 
  geom_qq() +
  ggtitle("Q-Q Plot for OBRAT")

Visulize the relationship & Review Logit relationships

# print the two graph side-by-side - Approve
par(mfrow=c(1,2))
logi.hist.plot(dats$LOANPRC,dats$APPROVE,logi.mod=1, mainlabel="LoanPRC&Approve",boxp=FALSE,type="hist",col="gray")
logi.hist.plot(dats$OBRAT,dats$APPROVE,logi.mod=1, mainlabel="OBRAT&Approve",boxp=FALSE,type="hist",col="gray")

# print the two graph side-by-side - BLACK
par(mfrow=c(1,2))
logi.hist.plot(dats$LOANPRC,dats$BLACK,logi.mod=1, mainlabel="LoanPRC&BLACK",boxp=FALSE,type="hist",col="gray")
logi.hist.plot(dats$OBRAT,dats$BLACK,logi.mod=1, mainlabel="OBRAT&BLACK",boxp=FALSE,type="hist",col="gray")

# print the two graph side-by-side - Hispanic
par(mfrow=c(1,2))
logi.hist.plot(dats$LOANPRC,dats$HISPAN,logi.mod=1, mainlabel="LoanPRC&Hispanic",boxp=FALSE,type="hist",col="gray")
logi.hist.plot(dats$OBRAT,dats$HISPAN,logi.mod=1, mainlabel="OBRAT&Hispanic",boxp=FALSE,type="hist",col="gray")

Black relationshps and Logit interactions

# Loan to Purchase Ratio, Other obligations and Approval side-by-side - BLACK
par(mfrow=c(1,2))
logi.hist.plot(dats$LOANPRC,(dats$APPROVE*dats$BLACK),logi.mod=1, mainlabel="LoanPRC&Approve",boxp=FALSE,type="hist",col="gray")
logi.hist.plot(dats$OBRAT,(dats$APPROVE*dats$BLACK),logi.mod=1, mainlabel="OBRAT&Approve",boxp=FALSE,type="hist",col="gray")

# Loan to Purchase Ratio, Other obligations and Approval and meets guidline side-by-side - BLACK
par(mfrow=c(1,2))
logi.hist.plot(dats$LOANPRC,(dats$APPROVE*dats$BLACK*dats$GDLIN),logi.mod=1, mainlabel="LoanPRC&Approve",boxp=FALSE,type="hist",col="gray")
logi.hist.plot(dats$OBRAT,(dats$APPROVE*dats$BLACK*dats$GDLIN),logi.mod=1, mainlabel="OBRAT&Approve",boxp=FALSE,type="hist",col="gray")

# Loan to Purchase Ratio, Other obligations and Approval and meets guidline and male side-by-side - BLACK
par(mfrow=c(1,2))
logi.hist.plot(dats$LOANPRC,(dats$APPROVE*dats$BLACK*dats$GDLIN*dats$MALE ),logi.mod=1, mainlabel="LoanPRC&Approve",boxp=FALSE,type="hist",col="gray")
logi.hist.plot(dats$OBRAT,(dats$APPROVE*dats$BLACK*dats$GDLIN*dats$MALE),logi.mod=1, mainlabel="OBRAT&Approve",boxp=FALSE,type="hist",col="gray")

# Loan to Purchase Ratio, Other obligations and Approval and meets guidline and male and marriedside-by-side - BLACK
par(mfrow=c(1,2))
logi.hist.plot(dats$LOANPRC,(dats$APPROVE*dats$BLACK*dats$GDLIN*dats$MALE*dats$MARRIED),logi.mod=1, mainlabel="LoanPRC&Approve&",boxp=FALSE,type="hist",col="gray")
logi.hist.plot(dats$OBRAT,(dats$APPROVE*dats$BLACK*dats$GDLIN*dats$MALE*dats$MARRIED),logi.mod=1, mainlabel="OBRAT&Approve",boxp=FALSE,type="hist",col="gray")

Hispanic relationshps and Logit interactions

# Loan to Purchase Ratio, Other obligations and Approval side-by-side - Hispanic
par(mfrow=c(1,2))
logi.hist.plot(dats$LOANPRC,(dats$APPROVE*dats$HISPAN),logi.mod=1, mainlabel="LoanPRC&Approve",boxp=FALSE,type="hist",col="gray")
logi.hist.plot(dats$OBRAT,(dats$APPROVE*dats$HISPAN),logi.mod=1, mainlabel="OBRAT&Approve",boxp=FALSE,type="hist",col="gray")

# Loan to Purchase Ratio, Other obligations and Approval and meets guidline side-by-side - Hispanic
par(mfrow=c(1,2))
logi.hist.plot(dats$LOANPRC,(dats$APPROVE*dats$HISPAN*dats$GDLIN),logi.mod=1, mainlabel="LoanPRC&Approve",boxp=FALSE,type="hist",col="gray")
logi.hist.plot(dats$OBRAT,(dats$APPROVE*dats$HISPAN*dats$GDLIN),logi.mod=1, mainlabel="OBRAT&Approve",boxp=FALSE,type="hist",col="gray")

# Loan to Purchase Ratio, Other obligations and Approval and meets guidline and male side-by-side - Hispanic
par(mfrow=c(1,2))
logi.hist.plot(dats$LOANPRC,(dats$APPROVE*dats$HISPAN*dats$GDLIN*dats$MALE ),logi.mod=1, mainlabel="LoanPRC&Approve",boxp=FALSE,type="hist",col="gray")
logi.hist.plot(dats$OBRAT,(dats$APPROVE*dats$HISPAN*dats$GDLIN*dats$MALE),logi.mod=1, mainlabel="OBRAT&Approve",boxp=FALSE,type="hist",col="gray")

# Loan to Purchase Ratio, Other obligations and Approval and meets guidline and male and marriedside-by-side - Hispanic
par(mfrow=c(1,2))
logi.hist.plot(dats$LOANPRC,(dats$APPROVE*dats$HISPAN*dats$GDLIN*dats$MALE*dats$MARRIED),logi.mod=1, mainlabel="LoanPRC&Approve&",boxp=FALSE,type="hist",col="gray")
logi.hist.plot(dats$OBRAT,(dats$APPROVE*dats$HISPAN*dats$GDLIN*dats$MALE*dats$MARRIED),logi.mod=1, mainlabel="OBRAT&Approve",boxp=FALSE,type="hist",col="gray")

EDA of different ethnicity

# filter out for only Black data
datb <-
  dats %>%
  filter(BLACK == 1) 
# filter out for only Hispanic data
dath <-
  dats %>%
  filter(HISPAN == 1)
# filter out for only Hispanic data
datw <-
  dats %>%
  filter(BLACK == 0) %>%
  filter(HISPAN == 0)

Descriptive Stats Table of different ethnicity

  • Black and Hispanic makes up relatively small proportion of the data set
  • Black and Hispanic has the lower approval rate compared to White
stargazer(datw, type = "html", title="Descriptive statistics - White Only", digits=2)
Descriptive statistics - White Only
Statistic N Mean St. Dev. Min Pctl(25) Pctl(75) Max
MARRIED 1,666 0.66 0.47 0 0 1 1
GDLIN 1,666 0.94 0.24 0 1 1 1
OBRAT 1,666 32.03 8.23 0.00 27.60 36.60 95.00
BLACK 1,666 0.00 0.00 0 0 0 0
HISPAN 1,666 0.00 0.00 0 0 0 0
MALE 1,666 0.82 0.38 0 1 1 1
APPROVE 1,666 0.91 0.29 0 1 1 1
LOANPRC 1,666 75.65 19.01 2 68.1 89.6 257
stargazer(datb, type = "html", title="Descriptive statistics - Black Only", digits=2)
Descriptive statistics - Black Only
Statistic N Mean St. Dev. Min Pctl(25) Pctl(75) Max
MARRIED 195 0.62 0.49 0 0 1 1
GDLIN 195 0.73 0.45 0 0 1 1
OBRAT 195 34.90 8.19 5.60 31.00 38.85 63.00
BLACK 195 1.00 0.00 1 1 1 1
HISPAN 195 0.00 0.00 0 0 0 0
MALE 195 0.74 0.44 0 0 1 1
APPROVE 195 0.67 0.47 0 0 1 1
LOANPRC 195 84.06 17.84 28.99 80.00 90.24 255.52
stargazer(dath, type = "html", title="Descriptive statistics - Hispanic Only", digits=2)
Descriptive statistics - Hispanic Only
Statistic N Mean St. Dev. Min Pctl(25) Pctl(75) Max
MARRIED 108 0.71 0.45 0 0 1 1
GDLIN 108 0.85 0.36 0 1 1 1
OBRAT 108 33.47 8.46 14.60 29.00 38.33 62.00
BLACK 108 0.00 0.00 0 0 0 0
HISPAN 108 1.00 0.00 1 1 1 1
MALE 108 0.80 0.40 0 1 1 1
APPROVE 108 0.76 0.43 0 1 1 1
LOANPRC 108 85.63 14.50 40 80 90.5 163

logit models comparison

Black only model has only one (meets guidelines) significant variable

## fit ordered logit model
glm1 <- glm(APPROVE ~ MALE + GDLIN + LOANPRC + OBRAT + MARRIED + BLACK + HISPAN, data = dats, family = binomial(link="logit"))
glm2 <- glm(APPROVE ~ GDLIN + LOANPRC + OBRAT + MARRIED + BLACK + HISPAN, data = dats, family = binomial(link="logit"))
glm3 <- glm(APPROVE ~ GDLIN + LOANPRC + OBRAT + BLACK + HISPAN, data = dats, family = binomial(link="logit"))
glmw <- glm(APPROVE ~ GDLIN + LOANPRC + OBRAT + MARRIED + BLACK + HISPAN, data = datw, family = binomial(link="logit"))
glmb <- glm(APPROVE ~ GDLIN + LOANPRC + OBRAT + MARRIED + BLACK + HISPAN, data = datb, family = binomial(link="logit"))
glmh <- glm(APPROVE ~ GDLIN + LOANPRC + OBRAT + MARRIED + BLACK + HISPAN, data = dath, family = binomial(link="logit"))

## view a summary of the Full model
summary(glm1)
## 
## Call:
## glm(formula = APPROVE ~ MALE + GDLIN + LOANPRC + OBRAT + MARRIED + 
##     BLACK + HISPAN, family = binomial(link = "logit"), data = dats)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8933   0.2445   0.3128   0.3742   2.3261  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.381531   0.591656   2.335 0.019542 *  
## MALE        -0.053947   0.234573  -0.230 0.818107    
## GDLIN        3.719269   0.217169  17.126  < 2e-16 ***
## LOANPRC     -0.016812   0.005074  -3.313 0.000922 ***
## OBRAT       -0.034074   0.010310  -3.305 0.000950 ***
## MARRIED      0.475742   0.192005   2.478 0.013221 *  
## BLACK       -0.815693   0.240177  -3.396 0.000683 ***
## HISPAN      -0.900010   0.310585  -2.898 0.003758 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1475.4  on 1968  degrees of freedom
## Residual deviance:  959.4  on 1961  degrees of freedom
## AIC: 975.4
## 
## Number of Fisher Scoring iterations: 6
#Removed Male since there is no significance 
summary(glm2)
## 
## Call:
## glm(formula = APPROVE ~ GDLIN + LOANPRC + OBRAT + MARRIED + BLACK + 
##     HISPAN, family = binomial(link = "logit"), data = dats)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8943   0.2440   0.3128   0.3748   2.3178  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.342387   0.566745   2.369 0.017856 *  
## GDLIN        3.721387   0.216988  17.150  < 2e-16 ***
## LOANPRC     -0.016773   0.005075  -3.305 0.000950 ***
## OBRAT       -0.034098   0.010310  -3.307 0.000942 ***
## MARRIED      0.460930   0.181039   2.546 0.010896 *  
## BLACK       -0.811426   0.239545  -3.387 0.000706 ***
## HISPAN      -0.897331   0.310374  -2.891 0.003839 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1475.43  on 1968  degrees of freedom
## Residual deviance:  959.46  on 1962  degrees of freedom
## AIC: 973.46
## 
## Number of Fisher Scoring iterations: 6
# Removed Married since there is small significance
summary(glm3)
## 
## Call:
## glm(formula = APPROVE ~ GDLIN + LOANPRC + OBRAT + BLACK + HISPAN, 
##     family = binomial(link = "logit"), data = dats)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8328   0.2539   0.3196   0.3694   2.3016  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.668049   0.552038   3.022 0.002514 ** 
## GDLIN        3.694612   0.215000  17.184  < 2e-16 ***
## LOANPRC     -0.016585   0.005054  -3.282 0.001032 ** 
## OBRAT       -0.035117   0.010329  -3.400 0.000674 ***
## BLACK       -0.817685   0.239220  -3.418 0.000631 ***
## HISPAN      -0.858277   0.309369  -2.774 0.005532 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1475.43  on 1968  degrees of freedom
## Residual deviance:  965.87  on 1963  degrees of freedom
## AIC: 977.87
## 
## Number of Fisher Scoring iterations: 6
# White only data
summary(glmw)
## 
## Call:
## glm(formula = APPROVE ~ GDLIN + LOANPRC + OBRAT + MARRIED + BLACK + 
##     HISPAN, family = binomial(link = "logit"), data = datw)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.9674   0.2348   0.3007   0.3537   2.0305  
## 
## Coefficients: (2 not defined because of singularities)
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.421414   0.665949   2.134 0.032809 *  
## GDLIN        3.715685   0.255150  14.563  < 2e-16 ***
## LOANPRC     -0.021921   0.005974  -3.670 0.000243 ***
## OBRAT       -0.026281   0.011723  -2.242 0.024977 *  
## MARRIED      0.598925   0.208713   2.870 0.004110 ** 
## BLACK              NA         NA      NA       NA    
## HISPAN             NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1026.72  on 1665  degrees of freedom
## Residual deviance:  729.19  on 1661  degrees of freedom
## AIC: 739.19
## 
## Number of Fisher Scoring iterations: 6
# Black Only data
summary(glmb)
## 
## Call:
## glm(formula = APPROVE ~ GDLIN + LOANPRC + OBRAT + MARRIED + BLACK + 
##     HISPAN, family = binomial(link = "logit"), data = datb)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1675  -0.4442   0.4628   0.5179   2.3954  
## 
## Coefficients: (2 not defined because of singularities)
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.059865   1.551987  -0.683    0.495    
## GDLIN        4.034585   0.511059   7.895 2.91e-15 ***
## LOANPRC      0.003034   0.013786   0.220    0.826    
## OBRAT       -0.037604   0.029360  -1.281    0.200    
## MARRIED      0.127730   0.467598   0.273    0.785    
## BLACK              NA         NA      NA       NA    
## HISPAN             NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 246.83  on 194  degrees of freedom
## Residual deviance: 139.52  on 190  degrees of freedom
## AIC: 149.52
## 
## Number of Fisher Scoring iterations: 5
# Hispanic only data
summary(glmh)
## 
## Call:
## glm(formula = APPROVE ~ GDLIN + LOANPRC + OBRAT + MARRIED + BLACK + 
##     HISPAN, family = binomial(link = "logit"), data = dath)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8283   0.1451   0.3957   0.5731   1.8769  
## 
## Coefficients: (2 not defined because of singularities)
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  5.04265    2.52380   1.998  0.04571 *  
## GDLIN        3.12876    0.76075   4.113 3.91e-05 ***
## LOANPRC     -0.02755    0.02160  -1.276  0.20198    
## OBRAT       -0.10490    0.03873  -2.709  0.00675 ** 
## MARRIED     -0.41226    0.68384  -0.603  0.54661    
## BLACK             NA         NA      NA       NA    
## HISPAN            NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 119.217  on 107  degrees of freedom
## Residual deviance:  79.134  on 103  degrees of freedom
## AIC: 89.134
## 
## Number of Fisher Scoring iterations: 5
#Compare the full data set
anova(glm1, glm2, glm3)
#Compare all the modles 
stargazer(glm1, glm2, glm3, glmw, glmb, glmh, type = "html", 
          column.labels = c("full Data", "white only", 
                            "black only", "hispanic only"), column.separate = c(3, 1,1,1))
Dependent variable:
APPROVE
full Data white only black only hispanic only
(1) (2) (3) (4) (5) (6)
MALE -0.054
(0.235)
GDLIN 3.719*** 3.721*** 3.695*** 3.716*** 4.035*** 3.129***
(0.217) (0.217) (0.215) (0.255) (0.511) (0.761)
LOANPRC -0.017*** -0.017*** -0.017*** -0.022*** 0.003 -0.028
(0.005) (0.005) (0.005) (0.006) (0.014) (0.022)
OBRAT -0.034*** -0.034*** -0.035*** -0.026** -0.038 -0.105***
(0.010) (0.010) (0.010) (0.012) (0.029) (0.039)
MARRIED 0.476** 0.461** 0.599*** 0.128 -0.412
(0.192) (0.181) (0.209) (0.468) (0.684)
BLACK -0.816*** -0.811*** -0.818***
(0.240) (0.240) (0.239)
HISPAN -0.900*** -0.897*** -0.858***
(0.311) (0.310) (0.309)
Constant 1.382** 1.342** 1.668*** 1.421** -1.060 5.043**
(0.592) (0.567) (0.552) (0.666) (1.552) (2.524)
Observations 1,969 1,969 1,969 1,666 195 108
Log Likelihood -479.702 -479.728 -482.936 -364.596 -69.759 -39.567
Akaike Inf. Crit. 975.403 973.456 977.873 739.191 149.518 89.134
Note: p<0.1; p<0.05; p<0.01

Odds Ratio for all 6 models

OR <- function(x) exp(x)
stargazer(glm1, glm2, glm3, type="html", apply.coef = OR, 
          title = "Odds Ratio with full data set")
Odds Ratio with full data set
Dependent variable:
APPROVE
(1) (2) (3)
MALE 0.947***
(0.235)
GDLIN 41.234*** 41.322*** 40.230***
(0.217) (0.217) (0.215)
LOANPRC 0.983*** 0.983*** 0.984***
(0.005) (0.005) (0.005)
OBRAT 0.967*** 0.966*** 0.965***
(0.010) (0.010) (0.010)
MARRIED 1.609*** 1.586***
(0.192) (0.181)
BLACK 0.442* 0.444* 0.441*
(0.240) (0.240) (0.239)
HISPAN 0.407 0.408 0.424
(0.311) (0.310) (0.309)
Constant 3.981*** 3.828*** 5.302***
(0.592) (0.567) (0.552)
Observations 1,969 1,969 1,969
Log Likelihood -479.702 -479.728 -482.936
Akaike Inf. Crit. 975.403 973.456 977.873
Note: p<0.1; p<0.05; p<0.01
stargazer(glmw, glmb, glmh, type="html",
          title = "Odds Ratio with different ethnicity",
          apply.coef = OR, column.labels = c("white only", 
                                             "black only", "hispanic only"))
Odds Ratio with different ethnicity
Dependent variable:
APPROVE
white only black only hispanic only
(1) (2) (3)
GDLIN 41.087*** 56.519*** 22.846***
(0.255) (0.511) (0.761)
LOANPRC 0.978*** 1.003*** 0.973***
(0.006) (0.014) (0.022)
OBRAT 0.974*** 0.963*** 0.900***
(0.012) (0.029) (0.039)
MARRIED 1.820*** 1.136** 0.662
(0.209) (0.468) (0.684)
BLACK
HISPAN
Constant 4.143*** 0.347 154.881***
(0.666) (1.552) (2.524)
Observations 1,666 195 108
Log Likelihood -364.596 -69.759 -39.567
Akaike Inf. Crit. 739.191 149.518 89.134
Note: p<0.1; p<0.05; p<0.01
stargazer(glm1, glm2, glm3, glmw, glmb, glmh, type="html",
          title = "Logistic Regression with t stas for significance",
          report = "vct*", column.labels = c("full Data", "white only",
                                             "black only", "hispanic only"), 
          column.separate = c(3, 1,1,1))
Logistic Regression with t stas for significance
Dependent variable:
APPROVE
full Data white only black only hispanic only
(1) (2) (3) (4) (5) (6)
MALE -0.054
t = -0.230
GDLIN 3.719 3.721 3.695 3.716 4.035 3.129
t = 17.126*** t = 17.150*** t = 17.184*** t = 14.563*** t = 7.895*** t = 4.113***
LOANPRC -0.017 -0.017 -0.017 -0.022 0.003 -0.028
t = -3.313*** t = -3.305*** t = -3.282*** t = -3.670*** t = 0.220 t = -1.276
OBRAT -0.034 -0.034 -0.035 -0.026 -0.038 -0.105
t = -3.305*** t = -3.307*** t = -3.400*** t = -2.242** t = -1.281 t = -2.709***
MARRIED 0.476 0.461 0.599 0.128 -0.412
t = 2.478** t = 2.546** t = 2.870*** t = 0.273 t = -0.603
BLACK -0.816 -0.811 -0.818
t = -3.396*** t = -3.387*** t = -3.418***
HISPAN -0.900 -0.897 -0.858
t = -2.898*** t = -2.891*** t = -2.774***
Constant 1.382 1.342 1.668 1.421 -1.060 5.043
t = 2.335** t = 2.369** t = 3.022*** t = 2.134** t = -0.683 t = 1.998**
Observations 1,969 1,969 1,969 1,666 195 108
Log Likelihood -479.702 -479.728 -482.936 -364.596 -69.759 -39.567
Akaike Inf. Crit. 975.403 973.456 977.873 739.191 149.518 89.134
Note: p<0.1; p<0.05; p<0.01
stargazer(
exp(cbind(OR = coef(glm1), confint(glm1))),
exp(cbind(OR = coef(glm2), confint(glm2))),
exp(cbind(OR = coef(glm3), confint(glm3))),
exp(cbind(OR = coef(glmw), confint(glmw))),
exp(cbind(OR = coef(glmb), confint(glmb))),
exp(cbind(OR = coef(glmh), confint(glmh))),
type="html")
OR 2.5 % 97.5 %
(Intercept) 3.981 1.255 12.840
MALE 0.947 0.592 1.488
GDLIN 41.234 27.194 63.810
LOANPRC 0.983 0.973 0.993
OBRAT 0.967 0.947 0.986
MARRIED 1.609 1.103 2.343
BLACK 0.442 0.278 0.715
HISPAN 0.407 0.226 0.766
OR 2.5 % 97.5 %
(Intercept) 3.828 1.267 11.772
GDLIN 41.322 27.261 63.923
LOANPRC 0.983 0.973 0.993
OBRAT 0.966 0.947 0.986
MARRIED 1.586 1.111 2.261
BLACK 0.444 0.280 0.717
HISPAN 0.408 0.227 0.768
OR 2.5 % 97.5 %
(Intercept) 5.302 1.811 15.872
GDLIN 40.230 26.637 61.975
LOANPRC 0.984 0.974 0.993
OBRAT 0.965 0.946 0.985
BLACK 0.441 0.278 0.712
HISPAN 0.424 0.236 0.797
OR 2.5 % 97.5 %
(Intercept) 4.143 1.126 15.466
GDLIN 41.087 25.193 68.672
LOANPRC 0.978 0.967 0.990
OBRAT 0.974 0.952 0.997
MARRIED 1.820 1.208 2.744
BLACK
HISPAN
OR 2.5 % 97.5 %
(Intercept) 0.347 0.014 5.933
GDLIN 56.519 22.228 168.802
LOANPRC 1.003 0.981 1.034
OBRAT 0.963 0.908 1.019
MARRIED 1.136 0.446 2.844
BLACK
HISPAN
OR 2.5 % 97.5 %
(Intercept) 154.881 1.327 30,628.190
GDLIN 22.846 5.732 121.709
LOANPRC 0.973 0.929 1.012
OBRAT 0.900 0.828 0.967
MARRIED 0.662 0.156 2.382
BLACK
HISPAN

Likelihood Ratio Test

lrtest(glm2, glm1)
## 
## Model 1: APPROVE ~ GDLIN + LOANPRC + OBRAT + MARRIED + BLACK + HISPAN
## Model 2: APPROVE ~ MALE + GDLIN + LOANPRC + OBRAT + MARRIED + BLACK + 
##     HISPAN
## 
## L.R. Chisq       d.f.          P 
## 0.05313217 1.00000000 0.81770003
LR.s.1 <- glm2$deviance - glm1$deviance
LR.s.2 <- -2*logLik(glm2)[1] - (-2*logLik(glm1)[1])
pchisq(LR.s.1, 1, lower.tail = FALSE)
## [1] 0.8177
pchisq(LR.s.1, 2, lower.tail = FALSE)
## [1] 0.9737837
lrm(glm2)
## Logistic Regression Model
##  
##  lrm(formula = glm2)
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs          1969    LR chi2     515.97    R2       0.437    C       0.846    
##   0            244    d.f.             6    g        1.252    Dxy     0.692    
##   1           1725    Pr(> chi2) <0.0001    gr       3.498    gamma   0.693    
##  max |deriv| 8e-10                          gp       0.147    tau-a   0.150    
##                                             Brier    0.066                     
##  
##            Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept  1.3424 0.5667  2.37  0.0179  
##  GDLIN      3.7214 0.2170 17.15  <0.0001 
##  LOANPRC   -0.0168 0.0051 -3.31  0.0009  
##  OBRAT     -0.0341 0.0103 -3.31  0.0009  
##  MARRIED    0.4609 0.1810  2.55  0.0109  
##  BLACK     -0.8114 0.2395 -3.39  0.0007  
##  HISPAN    -0.8973 0.3104 -2.89  0.0038  
## 
lrm(glm1)
## Logistic Regression Model
##  
##  lrm(formula = glm1)
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs          1969    LR chi2     516.03    R2       0.437    C       0.846    
##   0            244    d.f.             7    g        1.253    Dxy     0.693    
##   1           1725    Pr(> chi2) <0.0001    gr       3.501    gamma   0.693    
##  max |deriv| 8e-10                          gp       0.147    tau-a   0.150    
##                                             Brier    0.066                     
##  
##            Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept  1.3815 0.5917  2.34  0.0195  
##  MALE      -0.0539 0.2346 -0.23  0.8181  
##  GDLIN      3.7193 0.2172 17.13  <0.0001 
##  LOANPRC   -0.0168 0.0051 -3.31  0.0009  
##  OBRAT     -0.0341 0.0103 -3.31  0.0009  
##  MARRIED    0.4757 0.1920  2.48  0.0132  
##  BLACK     -0.8157 0.2402 -3.40  0.0007  
##  HISPAN    -0.9000 0.3106 -2.90  0.0038  
## 

Visulizing Model Fit Analysis

  • Model with male and married committed seems to be the best fit.
#FUll Model 
par(mfrow=c(2,2)) # init 4 charts in 1 panel
plot(glm1)

#Ommited Gender
par(mfrow=c(2,2)) # init 4 charts in 1 panel
plot(glm2)

#Ommited Gender and Married
par(mfrow=c(2,2)) # init 4 charts in 1 panel
plot(glm3)

Correctly classified observations

###Correctly classified observations
#all the same
stargazer(mean((glm1$fitted.values>=0.5)==dats$APPROVE),
mean((glm2$fitted.values>=0.5)==dats$APPROVE),
mean((glm3$fitted.values>=0.5)==dats$APPROVE), type = "text")

===== 0.924 —–

===== 0.924 —–

===== 0.924 —–

Confusion matrix

Same outcome

###Confusion matrix count

#FUll Model
RP=sum((glm1$fitted.values>=0.5)==dats$APPROVE & dats$APPROVE==1)
FP=sum((glm1$fitted.values>=0.5)!=dats$APPROVE & dats$APPROVE==0)
RN=sum((glm1$fitted.values>=0.5)==dats$APPROVE & dats$APPROVE==0)
FN=sum((glm1$fitted.values>=0.5)!=dats$APPROVE & dats$APPROVE==1)
confMat1<-matrix(c(RP,FP,FN,RN),ncol = 2)
colnames(confMat1)<-c("Pred Approved","Pred not Approved")
rownames(confMat1)<-c("Real Approved","Real not Approved")
#Ommited Gender
RP=sum((glm2$fitted.values>=0.5)==dats$APPROVE & dats$APPROVE==1)
FP=sum((glm2$fitted.values>=0.5)!=dats$APPROVE & dats$APPROVE==0)
RN=sum((glm2$fitted.values>=0.5)==dats$APPROVE & dats$APPROVE==0)
FN=sum((glm2$fitted.values>=0.5)!=dats$APPROVE & dats$APPROVE==1)
confMat2<-matrix(c(RP,FP,FN,RN),ncol = 2)
colnames(confMat2)<-c("Pred Approved","Pred not Approved")
rownames(confMat2)<-c("Real Approved","Real not Approved")
#Ommited Gender and Married
RP=sum((glm3$fitted.values>=0.5)==dats$APPROVE & dats$APPROVE==1)
FP=sum((glm3$fitted.values>=0.5)!=dats$APPROVE & dats$APPROVE==0)
RN=sum((glm3$fitted.values>=0.5)==dats$APPROVE & dats$APPROVE==0)
FN=sum((glm3$fitted.values>=0.5)!=dats$APPROVE & dats$APPROVE==1)
confMat3<-matrix(c(RP,FP,FN,RN),ncol = 2)
colnames(confMat3)<-c("Pred Approved","Pred not Approved")
rownames(confMat3)<-c("Real Approved","Real not Approved")
###Confusion matrix count for the 3 Models
kable(
  rbind(
    confMat1,
    confMat2,
    confMat3))
Pred Approved Pred not Approved
Real Approved 1686 39
Real not Approved 111 133
Real Approved 1686 39
Real not Approved 111 133
Real Approved 1686 39
Real not Approved 111 133
###Confusion matrix proportion

#FUll Model
RPR=RP/sum(dats$APPROVE==1)*100
FPR=FP/sum(dats$APPROVE==0)*100
RNR=RN/sum(dats$APPROVE==0)*100
FNR=FN/sum(dats$APPROVE==1)*100
confMat1<-matrix(c(RPR,FPR,FNR,RNR),ncol = 2)
colnames(confMat1)<-c("Pred Approved","Pred not Approved")
rownames(confMat1)<-c("Real Approved","Real not Approved")
#Ommited Gender
RPR=RP/sum(dats$APPROVE==1)*100
FPR=FP/sum(dats$APPROVE==0)*100
RNR=RN/sum(dats$APPROVE==0)*100
FNR=FN/sum(dats$APPROVE==1)*100
confMat2<-matrix(c(RPR,FPR,FNR,RNR),ncol = 2)
colnames(confMat2)<-c("Pred Approved","Pred not Approved")
rownames(confMat2)<-c("Real Approved","Real not Approved")
#Ommited Gender and Married
RPR=RP/sum(dats$APPROVE==1)*100
FPR=FP/sum(dats$APPROVE==0)*100
RNR=RN/sum(dats$APPROVE==0)*100
FNR=FN/sum(dats$APPROVE==1)*100
confMat3<-matrix(c(RPR,FPR,FNR,RNR),ncol = 2)
colnames(confMat3)<-c("Pred Approved","Pred not Approved")
rownames(confMat3)<-c("Real Approved","Real not Approved")

###Confusion matrix proportion
kable(
  rbind(
    confMat1,
    confMat2,
    confMat3))
Pred Approved Pred not Approved
Real Approved 97.73913 2.26087
Real not Approved 45.49180 54.50820
Real Approved 97.73913 2.26087
Real not Approved 45.49180 54.50820
Real Approved 97.73913 2.26087
Real not Approved 45.49180 54.50820

Prototypical Analysis

Both Probit and Logit prediction shows that Hispanics and Blacks are less likely to get approval to loans when compared to whites

#Estimate Logit Model
LogitModel = glm(APPROVE ~ OBRAT + BLACK + HISPAN, data = dats, 
                 family = "binomial")
summary(LogitModel)
## 
## Call:
## glm(formula = APPROVE ~ OBRAT + BLACK + HISPAN, family = "binomial", 
##     data = dats)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7549   0.3498   0.4226   0.4828   1.6930  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  4.164374   0.306121  13.604  < 2e-16 ***
## OBRAT       -0.056052   0.008426  -6.652 2.89e-11 ***
## BLACK       -1.458365   0.178007  -8.193 2.55e-16 ***
## HISPAN      -1.086268   0.245317  -4.428 9.51e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1475.4  on 1968  degrees of freedom
## Residual deviance: 1347.0  on 1965  degrees of freedom
## AIC: 1355
## 
## Number of Fisher Scoring iterations: 5
#Generate Odds Ratios
exp(coef(LogitModel))
## (Intercept)       OBRAT       BLACK      HISPAN 
##  64.3523600   0.9454897   0.2326164   0.3374738
#Define prototypical loan applicants (you will need more than 3)
prototype1 <- data.frame(OBRAT=mean(dats$OBRAT),BLACK = 1, HISPAN = 0)
prototype2 <- data.frame(OBRAT=mean(dats$OBRAT),BLACK = 0, HISPAN = 1)
prototype3 <- data.frame(OBRAT=mean(dats$OBRAT),BLACK = 0, HISPAN = 0)

#Predict probabilities for prototypical individuals
prototype1$predictedprob <- predict (LogitModel, newdata = prototype1, type ="response")
prototype2$predictedprob <- predict (LogitModel, newdata = prototype2, type ="response")
prototype3$predictedprob <- predict (LogitModel, newdata = prototype3, type ="response")
#logit Probability
kable(rbind( prototype1,
       prototype2,
       prototype3))
OBRAT BLACK HISPAN predictedprob
32.39172 1 0 0.7089684
32.39172 0 1 0.7794520
32.39172 0 0 0.9128343
#Estimate Probit Model
ProbitModel = glm(APPROVE ~ OBRAT + BLACK + HISPAN, data = dats, 
                  family = "binomial" (link = "probit"))
summary(ProbitModel)
## 
## Call:
## glm(formula = APPROVE ~ OBRAT + BLACK + HISPAN, family = binomial(link = "probit"), 
##     data = dats)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8082   0.3507   0.4264   0.4875   1.4748  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  2.264355   0.161312  14.037  < 2e-16 ***
## OBRAT       -0.028262   0.004583  -6.166 7.00e-10 ***
## BLACK       -0.823087   0.103822  -7.928 2.23e-15 ***
## HISPAN      -0.588310   0.140902  -4.175 2.98e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1475.4  on 1968  degrees of freedom
## Residual deviance: 1350.3  on 1965  degrees of freedom
## AIC: 1358.3
## 
## Number of Fisher Scoring iterations: 5
#Predict probabilities for prototypical individuals
prototype1$predictedprob <- predict (ProbitModel, newdata = prototype1, type ="response")
prototype2$predictedprob <- predict (ProbitModel, newdata = prototype2, type ="response")
prototype3$predictedprob <- predict (ProbitModel, newdata = prototype3, type ="response")
# Probit Probability
kable(rbind(prototype1,
      prototype2,
      prototype3))
OBRAT BLACK HISPAN predictedprob
32.39172 1 0 0.7004903
32.39172 0 1 0.7765483
32.39172 0 0 0.9113151
#Impose appropriate sample selection criteria here
MLDsubsample <- subset(dats, LOANPRC >= 50 & MALE != "." & MARRIED != "." & (GDLIN == 1 | GDLIN == 0))

 
#Estimate Logit Model (I am not necessarily recommending this model -- this is purely illustrative)
LogitModel <- glm(APPROVE ~ OBRAT + GDLIN + LOANPRC + MALE + MARRIED + BLACK + HISPAN, data = MLDsubsample, 
                 family = "binomial")
summary(LogitModel)
## 
## Call:
## glm(formula = APPROVE ~ OBRAT + GDLIN + LOANPRC + MALE + MARRIED + 
##     BLACK + HISPAN, family = "binomial", data = MLDsubsample)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8093   0.2653   0.3238   0.3836   2.3226  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.348569   0.633915   2.127 0.033390 *  
## OBRAT       -0.033061   0.010452  -3.163 0.001561 ** 
## GDLIN        3.708162   0.217682  17.035  < 2e-16 ***
## LOANPRC     -0.016723   0.005637  -2.967 0.003009 ** 
## MALE        -0.068301   0.237847  -0.287 0.773987    
## MARRIED      0.473623   0.193889   2.443 0.014576 *  
## BLACK       -0.820291   0.241013  -3.404 0.000665 ***
## HISPAN      -0.812085   0.316951  -2.562 0.010402 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1421.09  on 1812  degrees of freedom
## Residual deviance:  931.16  on 1805  degrees of freedom
## AIC: 947.16
## 
## Number of Fisher Scoring iterations: 5
#Generate Log-Likelihood
logLik(LogitModel)
## 'log Lik.' -465.5778 (df=8)

Probit Model Analysis

Stick with Logit Model since its easier to translate

# this gives main effects AND interactions
ad_aov <- aov(APPROVE ~ OBRAT * BLACK + HISPAN * OBRAT + LOANPRC * BLACK + LOANPRC * HISPAN, data = dats, family = "binomial" (link = "probit"))
# this would give ONLY main effects
ad_aov2 <- aov(APPROVE ~ OBRAT + BLACK + HISPAN + MARRIED + LOANPRC + GDLIN, data = dats, family = "binomial" (link = "probit"))
summary(ad_aov)
##                  Df Sum Sq Mean Sq F value   Pr(>F)    
## OBRAT             1   6.59   6.588  66.720 5.54e-16 ***
## BLACK             1   7.63   7.627  77.238  < 2e-16 ***
## HISPAN            1   1.97   1.972  19.973 8.30e-06 ***
## LOANPRC           1   1.99   1.986  20.113 7.72e-06 ***
## OBRAT:BLACK       1   0.28   0.284   2.877 0.090008 .  
## OBRAT:HISPAN      1   1.29   1.287  13.030 0.000314 ***
## BLACK:LOANPRC     1   0.19   0.194   1.965 0.161184    
## HISPAN:LOANPRC    1   0.30   0.295   2.988 0.084046 .  
## Residuals      1960 193.53   0.099                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(ad_aov2)
##               Df Sum Sq Mean Sq F value   Pr(>F)    
## OBRAT          1   6.59    6.59  99.693  < 2e-16 ***
## BLACK          1   7.63    7.63 115.410  < 2e-16 ***
## HISPAN         1   1.97    1.97  29.844 5.28e-08 ***
## MARRIED        1   0.61    0.61   9.247  0.00239 ** 
## LOANPRC        1   1.92    1.92  29.050 7.90e-08 ***
## GDLIN          1  65.39   65.39 989.570  < 2e-16 ***
## Residuals   1962 129.65    0.07                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tidy_ad_aov <- broom::tidy(ad_aov)
kable(tidy_ad_aov)
term df sumsq meansq statistic p.value
OBRAT 1 6.5879509 6.5879509 66.719936 0.0000000
BLACK 1 7.6265125 7.6265125 77.238041 0.0000000
HISPAN 1 1.9721477 1.9721477 19.973065 0.0000083
LOANPRC 1 1.9859170 1.9859170 20.112514 0.0000077
OBRAT:BLACK 1 0.2840827 0.2840827 2.877067 0.0900083
OBRAT:HISPAN 1 1.2865938 1.2865938 13.030069 0.0003142
BLACK:LOANPRC 1 0.1939820 0.1939820 1.964567 0.1611841
HISPAN:LOANPRC 1 0.2950266 0.2950266 2.987902 0.0840463
Residuals 1960 193.5311184 0.0987404 NA NA