Code header

# Course: ECON 5300L Applied Econometric
# Title: EDA of Mortgage Loan Decisions 
# Purpose: Explore Mortgage Loan Decisions outcome
# Data: MLD Data file.csv
# Date: Mar 26, 2018
# Author: Afsar Ali

Exploratory Data Analysis (EDA)

rm(list=ls(all=TRUE))  # Clear all data in environment

#load Packages 
library(tidyverse)
library(GGally)
library(plotly)
library(car)
library(ggplot2)
library(stargazer)
library(sandwich)
library(lmtest)   # waldtest; see also coeftest.
library(psych)
library(aod)
library(Rcpp)
library(Hmisc)
library(pastecs)
library(popbio)
library(rms)
library(Hmisc)
library(kableExtra)

# Load data 
datd <- read.csv("MLD Data File.csv", header = TRUE)
#Review selected data
glimpse(datd)

## Observations: 1,989
## Variables: 8
## $ MARRIED <fct> 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ GDLIN   <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, ...
## $ OBRAT   <dbl> 34.5, 34.1, 26.0, 37.0, 32.1, 33.0, 36.0, 37.0, 30.7, ...
## $ BLACK   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...
## $ HISPAN  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ MALE    <fct> ., 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ APPROVE <int> 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, ...
## $ LOANPRC <dbl> 0.7542373, 0.8000000, 0.8951049, 0.6000000, 0.8955224,...

kable(summary(datd))

MARRIED	`GDLIN </th>`	`OBRAT </th>`	`BLACK </th>`	`HISPAN </th>`	MALE	`APPROVE </th>`	`LOANPRC </th>`
.: 3	Min. : 0.000	Min. : 0.00	Min. :0.00000	Min. :0.00000	.: 15	Min. :0.0000	Min. :0.02105
0: 678	1st Qu.: 1.000	1st Qu.:28.00	1st Qu.:0.00000	1st Qu.:0.00000	0: 369	1st Qu.:1.0000	1st Qu.:0.70000
1:1308	Median : 1.000	Median :33.00	Median :0.00000	Median :0.00000	1:1605	Median :1.0000	Median :0.80000
NA	Mean : 1.583	Mean :32.39	Mean :0.09904	Mean :0.05581	NA	Mean :0.8773	Mean :0.77064
NA	3rd Qu.: 1.000	3rd Qu.:37.00	3rd Qu.:0.00000	3rd Qu.:0.00000	NA	3rd Qu.:1.0000	3rd Qu.:0.89894
NA	Max. :666.000	Max. :95.00	Max. :1.00000	Max. :1.00000	NA	Max. :1.0000	Max. :2.57143

Data Discription

In light of the relatively small number of mortgage loan applications made by minorities, these extra variables were collected for all applications by blacks and Hispanics and for a random sample. The data set includes the following variables.

APPROVE = 1 if mortgage loan was approved, = 0 otherwise
GDLIN = 1 if credit history meets guidelines, = 0 otherwise
LOANPRC = loan amount/purchase price
OBRAT = other obligations as a percent of total income
MALE = 1 if male, = 0 otherwise
MARRIED = 1 if married, = 0 otherwise
BLACK = 1 if black, = 0 otherwise
HISPAN = 1 if Hispanic, = 0 otherwise

Remove 20 na’s obs based on above summary

#filter data 
dats <-
  datd %>%
  filter(GDLIN != 666) %>%
  filter(MALE != ".") %>%
  filter(MARRIED != ".") %>%
  mutate(LOANPRC = LOANPRC*100)

# Change these to factors
# dats$MARRIED<- as.factor(as.character(dats$MARRIED))
# dats$APPROVE<- as.factor(as.character(dats$APPROVE))
# dats$MALE<- as.factor(as.character(dats$MALE))
# dats$GDLIN<-  as.factor(dats$GDLIN)
# dats$BLACK<-  as.factor(dats$BLACK)
# dats$HISPAN<-  as.factor(dats$HISPAN)

#Removed 20 ob
#Changed it to integer

dats$MARRIED<- as.integer(as.character(dats$MARRIED))
dats$MALE<- as.integer(as.character(dats$MALE))

#dats$GDLIN<-  as.factor(dats$GDLIN)
#attach the file for use
attach(dats)

#Review selected data
glimpse(dats)

## Observations: 1,969
## Variables: 8
## $ MARRIED <int> 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ GDLIN   <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, ...
## $ OBRAT   <dbl> 34.1, 26.0, 37.0, 32.1, 33.0, 36.0, 37.0, 30.7, 49.0, ...
## $ BLACK   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...
## $ HISPAN  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ MALE    <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ APPROVE <int> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, ...
## $ LOANPRC <dbl> 80.00000, 89.51049, 60.00000, 89.55224, 80.43478, 89.8...

kable(summary(dats))

`MARRIED </th>`	`GDLIN </th>`	`OBRAT </th>`	`BLACK </th>`	`HISPAN </th>`	`MALE </th>`	`APPROVE </th>`	`LOANPRC </th>`
Min. :0.0000	Min. :0.0000	Min. : 0.00	Min. :0.00000	Min. :0.00000	Min. :0.0000	Min. :0.0000	Min. : 2.105
1st Qu.:0.0000	1st Qu.:1.0000	1st Qu.:28.00	1st Qu.:0.00000	1st Qu.:0.00000	1st Qu.:1.0000	1st Qu.:1.0000	1st Qu.: 70.000
Median :1.0000	Median :1.0000	Median :33.00	Median :0.00000	Median :0.00000	Median :1.0000	Median :1.0000	Median : 80.000
Mean :0.6592	Mean :0.9132	Mean :32.39	Mean :0.09903	Mean :0.05485	Mean :0.8131	Mean :0.8761	Mean : 77.032
3rd Qu.:1.0000	3rd Qu.:1.0000	3rd Qu.:37.00	3rd Qu.:0.00000	3rd Qu.:0.00000	3rd Qu.:1.0000	3rd Qu.:1.0000	3rd Qu.: 89.888
Max. :1.0000	Max. :1.0000	Max. :95.00	Max. :1.00000	Max. :1.00000	Max. :1.0000	Max. :1.0000	Max. :257.143

Review GGpair Matrix

ggpairs(dats,lower = list(continuous="smooth"))

Review Stats of each variables

##The describe() function which is part of the Hmisc package displays the following additional statistics:
#Number of rows,#Standard deviation,#Trimmed mean,#Mean absolute deviation,#Skewness,#Kurtosis,#Standard error
describe(dats)

## dats 
## 
##  8  Variables      1969  Observations
## ---------------------------------------------------------------------------
## MARRIED 
##        n  missing distinct     Info      Sum     Mean      Gmd 
##     1969        0        2    0.674     1298   0.6592   0.4495 
## 
## ---------------------------------------------------------------------------
## GDLIN 
##        n  missing distinct     Info      Sum     Mean      Gmd 
##     1969        0        2    0.238     1798   0.9132   0.1587 
## 
## ---------------------------------------------------------------------------
## OBRAT 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1969        0      288    0.999    32.39    8.772    18.16    22.00 
##      .25      .50      .75      .90      .95 
##    28.00    33.00    37.00    40.52    44.00 
## 
## lowest :  0.00  4.00  5.60  6.99  7.00, highest: 73.00 75.00 78.00 83.00 95.00
## ---------------------------------------------------------------------------
## BLACK 
##        n  missing distinct     Info      Sum     Mean      Gmd 
##     1969        0        2    0.268      195  0.09904   0.1785 
## 
## ---------------------------------------------------------------------------
## HISPAN 
##        n  missing distinct     Info      Sum     Mean      Gmd 
##     1969        0        2    0.156      108  0.05485   0.1037 
## 
## ---------------------------------------------------------------------------
## MALE 
##        n  missing distinct     Info      Sum     Mean      Gmd 
##     1969        0        2    0.456     1601   0.8131   0.3041 
## 
## ---------------------------------------------------------------------------
## APPROVE 
##        n  missing distinct     Info      Sum     Mean      Gmd 
##     1969        0        2    0.326     1725   0.8761   0.2172 
## 
## ---------------------------------------------------------------------------
## LOANPRC 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1969        0     1096    0.998    77.03    18.91    40.00    52.38 
##      .25      .50      .75      .90      .95 
##    70.00    80.00    89.89    94.45    95.45 
## 
## lowest :   2.10526   8.77193   9.09091   9.33032  11.01322
## highest: 164.70590 183.33330 220.00000 255.52480 257.14290
## ---------------------------------------------------------------------------

Review Variance for anything that may jump out

##The stat.desc() function which is part of the pastecs package displays the following additional statistics:
#Variance,#Coefficient of variation,#Confidence interval for mean
kable(stat.desc(dats))

	MARRIED	GDLIN	OBRAT	BLACK	HISPAN	MALE	APPROVE	LOANPRC
nbr.val	1969.0000000	1969.0000000	1.969000e+03	1969.0000000	1969.0000000	1969.0000000	1969.0000000	1.969000e+03
nbr.null	671.0000000	171.0000000	1.000000e+00	1774.0000000	1861.0000000	368.0000000	244.0000000	0.000000e+00
nbr.na	0.0000000	0.0000000	0.000000e+00	0.0000000	0.0000000	0.0000000	0.0000000	0.000000e+00
min	0.0000000	0.0000000	0.000000e+00	0.0000000	0.0000000	0.0000000	0.0000000	2.105260e+00
max	1.0000000	1.0000000	9.500000e+01	1.0000000	1.0000000	1.0000000	1.0000000	2.571429e+02
range	1.0000000	1.0000000	9.500000e+01	1.0000000	1.0000000	1.0000000	1.0000000	2.550376e+02
sum	1298.0000000	1798.0000000	6.377929e+04	195.0000000	108.0000000	1601.0000000	1725.0000000	1.516761e+05
median	1.0000000	1.0000000	3.300000e+01	0.0000000	0.0000000	1.0000000	1.0000000	8.000000e+01
mean	0.6592179	0.9131539	3.239172e+01	0.0990350	0.0548502	0.8131031	0.8760792	7.703204e+01
SE.mean	0.0106842	0.0063480	1.865833e-01	0.0067334	0.0051325	0.0087874	0.0074273	4.270414e-01
CI.mean.0.95	0.0209535	0.0124494	3.659215e-01	0.0132054	0.0100657	0.0172336	0.0145662	8.375009e-01
var	0.2247638	0.0793442	6.854741e+01	0.0892724	0.0518680	0.1520437	0.1086196	3.590755e+02
std.dev	0.4740926	0.2816810	8.279336e+00	0.2987849	0.2277454	0.3899278	0.3295748	1.894929e+01
coef.var	0.7191744	0.3084704	2.556004e-01	3.0169618	4.1521365	0.4795551	0.3761930	2.459923e-01

Histogtam of the Variables

#Lot more married obs 
plot_ly(x = dats$MARRIED,
             type = "histogram")

#Small amount of obs that desn't meet credit guideline
plot_ly(x = dats$GDLIN,
             type = "histogram")

#Right Skewed with 36 in the middle 
plot_ly(x = dats$OBRAT,
             type = "histogram")

#Small amount of Black obs
plot_ly(x = dats$BLACK,
             type = "histogram")

#Small amount of Hispanic obs
plot_ly(x = dats$HISPAN,
             type = "histogram")

#Mostly Males
plot_ly(x = dats$MALE,
             type = "histogram")

#Mostly Approved
plot_ly(x = dats$APPROVE,
             type = "histogram")

#LOANPRC = loan amount/purchase price
#.80 is 20% of the data, .9 is 10% of the data
plot_ly(x = dats$LOANPRC,
             type = "histogram")

Quantile-Quantile (QQ) Plots

# after 2 its not normal 
ggplot(dats, aes(sample = LOANPRC)) + 
  geom_qq() +
  ggtitle("Q-Q Plot for LOANPRC")

#looks normally distrubuted
ggplot(dats, aes(sample = OBRAT)) + 
  geom_qq() +
  ggtitle("Q-Q Plot for OBRAT")

Visulize the relationship & Review Logit relationships

# print the two graph side-by-side - Approve
par(mfrow=c(1,2))
logi.hist.plot(dats$LOANPRC,dats$APPROVE,logi.mod=1, mainlabel="LoanPRC&Approve",boxp=FALSE,type="hist",col="gray")
logi.hist.plot(dats$OBRAT,dats$APPROVE,logi.mod=1, mainlabel="OBRAT&Approve",boxp=FALSE,type="hist",col="gray")

# print the two graph side-by-side - BLACK
par(mfrow=c(1,2))
logi.hist.plot(dats$LOANPRC,dats$BLACK,logi.mod=1, mainlabel="LoanPRC&BLACK",boxp=FALSE,type="hist",col="gray")
logi.hist.plot(dats$OBRAT,dats$BLACK,logi.mod=1, mainlabel="OBRAT&BLACK",boxp=FALSE,type="hist",col="gray")

# print the two graph side-by-side - Hispanic
par(mfrow=c(1,2))
logi.hist.plot(dats$LOANPRC,dats$HISPAN,logi.mod=1, mainlabel="LoanPRC&Hispanic",boxp=FALSE,type="hist",col="gray")
logi.hist.plot(dats$OBRAT,dats$HISPAN,logi.mod=1, mainlabel="OBRAT&Hispanic",boxp=FALSE,type="hist",col="gray")

Black relationshps and Logit interactions

# Loan to Purchase Ratio, Other obligations and Approval side-by-side - BLACK
par(mfrow=c(1,2))
logi.hist.plot(dats$LOANPRC,(dats$APPROVE*dats$BLACK),logi.mod=1, mainlabel="LoanPRC&Approve",boxp=FALSE,type="hist",col="gray")
logi.hist.plot(dats$OBRAT,(dats$APPROVE*dats$BLACK),logi.mod=1, mainlabel="OBRAT&Approve",boxp=FALSE,type="hist",col="gray")

# Loan to Purchase Ratio, Other obligations and Approval and meets guidline side-by-side - BLACK
par(mfrow=c(1,2))
logi.hist.plot(dats$LOANPRC,(dats$APPROVE*dats$BLACK*dats$GDLIN),logi.mod=1, mainlabel="LoanPRC&Approve",boxp=FALSE,type="hist",col="gray")
logi.hist.plot(dats$OBRAT,(dats$APPROVE*dats$BLACK*dats$GDLIN),logi.mod=1, mainlabel="OBRAT&Approve",boxp=FALSE,type="hist",col="gray")

# Loan to Purchase Ratio, Other obligations and Approval and meets guidline and male side-by-side - BLACK
par(mfrow=c(1,2))
logi.hist.plot(dats$LOANPRC,(dats$APPROVE*dats$BLACK*dats$GDLIN*dats$MALE ),logi.mod=1, mainlabel="LoanPRC&Approve",boxp=FALSE,type="hist",col="gray")
logi.hist.plot(dats$OBRAT,(dats$APPROVE*dats$BLACK*dats$GDLIN*dats$MALE),logi.mod=1, mainlabel="OBRAT&Approve",boxp=FALSE,type="hist",col="gray")

# Loan to Purchase Ratio, Other obligations and Approval and meets guidline and male and marriedside-by-side - BLACK
par(mfrow=c(1,2))
logi.hist.plot(dats$LOANPRC,(dats$APPROVE*dats$BLACK*dats$GDLIN*dats$MALE*dats$MARRIED),logi.mod=1, mainlabel="LoanPRC&Approve&",boxp=FALSE,type="hist",col="gray")
logi.hist.plot(dats$OBRAT,(dats$APPROVE*dats$BLACK*dats$GDLIN*dats$MALE*dats$MARRIED),logi.mod=1, mainlabel="OBRAT&Approve",boxp=FALSE,type="hist",col="gray")

Hispanic relationshps and Logit interactions

# Loan to Purchase Ratio, Other obligations and Approval side-by-side - Hispanic
par(mfrow=c(1,2))
logi.hist.plot(dats$LOANPRC,(dats$APPROVE*dats$HISPAN),logi.mod=1, mainlabel="LoanPRC&Approve",boxp=FALSE,type="hist",col="gray")
logi.hist.plot(dats$OBRAT,(dats$APPROVE*dats$HISPAN),logi.mod=1, mainlabel="OBRAT&Approve",boxp=FALSE,type="hist",col="gray")

# Loan to Purchase Ratio, Other obligations and Approval and meets guidline side-by-side - Hispanic
par(mfrow=c(1,2))
logi.hist.plot(dats$LOANPRC,(dats$APPROVE*dats$HISPAN*dats$GDLIN),logi.mod=1, mainlabel="LoanPRC&Approve",boxp=FALSE,type="hist",col="gray")
logi.hist.plot(dats$OBRAT,(dats$APPROVE*dats$HISPAN*dats$GDLIN),logi.mod=1, mainlabel="OBRAT&Approve",boxp=FALSE,type="hist",col="gray")

# Loan to Purchase Ratio, Other obligations and Approval and meets guidline and male side-by-side - Hispanic
par(mfrow=c(1,2))
logi.hist.plot(dats$LOANPRC,(dats$APPROVE*dats$HISPAN*dats$GDLIN*dats$MALE ),logi.mod=1, mainlabel="LoanPRC&Approve",boxp=FALSE,type="hist",col="gray")
logi.hist.plot(dats$OBRAT,(dats$APPROVE*dats$HISPAN*dats$GDLIN*dats$MALE),logi.mod=1, mainlabel="OBRAT&Approve",boxp=FALSE,type="hist",col="gray")

# Loan to Purchase Ratio, Other obligations and Approval and meets guidline and male and marriedside-by-side - Hispanic
par(mfrow=c(1,2))
logi.hist.plot(dats$LOANPRC,(dats$APPROVE*dats$HISPAN*dats$GDLIN*dats$MALE*dats$MARRIED),logi.mod=1, mainlabel="LoanPRC&Approve&",boxp=FALSE,type="hist",col="gray")
logi.hist.plot(dats$OBRAT,(dats$APPROVE*dats$HISPAN*dats$GDLIN*dats$MALE*dats$MARRIED),logi.mod=1, mainlabel="OBRAT&Approve",boxp=FALSE,type="hist",col="gray")

EDA of different ethnicity

# filter out for only Black data
datb <-
  dats %>%
  filter(BLACK == 1) 
# filter out for only Hispanic data
dath <-
  dats %>%
  filter(HISPAN == 1)
# filter out for only Hispanic data
datw <-
  dats %>%
  filter(BLACK == 0) %>%
  filter(HISPAN == 0)

Descriptive Stats Table of different ethnicity

Black and Hispanic makes up relatively small proportion of the data set
Black and Hispanic has the lower approval rate compared to White

stargazer(datw, type = "html", title="Descriptive statistics - White Only", digits=2)

**Descriptive statistics - White Only**

Statistic	N	Mean	St. Dev.	Min	Pctl(25)	Pctl(75)	Max

MARRIED	1,666	0.66	0.47	0	0	1	1
GDLIN	1,666	0.94	0.24	0	1	1	1
OBRAT	1,666	32.03	8.23	0.00	27.60	36.60	95.00
BLACK	1,666	0.00	0.00	0	0	0	0
HISPAN	1,666	0.00	0.00	0	0	0	0
MALE	1,666	0.82	0.38	0	1	1	1
APPROVE	1,666	0.91	0.29	0	1	1	1
LOANPRC	1,666	75.65	19.01	2	68.1	89.6	257

stargazer(datb, type = "html", title="Descriptive statistics - Black Only", digits=2)

**Descriptive statistics - Black Only**

Statistic	N	Mean	St. Dev.	Min	Pctl(25)	Pctl(75)	Max

MARRIED	195	0.62	0.49	0	0	1	1
GDLIN	195	0.73	0.45	0	0	1	1
OBRAT	195	34.90	8.19	5.60	31.00	38.85	63.00
BLACK	195	1.00	0.00	1	1	1	1
HISPAN	195	0.00	0.00	0	0	0	0
MALE	195	0.74	0.44	0	0	1	1
APPROVE	195	0.67	0.47	0	0	1	1
LOANPRC	195	84.06	17.84	28.99	80.00	90.24	255.52

stargazer(dath, type = "html", title="Descriptive statistics - Hispanic Only", digits=2)

**Descriptive statistics - Hispanic Only**

Statistic	N	Mean	St. Dev.	Min	Pctl(25)	Pctl(75)	Max

MARRIED	108	0.71	0.45	0	0	1	1
GDLIN	108	0.85	0.36	0	1	1	1
OBRAT	108	33.47	8.46	14.60	29.00	38.33	62.00
BLACK	108	0.00	0.00	0	0	0	0
HISPAN	108	1.00	0.00	1	1	1	1
MALE	108	0.80	0.40	0	1	1	1
APPROVE	108	0.76	0.43	0	1	1	1
LOANPRC	108	85.63	14.50	40	80	90.5	163

logit models comparison

Black only model has only one (meets guidelines) significant variable

## fit ordered logit model
glm1 <- glm(APPROVE ~ MALE + GDLIN + LOANPRC + OBRAT + MARRIED + BLACK + HISPAN, data = dats, family = binomial(link="logit"))
glm2 <- glm(APPROVE ~ GDLIN + LOANPRC + OBRAT + MARRIED + BLACK + HISPAN, data = dats, family = binomial(link="logit"))
glm3 <- glm(APPROVE ~ GDLIN + LOANPRC + OBRAT + BLACK + HISPAN, data = dats, family = binomial(link="logit"))
glmw <- glm(APPROVE ~ GDLIN + LOANPRC + OBRAT + MARRIED + BLACK + HISPAN, data = datw, family = binomial(link="logit"))
glmb <- glm(APPROVE ~ GDLIN + LOANPRC + OBRAT + MARRIED + BLACK + HISPAN, data = datb, family = binomial(link="logit"))
glmh <- glm(APPROVE ~ GDLIN + LOANPRC + OBRAT + MARRIED + BLACK + HISPAN, data = dath, family = binomial(link="logit"))

## view a summary of the Full model
summary(glm1)

## 
## Call:
## glm(formula = APPROVE ~ MALE + GDLIN + LOANPRC + OBRAT + MARRIED + 
##     BLACK + HISPAN, family = binomial(link = "logit"), data = dats)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8933   0.2445   0.3128   0.3742   2.3261  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.381531   0.591656   2.335 0.019542 *  
## MALE        -0.053947   0.234573  -0.230 0.818107    
## GDLIN        3.719269   0.217169  17.126  < 2e-16 ***
## LOANPRC     -0.016812   0.005074  -3.313 0.000922 ***
## OBRAT       -0.034074   0.010310  -3.305 0.000950 ***
## MARRIED      0.475742   0.192005   2.478 0.013221 *  
## BLACK       -0.815693   0.240177  -3.396 0.000683 ***
## HISPAN      -0.900010   0.310585  -2.898 0.003758 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1475.4  on 1968  degrees of freedom
## Residual deviance:  959.4  on 1961  degrees of freedom
## AIC: 975.4
## 
## Number of Fisher Scoring iterations: 6

#Removed Male since there is no significance 
summary(glm2)

## 
## Call:
## glm(formula = APPROVE ~ GDLIN + LOANPRC + OBRAT + MARRIED + BLACK + 
##     HISPAN, family = binomial(link = "logit"), data = dats)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8943   0.2440   0.3128   0.3748   2.3178  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.342387   0.566745   2.369 0.017856 *  
## GDLIN        3.721387   0.216988  17.150  < 2e-16 ***
## LOANPRC     -0.016773   0.005075  -3.305 0.000950 ***
## OBRAT       -0.034098   0.010310  -3.307 0.000942 ***
## MARRIED      0.460930   0.181039   2.546 0.010896 *  
## BLACK       -0.811426   0.239545  -3.387 0.000706 ***
## HISPAN      -0.897331   0.310374  -2.891 0.003839 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1475.43  on 1968  degrees of freedom
## Residual deviance:  959.46  on 1962  degrees of freedom
## AIC: 973.46
## 
## Number of Fisher Scoring iterations: 6

# Removed Married since there is small significance
summary(glm3)

## 
## Call:
## glm(formula = APPROVE ~ GDLIN + LOANPRC + OBRAT + BLACK + HISPAN, 
##     family = binomial(link = "logit"), data = dats)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8328   0.2539   0.3196   0.3694   2.3016  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.668049   0.552038   3.022 0.002514 ** 
## GDLIN        3.694612   0.215000  17.184  < 2e-16 ***
## LOANPRC     -0.016585   0.005054  -3.282 0.001032 ** 
## OBRAT       -0.035117   0.010329  -3.400 0.000674 ***
## BLACK       -0.817685   0.239220  -3.418 0.000631 ***
## HISPAN      -0.858277   0.309369  -2.774 0.005532 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1475.43  on 1968  degrees of freedom
## Residual deviance:  965.87  on 1963  degrees of freedom
## AIC: 977.87
## 
## Number of Fisher Scoring iterations: 6

# White only data
summary(glmw)

## 
## Call:
## glm(formula = APPROVE ~ GDLIN + LOANPRC + OBRAT + MARRIED + BLACK + 
##     HISPAN, family = binomial(link = "logit"), data = datw)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.9674   0.2348   0.3007   0.3537   2.0305  
## 
## Coefficients: (2 not defined because of singularities)
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.421414   0.665949   2.134 0.032809 *  
## GDLIN        3.715685   0.255150  14.563  < 2e-16 ***
## LOANPRC     -0.021921   0.005974  -3.670 0.000243 ***
## OBRAT       -0.026281   0.011723  -2.242 0.024977 *  
## MARRIED      0.598925   0.208713   2.870 0.004110 ** 
## BLACK              NA         NA      NA       NA    
## HISPAN             NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1026.72  on 1665  degrees of freedom
## Residual deviance:  729.19  on 1661  degrees of freedom
## AIC: 739.19
## 
## Number of Fisher Scoring iterations: 6

# Black Only data
summary(glmb)

## 
## Call:
## glm(formula = APPROVE ~ GDLIN + LOANPRC + OBRAT + MARRIED + BLACK + 
##     HISPAN, family = binomial(link = "logit"), data = datb)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1675  -0.4442   0.4628   0.5179   2.3954  
## 
## Coefficients: (2 not defined because of singularities)
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.059865   1.551987  -0.683    0.495    
## GDLIN        4.034585   0.511059   7.895 2.91e-15 ***
## LOANPRC      0.003034   0.013786   0.220    0.826    
## OBRAT       -0.037604   0.029360  -1.281    0.200    
## MARRIED      0.127730   0.467598   0.273    0.785    
## BLACK              NA         NA      NA       NA    
## HISPAN             NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 246.83  on 194  degrees of freedom
## Residual deviance: 139.52  on 190  degrees of freedom
## AIC: 149.52
## 
## Number of Fisher Scoring iterations: 5

# Hispanic only data
summary(glmh)

## 
## Call:
## glm(formula = APPROVE ~ GDLIN + LOANPRC + OBRAT + MARRIED + BLACK + 
##     HISPAN, family = binomial(link = "logit"), data = dath)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8283   0.1451   0.3957   0.5731   1.8769  
## 
## Coefficients: (2 not defined because of singularities)
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  5.04265    2.52380   1.998  0.04571 *  
## GDLIN        3.12876    0.76075   4.113 3.91e-05 ***
## LOANPRC     -0.02755    0.02160  -1.276  0.20198    
## OBRAT       -0.10490    0.03873  -2.709  0.00675 ** 
## MARRIED     -0.41226    0.68384  -0.603  0.54661    
## BLACK             NA         NA      NA       NA    
## HISPAN            NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 119.217  on 107  degrees of freedom
## Residual deviance:  79.134  on 103  degrees of freedom
## AIC: 89.134
## 
## Number of Fisher Scoring iterations: 5

#Compare the full data set
anova(glm1, glm2, glm3)

#Compare all the modles 
stargazer(glm1, glm2, glm3, glmw, glmb, glmh, type = "html", 
          column.labels = c("full Data", "white only", 
                            "black only", "hispanic only"), column.separate = c(3, 1,1,1))


	Dependent variable:

	APPROVE
	full Data			white only	black only	hispanic only
	(1)	(2)	(3)	(4)	(5)	(6)

MALE	-0.054
	(0.235)

GDLIN	3.719^***	3.721^***	3.695^***	3.716^***	4.035^***	3.129^***
	(0.217)	(0.217)	(0.215)	(0.255)	(0.511)	(0.761)

LOANPRC	-0.017^***	-0.017^***	-0.017^***	-0.022^***	0.003	-0.028
	(0.005)	(0.005)	(0.005)	(0.006)	(0.014)	(0.022)

OBRAT	-0.034^***	-0.034^***	-0.035^***	-0.026^**	-0.038	-0.105^***
	(0.010)	(0.010)	(0.010)	(0.012)	(0.029)	(0.039)

MARRIED	0.476^**	0.461^**		0.599^***	0.128	-0.412
	(0.192)	(0.181)		(0.209)	(0.468)	(0.684)

BLACK	-0.816^***	-0.811^***	-0.818^***
	(0.240)	(0.240)	(0.239)

HISPAN	-0.900^***	-0.897^***	-0.858^***
	(0.311)	(0.310)	(0.309)

Constant	1.382^**	1.342^**	1.668^***	1.421^**	-1.060	5.043^**
	(0.592)	(0.567)	(0.552)	(0.666)	(1.552)	(2.524)


Observations	1,969	1,969	1,969	1,666	195	108
Log Likelihood	-479.702	-479.728	-482.936	-364.596	-69.759	-39.567
Akaike Inf. Crit.	975.403	973.456	977.873	739.191	149.518	89.134

Note:	p<0.1; p<0.05; p<0.01

Odds Ratio for all 6 models

OR <- function(x) exp(x)
stargazer(glm1, glm2, glm3, type="html", apply.coef = OR, 
          title = "Odds Ratio with full data set")

**Odds Ratio with full data set**

	Dependent variable:

	APPROVE
	(1)	(2)	(3)

MALE	0.947^***
	(0.235)

GDLIN	41.234^***	41.322^***	40.230^***
	(0.217)	(0.217)	(0.215)

LOANPRC	0.983^***	0.983^***	0.984^***
	(0.005)	(0.005)	(0.005)

OBRAT	0.967^***	0.966^***	0.965^***
	(0.010)	(0.010)	(0.010)

MARRIED	1.609^***	1.586^***
	(0.192)	(0.181)

BLACK	0.442^*	0.444^*	0.441^*
	(0.240)	(0.240)	(0.239)

HISPAN	0.407	0.408	0.424
	(0.311)	(0.310)	(0.309)

Constant	3.981^***	3.828^***	5.302^***
	(0.592)	(0.567)	(0.552)


Observations	1,969	1,969	1,969
Log Likelihood	-479.702	-479.728	-482.936
Akaike Inf. Crit.	975.403	973.456	977.873

Note:	p<0.1; p<0.05; p<0.01

stargazer(glmw, glmb, glmh, type="html",
          title = "Odds Ratio with different ethnicity",
          apply.coef = OR, column.labels = c("white only", 
                                             "black only", "hispanic only"))

**Odds Ratio with different ethnicity**

	Dependent variable:

	APPROVE
	white only	black only	hispanic only
	(1)	(2)	(3)

GDLIN	41.087^***	56.519^***	22.846^***
	(0.255)	(0.511)	(0.761)

LOANPRC	0.978^***	1.003^***	0.973^***
	(0.006)	(0.014)	(0.022)

OBRAT	0.974^***	0.963^***	0.900^***
	(0.012)	(0.029)	(0.039)

MARRIED	1.820^***	1.136^**	0.662
	(0.209)	(0.468)	(0.684)

BLACK


HISPAN


Constant	4.143^***	0.347	154.881^***
	(0.666)	(1.552)	(2.524)


Observations	1,666	195	108
Log Likelihood	-364.596	-69.759	-39.567
Akaike Inf. Crit.	739.191	149.518	89.134

Note:	p<0.1; p<0.05; p<0.01

stargazer(glm1, glm2, glm3, glmw, glmb, glmh, type="html",
          title = "Logistic Regression with t stas for significance",
          report = "vct*", column.labels = c("full Data", "white only",
                                             "black only", "hispanic only"), 
          column.separate = c(3, 1,1,1))

**Logistic Regression with t stas for significance**

	Dependent variable:

	APPROVE
	full Data			white only	black only	hispanic only
	(1)	(2)	(3)	(4)	(5)	(6)

MALE	-0.054
	t = -0.230

GDLIN	3.719	3.721	3.695	3.716	4.035	3.129
	t = 17.126^***	t = 17.150^***	t = 17.184^***	t = 14.563^***	t = 7.895^***	t = 4.113^***

LOANPRC	-0.017	-0.017	-0.017	-0.022	0.003	-0.028
	t = -3.313^***	t = -3.305^***	t = -3.282^***	t = -3.670^***	t = 0.220	t = -1.276

OBRAT	-0.034	-0.034	-0.035	-0.026	-0.038	-0.105
	t = -3.305^***	t = -3.307^***	t = -3.400^***	t = -2.242^**	t = -1.281	t = -2.709^***

MARRIED	0.476	0.461		0.599	0.128	-0.412
	t = 2.478^**	t = 2.546^**		t = 2.870^***	t = 0.273	t = -0.603

BLACK	-0.816	-0.811	-0.818
	t = -3.396^***	t = -3.387^***	t = -3.418^***

HISPAN	-0.900	-0.897	-0.858
	t = -2.898^***	t = -2.891^***	t = -2.774^***

Constant	1.382	1.342	1.668	1.421	-1.060	5.043
	t = 2.335^**	t = 2.369^**	t = 3.022^***	t = 2.134^**	t = -0.683	t = 1.998^**


Observations	1,969	1,969	1,969	1,666	195	108
Log Likelihood	-479.702	-479.728	-482.936	-364.596	-69.759	-39.567
Akaike Inf. Crit.	975.403	973.456	977.873	739.191	149.518	89.134

Note:	p<0.1; p<0.05; p<0.01

stargazer(
exp(cbind(OR = coef(glm1), confint(glm1))),
exp(cbind(OR = coef(glm2), confint(glm2))),
exp(cbind(OR = coef(glm3), confint(glm3))),
exp(cbind(OR = coef(glmw), confint(glmw))),
exp(cbind(OR = coef(glmb), confint(glmb))),
exp(cbind(OR = coef(glmh), confint(glmh))),
type="html")


	OR	2.5 %	97.5 %

(Intercept)	3.981	1.255	12.840
MALE	0.947	0.592	1.488
GDLIN	41.234	27.194	63.810
LOANPRC	0.983	0.973	0.993
OBRAT	0.967	0.947	0.986
MARRIED	1.609	1.103	2.343
BLACK	0.442	0.278	0.715
HISPAN	0.407	0.226	0.766


	OR	2.5 %	97.5 %

(Intercept)	3.828	1.267	11.772
GDLIN	41.322	27.261	63.923
LOANPRC	0.983	0.973	0.993
OBRAT	0.966	0.947	0.986
MARRIED	1.586	1.111	2.261
BLACK	0.444	0.280	0.717
HISPAN	0.408	0.227	0.768


	OR	2.5 %	97.5 %

(Intercept)	5.302	1.811	15.872
GDLIN	40.230	26.637	61.975
LOANPRC	0.984	0.974	0.993
OBRAT	0.965	0.946	0.985
BLACK	0.441	0.278	0.712
HISPAN	0.424	0.236	0.797


	OR	2.5 %	97.5 %

(Intercept)	4.143	1.126	15.466
GDLIN	41.087	25.193	68.672
LOANPRC	0.978	0.967	0.990
OBRAT	0.974	0.952	0.997
MARRIED	1.820	1.208	2.744
BLACK
HISPAN


	OR	2.5 %	97.5 %

(Intercept)	0.347	0.014	5.933
GDLIN	56.519	22.228	168.802
LOANPRC	1.003	0.981	1.034
OBRAT	0.963	0.908	1.019
MARRIED	1.136	0.446	2.844
BLACK
HISPAN


	OR	2.5 %	97.5 %

(Intercept)	154.881	1.327	30,628.190
GDLIN	22.846	5.732	121.709
LOANPRC	0.973	0.929	1.012
OBRAT	0.900	0.828	0.967
MARRIED	0.662	0.156	2.382
BLACK
HISPAN

Likelihood Ratio Test

lrtest(glm2, glm1)

## 
## Model 1: APPROVE ~ GDLIN + LOANPRC + OBRAT + MARRIED + BLACK + HISPAN
## Model 2: APPROVE ~ MALE + GDLIN + LOANPRC + OBRAT + MARRIED + BLACK + 
##     HISPAN
## 
## L.R. Chisq       d.f.          P 
## 0.05313217 1.00000000 0.81770003

LR.s.1 <- glm2$deviance - glm1$deviance
LR.s.2 <- -2*logLik(glm2)[1] - (-2*logLik(glm1)[1])
pchisq(LR.s.1, 1, lower.tail = FALSE)

## [1] 0.8177

pchisq(LR.s.1, 2, lower.tail = FALSE)

## [1] 0.9737837

lrm(glm2)

## Logistic Regression Model
##  
##  lrm(formula = glm2)
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs          1969    LR chi2     515.97    R2       0.437    C       0.846    
##   0            244    d.f.             6    g        1.252    Dxy     0.692    
##   1           1725    Pr(> chi2) <0.0001    gr       3.498    gamma   0.693    
##  max |deriv| 8e-10                          gp       0.147    tau-a   0.150    
##                                             Brier    0.066                     
##  
##            Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept  1.3424 0.5667  2.37  0.0179  
##  GDLIN      3.7214 0.2170 17.15  <0.0001 
##  LOANPRC   -0.0168 0.0051 -3.31  0.0009  
##  OBRAT     -0.0341 0.0103 -3.31  0.0009  
##  MARRIED    0.4609 0.1810  2.55  0.0109  
##  BLACK     -0.8114 0.2395 -3.39  0.0007  
##  HISPAN    -0.8973 0.3104 -2.89  0.0038  
##

lrm(glm1)

## Logistic Regression Model
##  
##  lrm(formula = glm1)
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs          1969    LR chi2     516.03    R2       0.437    C       0.846    
##   0            244    d.f.             7    g        1.253    Dxy     0.693    
##   1           1725    Pr(> chi2) <0.0001    gr       3.501    gamma   0.693    
##  max |deriv| 8e-10                          gp       0.147    tau-a   0.150    
##                                             Brier    0.066                     
##  
##            Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept  1.3815 0.5917  2.34  0.0195  
##  MALE      -0.0539 0.2346 -0.23  0.8181  
##  GDLIN      3.7193 0.2172 17.13  <0.0001 
##  LOANPRC   -0.0168 0.0051 -3.31  0.0009  
##  OBRAT     -0.0341 0.0103 -3.31  0.0009  
##  MARRIED    0.4757 0.1920  2.48  0.0132  
##  BLACK     -0.8157 0.2402 -3.40  0.0007  
##  HISPAN    -0.9000 0.3106 -2.90  0.0038  
##

Visulizing Model Fit Analysis

Model with male and married committed seems to be the best fit.

#FUll Model 
par(mfrow=c(2,2)) # init 4 charts in 1 panel
plot(glm1)

#Ommited Gender
par(mfrow=c(2,2)) # init 4 charts in 1 panel
plot(glm2)

#Ommited Gender and Married
par(mfrow=c(2,2)) # init 4 charts in 1 panel
plot(glm3)

Correctly classified observations

###Correctly classified observations
#all the same
stargazer(mean((glm1$fitted.values>=0.5)==dats$APPROVE),
mean((glm2$fitted.values>=0.5)==dats$APPROVE),
mean((glm3$fitted.values>=0.5)==dats$APPROVE), type = "text")

===== 0.924 —–

Confusion matrix

Same outcome

###Confusion matrix count

#FUll Model
RP=sum((glm1$fitted.values>=0.5)==dats$APPROVE & dats$APPROVE==1)
FP=sum((glm1$fitted.values>=0.5)!=dats$APPROVE & dats$APPROVE==0)
RN=sum((glm1$fitted.values>=0.5)==dats$APPROVE & dats$APPROVE==0)
FN=sum((glm1$fitted.values>=0.5)!=dats$APPROVE & dats$APPROVE==1)
confMat1<-matrix(c(RP,FP,FN,RN),ncol = 2)
colnames(confMat1)<-c("Pred Approved","Pred not Approved")
rownames(confMat1)<-c("Real Approved","Real not Approved")
#Ommited Gender
RP=sum((glm2$fitted.values>=0.5)==dats$APPROVE & dats$APPROVE==1)
FP=sum((glm2$fitted.values>=0.5)!=dats$APPROVE & dats$APPROVE==0)
RN=sum((glm2$fitted.values>=0.5)==dats$APPROVE & dats$APPROVE==0)
FN=sum((glm2$fitted.values>=0.5)!=dats$APPROVE & dats$APPROVE==1)
confMat2<-matrix(c(RP,FP,FN,RN),ncol = 2)
colnames(confMat2)<-c("Pred Approved","Pred not Approved")
rownames(confMat2)<-c("Real Approved","Real not Approved")
#Ommited Gender and Married
RP=sum((glm3$fitted.values>=0.5)==dats$APPROVE & dats$APPROVE==1)
FP=sum((glm3$fitted.values>=0.5)!=dats$APPROVE & dats$APPROVE==0)
RN=sum((glm3$fitted.values>=0.5)==dats$APPROVE & dats$APPROVE==0)
FN=sum((glm3$fitted.values>=0.5)!=dats$APPROVE & dats$APPROVE==1)
confMat3<-matrix(c(RP,FP,FN,RN),ncol = 2)
colnames(confMat3)<-c("Pred Approved","Pred not Approved")
rownames(confMat3)<-c("Real Approved","Real not Approved")
###Confusion matrix count for the 3 Models
kable(
  rbind(
    confMat1,
    confMat2,
    confMat3))

	Pred Approved	Pred not Approved
Real Approved	1686	39
Real not Approved	111	133
Real Approved	1686	39
Real not Approved	111	133
Real Approved	1686	39
Real not Approved	111	133

###Confusion matrix proportion

#FUll Model
RPR=RP/sum(dats$APPROVE==1)*100
FPR=FP/sum(dats$APPROVE==0)*100
RNR=RN/sum(dats$APPROVE==0)*100
FNR=FN/sum(dats$APPROVE==1)*100
confMat1<-matrix(c(RPR,FPR,FNR,RNR),ncol = 2)
colnames(confMat1)<-c("Pred Approved","Pred not Approved")
rownames(confMat1)<-c("Real Approved","Real not Approved")
#Ommited Gender
RPR=RP/sum(dats$APPROVE==1)*100
FPR=FP/sum(dats$APPROVE==0)*100
RNR=RN/sum(dats$APPROVE==0)*100
FNR=FN/sum(dats$APPROVE==1)*100
confMat2<-matrix(c(RPR,FPR,FNR,RNR),ncol = 2)
colnames(confMat2)<-c("Pred Approved","Pred not Approved")
rownames(confMat2)<-c("Real Approved","Real not Approved")
#Ommited Gender and Married
RPR=RP/sum(dats$APPROVE==1)*100
FPR=FP/sum(dats$APPROVE==0)*100
RNR=RN/sum(dats$APPROVE==0)*100
FNR=FN/sum(dats$APPROVE==1)*100
confMat3<-matrix(c(RPR,FPR,FNR,RNR),ncol = 2)
colnames(confMat3)<-c("Pred Approved","Pred not Approved")
rownames(confMat3)<-c("Real Approved","Real not Approved")

###Confusion matrix proportion
kable(
  rbind(
    confMat1,
    confMat2,
    confMat3))

	Pred Approved	Pred not Approved
Real Approved	97.73913	2.26087
Real not Approved	45.49180	54.50820
Real Approved	97.73913	2.26087
Real not Approved	45.49180	54.50820
Real Approved	97.73913	2.26087
Real not Approved	45.49180	54.50820

Prototypical Analysis

Both Probit and Logit prediction shows that Hispanics and Blacks are less likely to get approval to loans when compared to whites

#Estimate Logit Model
LogitModel = glm(APPROVE ~ OBRAT + BLACK + HISPAN, data = dats, 
                 family = "binomial")
summary(LogitModel)

## 
## Call:
## glm(formula = APPROVE ~ OBRAT + BLACK + HISPAN, family = "binomial", 
##     data = dats)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7549   0.3498   0.4226   0.4828   1.6930  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  4.164374   0.306121  13.604  < 2e-16 ***
## OBRAT       -0.056052   0.008426  -6.652 2.89e-11 ***
## BLACK       -1.458365   0.178007  -8.193 2.55e-16 ***
## HISPAN      -1.086268   0.245317  -4.428 9.51e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1475.4  on 1968  degrees of freedom
## Residual deviance: 1347.0  on 1965  degrees of freedom
## AIC: 1355
## 
## Number of Fisher Scoring iterations: 5

#Generate Odds Ratios
exp(coef(LogitModel))

## (Intercept)       OBRAT       BLACK      HISPAN 
##  64.3523600   0.9454897   0.2326164   0.3374738

#Define prototypical loan applicants (you will need more than 3)
prototype1 <- data.frame(OBRAT=mean(dats$OBRAT),BLACK = 1, HISPAN = 0)
prototype2 <- data.frame(OBRAT=mean(dats$OBRAT),BLACK = 0, HISPAN = 1)
prototype3 <- data.frame(OBRAT=mean(dats$OBRAT),BLACK = 0, HISPAN = 0)

#Predict probabilities for prototypical individuals
prototype1$predictedprob <- predict (LogitModel, newdata = prototype1, type ="response")
prototype2$predictedprob <- predict (LogitModel, newdata = prototype2, type ="response")
prototype3$predictedprob <- predict (LogitModel, newdata = prototype3, type ="response")
#logit Probability
kable(rbind( prototype1,
       prototype2,
       prototype3))

OBRAT	BLACK	HISPAN	predictedprob
32.39172	1	0	0.7089684
32.39172	0	1	0.7794520
32.39172	0	0	0.9128343

#Estimate Probit Model
ProbitModel = glm(APPROVE ~ OBRAT + BLACK + HISPAN, data = dats, 
                  family = "binomial" (link = "probit"))
summary(ProbitModel)

## 
## Call:
## glm(formula = APPROVE ~ OBRAT + BLACK + HISPAN, family = binomial(link = "probit"), 
##     data = dats)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8082   0.3507   0.4264   0.4875   1.4748  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  2.264355   0.161312  14.037  < 2e-16 ***
## OBRAT       -0.028262   0.004583  -6.166 7.00e-10 ***
## BLACK       -0.823087   0.103822  -7.928 2.23e-15 ***
## HISPAN      -0.588310   0.140902  -4.175 2.98e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1475.4  on 1968  degrees of freedom
## Residual deviance: 1350.3  on 1965  degrees of freedom
## AIC: 1358.3
## 
## Number of Fisher Scoring iterations: 5

#Predict probabilities for prototypical individuals
prototype1$predictedprob <- predict (ProbitModel, newdata = prototype1, type ="response")
prototype2$predictedprob <- predict (ProbitModel, newdata = prototype2, type ="response")
prototype3$predictedprob <- predict (ProbitModel, newdata = prototype3, type ="response")
# Probit Probability
kable(rbind(prototype1,
      prototype2,
      prototype3))

OBRAT	BLACK	HISPAN	predictedprob
32.39172	1	0	0.7004903
32.39172	0	1	0.7765483
32.39172	0	0	0.9113151

#Impose appropriate sample selection criteria here
MLDsubsample <- subset(dats, LOANPRC >= 50 & MALE != "." & MARRIED != "." & (GDLIN == 1 | GDLIN == 0))

 
#Estimate Logit Model (I am not necessarily recommending this model -- this is purely illustrative)
LogitModel <- glm(APPROVE ~ OBRAT + GDLIN + LOANPRC + MALE + MARRIED + BLACK + HISPAN, data = MLDsubsample, 
                 family = "binomial")
summary(LogitModel)

## 
## Call:
## glm(formula = APPROVE ~ OBRAT + GDLIN + LOANPRC + MALE + MARRIED + 
##     BLACK + HISPAN, family = "binomial", data = MLDsubsample)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8093   0.2653   0.3238   0.3836   2.3226  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.348569   0.633915   2.127 0.033390 *  
## OBRAT       -0.033061   0.010452  -3.163 0.001561 ** 
## GDLIN        3.708162   0.217682  17.035  < 2e-16 ***
## LOANPRC     -0.016723   0.005637  -2.967 0.003009 ** 
## MALE        -0.068301   0.237847  -0.287 0.773987    
## MARRIED      0.473623   0.193889   2.443 0.014576 *  
## BLACK       -0.820291   0.241013  -3.404 0.000665 ***
## HISPAN      -0.812085   0.316951  -2.562 0.010402 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1421.09  on 1812  degrees of freedom
## Residual deviance:  931.16  on 1805  degrees of freedom
## AIC: 947.16
## 
## Number of Fisher Scoring iterations: 5

#Generate Log-Likelihood
logLik(LogitModel)

## 'log Lik.' -465.5778 (df=8)

Probit Model Analysis

Stick with Logit Model since its easier to translate

# this gives main effects AND interactions
ad_aov <- aov(APPROVE ~ OBRAT * BLACK + HISPAN * OBRAT + LOANPRC * BLACK + LOANPRC * HISPAN, data = dats, family = "binomial" (link = "probit"))
# this would give ONLY main effects
ad_aov2 <- aov(APPROVE ~ OBRAT + BLACK + HISPAN + MARRIED + LOANPRC + GDLIN, data = dats, family = "binomial" (link = "probit"))
summary(ad_aov)

##                  Df Sum Sq Mean Sq F value   Pr(>F)    
## OBRAT             1   6.59   6.588  66.720 5.54e-16 ***
## BLACK             1   7.63   7.627  77.238  < 2e-16 ***
## HISPAN            1   1.97   1.972  19.973 8.30e-06 ***
## LOANPRC           1   1.99   1.986  20.113 7.72e-06 ***
## OBRAT:BLACK       1   0.28   0.284   2.877 0.090008 .  
## OBRAT:HISPAN      1   1.29   1.287  13.030 0.000314 ***
## BLACK:LOANPRC     1   0.19   0.194   1.965 0.161184    
## HISPAN:LOANPRC    1   0.30   0.295   2.988 0.084046 .  
## Residuals      1960 193.53   0.099                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(ad_aov2)

##               Df Sum Sq Mean Sq F value   Pr(>F)    
## OBRAT          1   6.59    6.59  99.693  < 2e-16 ***
## BLACK          1   7.63    7.63 115.410  < 2e-16 ***
## HISPAN         1   1.97    1.97  29.844 5.28e-08 ***
## MARRIED        1   0.61    0.61   9.247  0.00239 ** 
## LOANPRC        1   1.92    1.92  29.050 7.90e-08 ***
## GDLIN          1  65.39   65.39 989.570  < 2e-16 ***
## Residuals   1962 129.65    0.07                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

tidy_ad_aov <- broom::tidy(ad_aov)
kable(tidy_ad_aov)

term	df	sumsq	meansq	statistic	p.value
OBRAT	1	6.5879509	6.5879509	66.719936	0.0000000
BLACK	1	7.6265125	7.6265125	77.238041	0.0000000
HISPAN	1	1.9721477	1.9721477	19.973065	0.0000083
LOANPRC	1	1.9859170	1.9859170	20.112514	0.0000077
OBRAT:BLACK	1	0.2840827	0.2840827	2.877067	0.0900083
OBRAT:HISPAN	1	1.2865938	1.2865938	13.030069	0.0003142
BLACK:LOANPRC	1	0.1939820	0.1939820	1.964567	0.1611841
HISPAN:LOANPRC	1	0.2950266	0.2950266	2.987902	0.0840463
Residuals	1960	193.5311184	0.0987404	NA	NA