Exploratory Data Analysis (EDA)
rm(list=ls(all=TRUE)) # Clear all data in environment
#load Packages
library(tidyverse)
library(GGally)
library(plotly)
library(car)
library(ggplot2)
library(stargazer)
library(sandwich)
library(lmtest) # waldtest; see also coeftest.
library(psych)
library(aod)
library(Rcpp)
library(Hmisc)
library(pastecs)
library(popbio)
library(rms)
library(Hmisc)
library(kableExtra)
# Load data
datd <- read.csv("MLD Data File.csv", header = TRUE)
#Review selected data
glimpse(datd)
## Observations: 1,989
## Variables: 8
## $ MARRIED <fct> 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ GDLIN <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, ...
## $ OBRAT <dbl> 34.5, 34.1, 26.0, 37.0, 32.1, 33.0, 36.0, 37.0, 30.7, ...
## $ BLACK <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...
## $ HISPAN <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ MALE <fct> ., 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ APPROVE <int> 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, ...
## $ LOANPRC <dbl> 0.7542373, 0.8000000, 0.8951049, 0.6000000, 0.8955224,...
|
MARRIED
|
GDLIN </th>
|
OBRAT </th>
|
BLACK </th>
|
HISPAN </th>
|
MALE
|
APPROVE </th>
|
LOANPRC </th>
|
|
.: 3
|
Min. : 0.000
|
Min. : 0.00
|
Min. :0.00000
|
Min. :0.00000
|
.: 15
|
Min. :0.0000
|
Min. :0.02105
|
|
0: 678
|
1st Qu.: 1.000
|
1st Qu.:28.00
|
1st Qu.:0.00000
|
1st Qu.:0.00000
|
0: 369
|
1st Qu.:1.0000
|
1st Qu.:0.70000
|
|
1:1308
|
Median : 1.000
|
Median :33.00
|
Median :0.00000
|
Median :0.00000
|
1:1605
|
Median :1.0000
|
Median :0.80000
|
|
NA
|
Mean : 1.583
|
Mean :32.39
|
Mean :0.09904
|
Mean :0.05581
|
NA
|
Mean :0.8773
|
Mean :0.77064
|
|
NA
|
3rd Qu.: 1.000
|
3rd Qu.:37.00
|
3rd Qu.:0.00000
|
3rd Qu.:0.00000
|
NA
|
3rd Qu.:1.0000
|
3rd Qu.:0.89894
|
|
NA
|
Max. :666.000
|
Max. :95.00
|
Max. :1.00000
|
Max. :1.00000
|
NA
|
Max. :1.0000
|
Max. :2.57143
|
Data Discription
In light of the relatively small number of mortgage loan applications made by minorities, these extra variables were collected for all applications by blacks and Hispanics and for a random sample. The data set includes the following variables.
- APPROVE = 1 if mortgage loan was approved, = 0 otherwise
- GDLIN = 1 if credit history meets guidelines, = 0 otherwise
- LOANPRC = loan amount/purchase price
- OBRAT = other obligations as a percent of total income
- MALE = 1 if male, = 0 otherwise
- MARRIED = 1 if married, = 0 otherwise
- BLACK = 1 if black, = 0 otherwise
- HISPAN = 1 if Hispanic, = 0 otherwise
Remove 20 na’s obs based on above summary
#filter data
dats <-
datd %>%
filter(GDLIN != 666) %>%
filter(MALE != ".") %>%
filter(MARRIED != ".") %>%
mutate(LOANPRC = LOANPRC*100)
# Change these to factors
# dats$MARRIED<- as.factor(as.character(dats$MARRIED))
# dats$APPROVE<- as.factor(as.character(dats$APPROVE))
# dats$MALE<- as.factor(as.character(dats$MALE))
# dats$GDLIN<- as.factor(dats$GDLIN)
# dats$BLACK<- as.factor(dats$BLACK)
# dats$HISPAN<- as.factor(dats$HISPAN)
#Removed 20 ob
#Changed it to integer
dats$MARRIED<- as.integer(as.character(dats$MARRIED))
dats$MALE<- as.integer(as.character(dats$MALE))
#dats$GDLIN<- as.factor(dats$GDLIN)
#attach the file for use
attach(dats)
#Review selected data
glimpse(dats)
## Observations: 1,969
## Variables: 8
## $ MARRIED <int> 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ GDLIN <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, ...
## $ OBRAT <dbl> 34.1, 26.0, 37.0, 32.1, 33.0, 36.0, 37.0, 30.7, 49.0, ...
## $ BLACK <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...
## $ HISPAN <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ MALE <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ APPROVE <int> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, ...
## $ LOANPRC <dbl> 80.00000, 89.51049, 60.00000, 89.55224, 80.43478, 89.8...
|
MARRIED </th>
|
GDLIN </th>
|
OBRAT </th>
|
BLACK </th>
|
HISPAN </th>
|
MALE </th>
|
APPROVE </th>
|
LOANPRC </th>
|
|
Min. :0.0000
|
Min. :0.0000
|
Min. : 0.00
|
Min. :0.00000
|
Min. :0.00000
|
Min. :0.0000
|
Min. :0.0000
|
Min. : 2.105
|
|
1st Qu.:0.0000
|
1st Qu.:1.0000
|
1st Qu.:28.00
|
1st Qu.:0.00000
|
1st Qu.:0.00000
|
1st Qu.:1.0000
|
1st Qu.:1.0000
|
1st Qu.: 70.000
|
|
Median :1.0000
|
Median :1.0000
|
Median :33.00
|
Median :0.00000
|
Median :0.00000
|
Median :1.0000
|
Median :1.0000
|
Median : 80.000
|
|
Mean :0.6592
|
Mean :0.9132
|
Mean :32.39
|
Mean :0.09903
|
Mean :0.05485
|
Mean :0.8131
|
Mean :0.8761
|
Mean : 77.032
|
|
3rd Qu.:1.0000
|
3rd Qu.:1.0000
|
3rd Qu.:37.00
|
3rd Qu.:0.00000
|
3rd Qu.:0.00000
|
3rd Qu.:1.0000
|
3rd Qu.:1.0000
|
3rd Qu.: 89.888
|
|
Max. :1.0000
|
Max. :1.0000
|
Max. :95.00
|
Max. :1.00000
|
Max. :1.00000
|
Max. :1.0000
|
Max. :1.0000
|
Max. :257.143
|
Review GGpair Matrix
ggpairs(dats,lower = list(continuous="smooth"))
Review Stats of each variables
##The describe() function which is part of the Hmisc package displays the following additional statistics:
#Number of rows,#Standard deviation,#Trimmed mean,#Mean absolute deviation,#Skewness,#Kurtosis,#Standard error
describe(dats)
## dats
##
## 8 Variables 1969 Observations
## ---------------------------------------------------------------------------
## MARRIED
## n missing distinct Info Sum Mean Gmd
## 1969 0 2 0.674 1298 0.6592 0.4495
##
## ---------------------------------------------------------------------------
## GDLIN
## n missing distinct Info Sum Mean Gmd
## 1969 0 2 0.238 1798 0.9132 0.1587
##
## ---------------------------------------------------------------------------
## OBRAT
## n missing distinct Info Mean Gmd .05 .10
## 1969 0 288 0.999 32.39 8.772 18.16 22.00
## .25 .50 .75 .90 .95
## 28.00 33.00 37.00 40.52 44.00
##
## lowest : 0.00 4.00 5.60 6.99 7.00, highest: 73.00 75.00 78.00 83.00 95.00
## ---------------------------------------------------------------------------
## BLACK
## n missing distinct Info Sum Mean Gmd
## 1969 0 2 0.268 195 0.09904 0.1785
##
## ---------------------------------------------------------------------------
## HISPAN
## n missing distinct Info Sum Mean Gmd
## 1969 0 2 0.156 108 0.05485 0.1037
##
## ---------------------------------------------------------------------------
## MALE
## n missing distinct Info Sum Mean Gmd
## 1969 0 2 0.456 1601 0.8131 0.3041
##
## ---------------------------------------------------------------------------
## APPROVE
## n missing distinct Info Sum Mean Gmd
## 1969 0 2 0.326 1725 0.8761 0.2172
##
## ---------------------------------------------------------------------------
## LOANPRC
## n missing distinct Info Mean Gmd .05 .10
## 1969 0 1096 0.998 77.03 18.91 40.00 52.38
## .25 .50 .75 .90 .95
## 70.00 80.00 89.89 94.45 95.45
##
## lowest : 2.10526 8.77193 9.09091 9.33032 11.01322
## highest: 164.70590 183.33330 220.00000 255.52480 257.14290
## ---------------------------------------------------------------------------
Review Variance for anything that may jump out
##The stat.desc() function which is part of the pastecs package displays the following additional statistics:
#Variance,#Coefficient of variation,#Confidence interval for mean
kable(stat.desc(dats))
|
MARRIED
|
GDLIN
|
OBRAT
|
BLACK
|
HISPAN
|
MALE
|
APPROVE
|
LOANPRC
|
nbr.val
|
1969.0000000
|
1969.0000000
|
1.969000e+03
|
1969.0000000
|
1969.0000000
|
1969.0000000
|
1969.0000000
|
1.969000e+03
|
nbr.null
|
671.0000000
|
171.0000000
|
1.000000e+00
|
1774.0000000
|
1861.0000000
|
368.0000000
|
244.0000000
|
0.000000e+00
|
nbr.na
|
0.0000000
|
0.0000000
|
0.000000e+00
|
0.0000000
|
0.0000000
|
0.0000000
|
0.0000000
|
0.000000e+00
|
min
|
0.0000000
|
0.0000000
|
0.000000e+00
|
0.0000000
|
0.0000000
|
0.0000000
|
0.0000000
|
2.105260e+00
|
max
|
1.0000000
|
1.0000000
|
9.500000e+01
|
1.0000000
|
1.0000000
|
1.0000000
|
1.0000000
|
2.571429e+02
|
range
|
1.0000000
|
1.0000000
|
9.500000e+01
|
1.0000000
|
1.0000000
|
1.0000000
|
1.0000000
|
2.550376e+02
|
sum
|
1298.0000000
|
1798.0000000
|
6.377929e+04
|
195.0000000
|
108.0000000
|
1601.0000000
|
1725.0000000
|
1.516761e+05
|
median
|
1.0000000
|
1.0000000
|
3.300000e+01
|
0.0000000
|
0.0000000
|
1.0000000
|
1.0000000
|
8.000000e+01
|
mean
|
0.6592179
|
0.9131539
|
3.239172e+01
|
0.0990350
|
0.0548502
|
0.8131031
|
0.8760792
|
7.703204e+01
|
SE.mean
|
0.0106842
|
0.0063480
|
1.865833e-01
|
0.0067334
|
0.0051325
|
0.0087874
|
0.0074273
|
4.270414e-01
|
CI.mean.0.95
|
0.0209535
|
0.0124494
|
3.659215e-01
|
0.0132054
|
0.0100657
|
0.0172336
|
0.0145662
|
8.375009e-01
|
var
|
0.2247638
|
0.0793442
|
6.854741e+01
|
0.0892724
|
0.0518680
|
0.1520437
|
0.1086196
|
3.590755e+02
|
std.dev
|
0.4740926
|
0.2816810
|
8.279336e+00
|
0.2987849
|
0.2277454
|
0.3899278
|
0.3295748
|
1.894929e+01
|
coef.var
|
0.7191744
|
0.3084704
|
2.556004e-01
|
3.0169618
|
4.1521365
|
0.4795551
|
0.3761930
|
2.459923e-01
|
Histogtam of the Variables
#Lot more married obs
plot_ly(x = dats$MARRIED,
type = "histogram")
#Small amount of obs that desn't meet credit guideline
plot_ly(x = dats$GDLIN,
type = "histogram")
#Right Skewed with 36 in the middle
plot_ly(x = dats$OBRAT,
type = "histogram")
#Small amount of Black obs
plot_ly(x = dats$BLACK,
type = "histogram")
#Small amount of Hispanic obs
plot_ly(x = dats$HISPAN,
type = "histogram")
#Mostly Males
plot_ly(x = dats$MALE,
type = "histogram")
#Mostly Approved
plot_ly(x = dats$APPROVE,
type = "histogram")
#LOANPRC = loan amount/purchase price
#.80 is 20% of the data, .9 is 10% of the data
plot_ly(x = dats$LOANPRC,
type = "histogram")
Quantile-Quantile (QQ) Plots
# after 2 its not normal
ggplot(dats, aes(sample = LOANPRC)) +
geom_qq() +
ggtitle("Q-Q Plot for LOANPRC")
#looks normally distrubuted
ggplot(dats, aes(sample = OBRAT)) +
geom_qq() +
ggtitle("Q-Q Plot for OBRAT")
Visulize the relationship & Review Logit relationships
# print the two graph side-by-side - Approve
par(mfrow=c(1,2))
logi.hist.plot(dats$LOANPRC,dats$APPROVE,logi.mod=1, mainlabel="LoanPRC&Approve",boxp=FALSE,type="hist",col="gray")
logi.hist.plot(dats$OBRAT,dats$APPROVE,logi.mod=1, mainlabel="OBRAT&Approve",boxp=FALSE,type="hist",col="gray")
# print the two graph side-by-side - BLACK
par(mfrow=c(1,2))
logi.hist.plot(dats$LOANPRC,dats$BLACK,logi.mod=1, mainlabel="LoanPRC&BLACK",boxp=FALSE,type="hist",col="gray")
logi.hist.plot(dats$OBRAT,dats$BLACK,logi.mod=1, mainlabel="OBRAT&BLACK",boxp=FALSE,type="hist",col="gray")
# print the two graph side-by-side - Hispanic
par(mfrow=c(1,2))
logi.hist.plot(dats$LOANPRC,dats$HISPAN,logi.mod=1, mainlabel="LoanPRC&Hispanic",boxp=FALSE,type="hist",col="gray")
logi.hist.plot(dats$OBRAT,dats$HISPAN,logi.mod=1, mainlabel="OBRAT&Hispanic",boxp=FALSE,type="hist",col="gray")
Black relationshps and Logit interactions
# Loan to Purchase Ratio, Other obligations and Approval side-by-side - BLACK
par(mfrow=c(1,2))
logi.hist.plot(dats$LOANPRC,(dats$APPROVE*dats$BLACK),logi.mod=1, mainlabel="LoanPRC&Approve",boxp=FALSE,type="hist",col="gray")
logi.hist.plot(dats$OBRAT,(dats$APPROVE*dats$BLACK),logi.mod=1, mainlabel="OBRAT&Approve",boxp=FALSE,type="hist",col="gray")
# Loan to Purchase Ratio, Other obligations and Approval and meets guidline side-by-side - BLACK
par(mfrow=c(1,2))
logi.hist.plot(dats$LOANPRC,(dats$APPROVE*dats$BLACK*dats$GDLIN),logi.mod=1, mainlabel="LoanPRC&Approve",boxp=FALSE,type="hist",col="gray")
logi.hist.plot(dats$OBRAT,(dats$APPROVE*dats$BLACK*dats$GDLIN),logi.mod=1, mainlabel="OBRAT&Approve",boxp=FALSE,type="hist",col="gray")
# Loan to Purchase Ratio, Other obligations and Approval and meets guidline and male side-by-side - BLACK
par(mfrow=c(1,2))
logi.hist.plot(dats$LOANPRC,(dats$APPROVE*dats$BLACK*dats$GDLIN*dats$MALE ),logi.mod=1, mainlabel="LoanPRC&Approve",boxp=FALSE,type="hist",col="gray")
logi.hist.plot(dats$OBRAT,(dats$APPROVE*dats$BLACK*dats$GDLIN*dats$MALE),logi.mod=1, mainlabel="OBRAT&Approve",boxp=FALSE,type="hist",col="gray")
# Loan to Purchase Ratio, Other obligations and Approval and meets guidline and male and marriedside-by-side - BLACK
par(mfrow=c(1,2))
logi.hist.plot(dats$LOANPRC,(dats$APPROVE*dats$BLACK*dats$GDLIN*dats$MALE*dats$MARRIED),logi.mod=1, mainlabel="LoanPRC&Approve&",boxp=FALSE,type="hist",col="gray")
logi.hist.plot(dats$OBRAT,(dats$APPROVE*dats$BLACK*dats$GDLIN*dats$MALE*dats$MARRIED),logi.mod=1, mainlabel="OBRAT&Approve",boxp=FALSE,type="hist",col="gray")
Hispanic relationshps and Logit interactions
# Loan to Purchase Ratio, Other obligations and Approval side-by-side - Hispanic
par(mfrow=c(1,2))
logi.hist.plot(dats$LOANPRC,(dats$APPROVE*dats$HISPAN),logi.mod=1, mainlabel="LoanPRC&Approve",boxp=FALSE,type="hist",col="gray")
logi.hist.plot(dats$OBRAT,(dats$APPROVE*dats$HISPAN),logi.mod=1, mainlabel="OBRAT&Approve",boxp=FALSE,type="hist",col="gray")
# Loan to Purchase Ratio, Other obligations and Approval and meets guidline side-by-side - Hispanic
par(mfrow=c(1,2))
logi.hist.plot(dats$LOANPRC,(dats$APPROVE*dats$HISPAN*dats$GDLIN),logi.mod=1, mainlabel="LoanPRC&Approve",boxp=FALSE,type="hist",col="gray")
logi.hist.plot(dats$OBRAT,(dats$APPROVE*dats$HISPAN*dats$GDLIN),logi.mod=1, mainlabel="OBRAT&Approve",boxp=FALSE,type="hist",col="gray")
# Loan to Purchase Ratio, Other obligations and Approval and meets guidline and male side-by-side - Hispanic
par(mfrow=c(1,2))
logi.hist.plot(dats$LOANPRC,(dats$APPROVE*dats$HISPAN*dats$GDLIN*dats$MALE ),logi.mod=1, mainlabel="LoanPRC&Approve",boxp=FALSE,type="hist",col="gray")
logi.hist.plot(dats$OBRAT,(dats$APPROVE*dats$HISPAN*dats$GDLIN*dats$MALE),logi.mod=1, mainlabel="OBRAT&Approve",boxp=FALSE,type="hist",col="gray")
# Loan to Purchase Ratio, Other obligations and Approval and meets guidline and male and marriedside-by-side - Hispanic
par(mfrow=c(1,2))
logi.hist.plot(dats$LOANPRC,(dats$APPROVE*dats$HISPAN*dats$GDLIN*dats$MALE*dats$MARRIED),logi.mod=1, mainlabel="LoanPRC&Approve&",boxp=FALSE,type="hist",col="gray")
logi.hist.plot(dats$OBRAT,(dats$APPROVE*dats$HISPAN*dats$GDLIN*dats$MALE*dats$MARRIED),logi.mod=1, mainlabel="OBRAT&Approve",boxp=FALSE,type="hist",col="gray")
logit models comparison
Black only model has only one (meets guidelines) significant variable
## fit ordered logit model
glm1 <- glm(APPROVE ~ MALE + GDLIN + LOANPRC + OBRAT + MARRIED + BLACK + HISPAN, data = dats, family = binomial(link="logit"))
glm2 <- glm(APPROVE ~ GDLIN + LOANPRC + OBRAT + MARRIED + BLACK + HISPAN, data = dats, family = binomial(link="logit"))
glm3 <- glm(APPROVE ~ GDLIN + LOANPRC + OBRAT + BLACK + HISPAN, data = dats, family = binomial(link="logit"))
glmw <- glm(APPROVE ~ GDLIN + LOANPRC + OBRAT + MARRIED + BLACK + HISPAN, data = datw, family = binomial(link="logit"))
glmb <- glm(APPROVE ~ GDLIN + LOANPRC + OBRAT + MARRIED + BLACK + HISPAN, data = datb, family = binomial(link="logit"))
glmh <- glm(APPROVE ~ GDLIN + LOANPRC + OBRAT + MARRIED + BLACK + HISPAN, data = dath, family = binomial(link="logit"))
## view a summary of the Full model
summary(glm1)
##
## Call:
## glm(formula = APPROVE ~ MALE + GDLIN + LOANPRC + OBRAT + MARRIED +
## BLACK + HISPAN, family = binomial(link = "logit"), data = dats)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8933 0.2445 0.3128 0.3742 2.3261
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.381531 0.591656 2.335 0.019542 *
## MALE -0.053947 0.234573 -0.230 0.818107
## GDLIN 3.719269 0.217169 17.126 < 2e-16 ***
## LOANPRC -0.016812 0.005074 -3.313 0.000922 ***
## OBRAT -0.034074 0.010310 -3.305 0.000950 ***
## MARRIED 0.475742 0.192005 2.478 0.013221 *
## BLACK -0.815693 0.240177 -3.396 0.000683 ***
## HISPAN -0.900010 0.310585 -2.898 0.003758 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1475.4 on 1968 degrees of freedom
## Residual deviance: 959.4 on 1961 degrees of freedom
## AIC: 975.4
##
## Number of Fisher Scoring iterations: 6
#Removed Male since there is no significance
summary(glm2)
##
## Call:
## glm(formula = APPROVE ~ GDLIN + LOANPRC + OBRAT + MARRIED + BLACK +
## HISPAN, family = binomial(link = "logit"), data = dats)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8943 0.2440 0.3128 0.3748 2.3178
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.342387 0.566745 2.369 0.017856 *
## GDLIN 3.721387 0.216988 17.150 < 2e-16 ***
## LOANPRC -0.016773 0.005075 -3.305 0.000950 ***
## OBRAT -0.034098 0.010310 -3.307 0.000942 ***
## MARRIED 0.460930 0.181039 2.546 0.010896 *
## BLACK -0.811426 0.239545 -3.387 0.000706 ***
## HISPAN -0.897331 0.310374 -2.891 0.003839 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1475.43 on 1968 degrees of freedom
## Residual deviance: 959.46 on 1962 degrees of freedom
## AIC: 973.46
##
## Number of Fisher Scoring iterations: 6
# Removed Married since there is small significance
summary(glm3)
##
## Call:
## glm(formula = APPROVE ~ GDLIN + LOANPRC + OBRAT + BLACK + HISPAN,
## family = binomial(link = "logit"), data = dats)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8328 0.2539 0.3196 0.3694 2.3016
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.668049 0.552038 3.022 0.002514 **
## GDLIN 3.694612 0.215000 17.184 < 2e-16 ***
## LOANPRC -0.016585 0.005054 -3.282 0.001032 **
## OBRAT -0.035117 0.010329 -3.400 0.000674 ***
## BLACK -0.817685 0.239220 -3.418 0.000631 ***
## HISPAN -0.858277 0.309369 -2.774 0.005532 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1475.43 on 1968 degrees of freedom
## Residual deviance: 965.87 on 1963 degrees of freedom
## AIC: 977.87
##
## Number of Fisher Scoring iterations: 6
# White only data
summary(glmw)
##
## Call:
## glm(formula = APPROVE ~ GDLIN + LOANPRC + OBRAT + MARRIED + BLACK +
## HISPAN, family = binomial(link = "logit"), data = datw)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.9674 0.2348 0.3007 0.3537 2.0305
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.421414 0.665949 2.134 0.032809 *
## GDLIN 3.715685 0.255150 14.563 < 2e-16 ***
## LOANPRC -0.021921 0.005974 -3.670 0.000243 ***
## OBRAT -0.026281 0.011723 -2.242 0.024977 *
## MARRIED 0.598925 0.208713 2.870 0.004110 **
## BLACK NA NA NA NA
## HISPAN NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1026.72 on 1665 degrees of freedom
## Residual deviance: 729.19 on 1661 degrees of freedom
## AIC: 739.19
##
## Number of Fisher Scoring iterations: 6
# Black Only data
summary(glmb)
##
## Call:
## glm(formula = APPROVE ~ GDLIN + LOANPRC + OBRAT + MARRIED + BLACK +
## HISPAN, family = binomial(link = "logit"), data = datb)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1675 -0.4442 0.4628 0.5179 2.3954
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.059865 1.551987 -0.683 0.495
## GDLIN 4.034585 0.511059 7.895 2.91e-15 ***
## LOANPRC 0.003034 0.013786 0.220 0.826
## OBRAT -0.037604 0.029360 -1.281 0.200
## MARRIED 0.127730 0.467598 0.273 0.785
## BLACK NA NA NA NA
## HISPAN NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 246.83 on 194 degrees of freedom
## Residual deviance: 139.52 on 190 degrees of freedom
## AIC: 149.52
##
## Number of Fisher Scoring iterations: 5
# Hispanic only data
summary(glmh)
##
## Call:
## glm(formula = APPROVE ~ GDLIN + LOANPRC + OBRAT + MARRIED + BLACK +
## HISPAN, family = binomial(link = "logit"), data = dath)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8283 0.1451 0.3957 0.5731 1.8769
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.04265 2.52380 1.998 0.04571 *
## GDLIN 3.12876 0.76075 4.113 3.91e-05 ***
## LOANPRC -0.02755 0.02160 -1.276 0.20198
## OBRAT -0.10490 0.03873 -2.709 0.00675 **
## MARRIED -0.41226 0.68384 -0.603 0.54661
## BLACK NA NA NA NA
## HISPAN NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 119.217 on 107 degrees of freedom
## Residual deviance: 79.134 on 103 degrees of freedom
## AIC: 89.134
##
## Number of Fisher Scoring iterations: 5
#Compare the full data set
anova(glm1, glm2, glm3)
#Compare all the modles
stargazer(glm1, glm2, glm3, glmw, glmb, glmh, type = "html",
column.labels = c("full Data", "white only",
"black only", "hispanic only"), column.separate = c(3, 1,1,1))
|
|
Dependent variable:
|
|
|
|
APPROVE
|
|
full Data
|
white only
|
black only
|
hispanic only
|
|
(1)
|
(2)
|
(3)
|
(4)
|
(5)
|
(6)
|
|
MALE
|
-0.054
|
|
|
|
|
|
|
(0.235)
|
|
|
|
|
|
|
|
|
|
|
|
|
GDLIN
|
3.719***
|
3.721***
|
3.695***
|
3.716***
|
4.035***
|
3.129***
|
|
(0.217)
|
(0.217)
|
(0.215)
|
(0.255)
|
(0.511)
|
(0.761)
|
|
|
|
|
|
|
|
LOANPRC
|
-0.017***
|
-0.017***
|
-0.017***
|
-0.022***
|
0.003
|
-0.028
|
|
(0.005)
|
(0.005)
|
(0.005)
|
(0.006)
|
(0.014)
|
(0.022)
|
|
|
|
|
|
|
|
OBRAT
|
-0.034***
|
-0.034***
|
-0.035***
|
-0.026**
|
-0.038
|
-0.105***
|
|
(0.010)
|
(0.010)
|
(0.010)
|
(0.012)
|
(0.029)
|
(0.039)
|
|
|
|
|
|
|
|
MARRIED
|
0.476**
|
0.461**
|
|
0.599***
|
0.128
|
-0.412
|
|
(0.192)
|
(0.181)
|
|
(0.209)
|
(0.468)
|
(0.684)
|
|
|
|
|
|
|
|
BLACK
|
-0.816***
|
-0.811***
|
-0.818***
|
|
|
|
|
(0.240)
|
(0.240)
|
(0.239)
|
|
|
|
|
|
|
|
|
|
|
HISPAN
|
-0.900***
|
-0.897***
|
-0.858***
|
|
|
|
|
(0.311)
|
(0.310)
|
(0.309)
|
|
|
|
|
|
|
|
|
|
|
Constant
|
1.382**
|
1.342**
|
1.668***
|
1.421**
|
-1.060
|
5.043**
|
|
(0.592)
|
(0.567)
|
(0.552)
|
(0.666)
|
(1.552)
|
(2.524)
|
|
|
|
|
|
|
|
|
Observations
|
1,969
|
1,969
|
1,969
|
1,666
|
195
|
108
|
Log Likelihood
|
-479.702
|
-479.728
|
-482.936
|
-364.596
|
-69.759
|
-39.567
|
Akaike Inf. Crit.
|
975.403
|
973.456
|
977.873
|
739.191
|
149.518
|
89.134
|
|
Note:
|
p<0.1; p<0.05; p<0.01
|
Odds Ratio for all 6 models
OR <- function(x) exp(x)
stargazer(glm1, glm2, glm3, type="html", apply.coef = OR,
title = "Odds Ratio with full data set")
Odds Ratio with full data set
|
|
Dependent variable:
|
|
|
|
APPROVE
|
|
(1)
|
(2)
|
(3)
|
|
MALE
|
0.947***
|
|
|
|
(0.235)
|
|
|
|
|
|
|
GDLIN
|
41.234***
|
41.322***
|
40.230***
|
|
(0.217)
|
(0.217)
|
(0.215)
|
|
|
|
|
LOANPRC
|
0.983***
|
0.983***
|
0.984***
|
|
(0.005)
|
(0.005)
|
(0.005)
|
|
|
|
|
OBRAT
|
0.967***
|
0.966***
|
0.965***
|
|
(0.010)
|
(0.010)
|
(0.010)
|
|
|
|
|
MARRIED
|
1.609***
|
1.586***
|
|
|
(0.192)
|
(0.181)
|
|
|
|
|
|
BLACK
|
0.442*
|
0.444*
|
0.441*
|
|
(0.240)
|
(0.240)
|
(0.239)
|
|
|
|
|
HISPAN
|
0.407
|
0.408
|
0.424
|
|
(0.311)
|
(0.310)
|
(0.309)
|
|
|
|
|
Constant
|
3.981***
|
3.828***
|
5.302***
|
|
(0.592)
|
(0.567)
|
(0.552)
|
|
|
|
|
|
Observations
|
1,969
|
1,969
|
1,969
|
Log Likelihood
|
-479.702
|
-479.728
|
-482.936
|
Akaike Inf. Crit.
|
975.403
|
973.456
|
977.873
|
|
Note:
|
p<0.1; p<0.05; p<0.01
|
stargazer(glmw, glmb, glmh, type="html",
title = "Odds Ratio with different ethnicity",
apply.coef = OR, column.labels = c("white only",
"black only", "hispanic only"))
Odds Ratio with different ethnicity
|
|
Dependent variable:
|
|
|
|
APPROVE
|
|
white only
|
black only
|
hispanic only
|
|
(1)
|
(2)
|
(3)
|
|
GDLIN
|
41.087***
|
56.519***
|
22.846***
|
|
(0.255)
|
(0.511)
|
(0.761)
|
|
|
|
|
LOANPRC
|
0.978***
|
1.003***
|
0.973***
|
|
(0.006)
|
(0.014)
|
(0.022)
|
|
|
|
|
OBRAT
|
0.974***
|
0.963***
|
0.900***
|
|
(0.012)
|
(0.029)
|
(0.039)
|
|
|
|
|
MARRIED
|
1.820***
|
1.136**
|
0.662
|
|
(0.209)
|
(0.468)
|
(0.684)
|
|
|
|
|
BLACK
|
|
|
|
|
|
|
|
|
|
|
|
HISPAN
|
|
|
|
|
|
|
|
|
|
|
|
Constant
|
4.143***
|
0.347
|
154.881***
|
|
(0.666)
|
(1.552)
|
(2.524)
|
|
|
|
|
|
Observations
|
1,666
|
195
|
108
|
Log Likelihood
|
-364.596
|
-69.759
|
-39.567
|
Akaike Inf. Crit.
|
739.191
|
149.518
|
89.134
|
|
Note:
|
p<0.1; p<0.05; p<0.01
|
stargazer(glm1, glm2, glm3, glmw, glmb, glmh, type="html",
title = "Logistic Regression with t stas for significance",
report = "vct*", column.labels = c("full Data", "white only",
"black only", "hispanic only"),
column.separate = c(3, 1,1,1))
Logistic Regression with t stas for significance
|
|
Dependent variable:
|
|
|
|
APPROVE
|
|
full Data
|
white only
|
black only
|
hispanic only
|
|
(1)
|
(2)
|
(3)
|
(4)
|
(5)
|
(6)
|
|
MALE
|
-0.054
|
|
|
|
|
|
|
t = -0.230
|
|
|
|
|
|
|
|
|
|
|
|
|
GDLIN
|
3.719
|
3.721
|
3.695
|
3.716
|
4.035
|
3.129
|
|
t = 17.126***
|
t = 17.150***
|
t = 17.184***
|
t = 14.563***
|
t = 7.895***
|
t = 4.113***
|
|
|
|
|
|
|
|
LOANPRC
|
-0.017
|
-0.017
|
-0.017
|
-0.022
|
0.003
|
-0.028
|
|
t = -3.313***
|
t = -3.305***
|
t = -3.282***
|
t = -3.670***
|
t = 0.220
|
t = -1.276
|
|
|
|
|
|
|
|
OBRAT
|
-0.034
|
-0.034
|
-0.035
|
-0.026
|
-0.038
|
-0.105
|
|
t = -3.305***
|
t = -3.307***
|
t = -3.400***
|
t = -2.242**
|
t = -1.281
|
t = -2.709***
|
|
|
|
|
|
|
|
MARRIED
|
0.476
|
0.461
|
|
0.599
|
0.128
|
-0.412
|
|
t = 2.478**
|
t = 2.546**
|
|
t = 2.870***
|
t = 0.273
|
t = -0.603
|
|
|
|
|
|
|
|
BLACK
|
-0.816
|
-0.811
|
-0.818
|
|
|
|
|
t = -3.396***
|
t = -3.387***
|
t = -3.418***
|
|
|
|
|
|
|
|
|
|
|
HISPAN
|
-0.900
|
-0.897
|
-0.858
|
|
|
|
|
t = -2.898***
|
t = -2.891***
|
t = -2.774***
|
|
|
|
|
|
|
|
|
|
|
Constant
|
1.382
|
1.342
|
1.668
|
1.421
|
-1.060
|
5.043
|
|
t = 2.335**
|
t = 2.369**
|
t = 3.022***
|
t = 2.134**
|
t = -0.683
|
t = 1.998**
|
|
|
|
|
|
|
|
|
Observations
|
1,969
|
1,969
|
1,969
|
1,666
|
195
|
108
|
Log Likelihood
|
-479.702
|
-479.728
|
-482.936
|
-364.596
|
-69.759
|
-39.567
|
Akaike Inf. Crit.
|
975.403
|
973.456
|
977.873
|
739.191
|
149.518
|
89.134
|
|
Note:
|
p<0.1; p<0.05; p<0.01
|
stargazer(
exp(cbind(OR = coef(glm1), confint(glm1))),
exp(cbind(OR = coef(glm2), confint(glm2))),
exp(cbind(OR = coef(glm3), confint(glm3))),
exp(cbind(OR = coef(glmw), confint(glmw))),
exp(cbind(OR = coef(glmb), confint(glmb))),
exp(cbind(OR = coef(glmh), confint(glmh))),
type="html")
|
|
OR
|
2.5 %
|
97.5 %
|
|
(Intercept)
|
3.981
|
1.255
|
12.840
|
MALE
|
0.947
|
0.592
|
1.488
|
GDLIN
|
41.234
|
27.194
|
63.810
|
LOANPRC
|
0.983
|
0.973
|
0.993
|
OBRAT
|
0.967
|
0.947
|
0.986
|
MARRIED
|
1.609
|
1.103
|
2.343
|
BLACK
|
0.442
|
0.278
|
0.715
|
HISPAN
|
0.407
|
0.226
|
0.766
|
|
|
|
OR
|
2.5 %
|
97.5 %
|
|
(Intercept)
|
3.828
|
1.267
|
11.772
|
GDLIN
|
41.322
|
27.261
|
63.923
|
LOANPRC
|
0.983
|
0.973
|
0.993
|
OBRAT
|
0.966
|
0.947
|
0.986
|
MARRIED
|
1.586
|
1.111
|
2.261
|
BLACK
|
0.444
|
0.280
|
0.717
|
HISPAN
|
0.408
|
0.227
|
0.768
|
|
|
|
OR
|
2.5 %
|
97.5 %
|
|
(Intercept)
|
5.302
|
1.811
|
15.872
|
GDLIN
|
40.230
|
26.637
|
61.975
|
LOANPRC
|
0.984
|
0.974
|
0.993
|
OBRAT
|
0.965
|
0.946
|
0.985
|
BLACK
|
0.441
|
0.278
|
0.712
|
HISPAN
|
0.424
|
0.236
|
0.797
|
|
|
|
OR
|
2.5 %
|
97.5 %
|
|
(Intercept)
|
4.143
|
1.126
|
15.466
|
GDLIN
|
41.087
|
25.193
|
68.672
|
LOANPRC
|
0.978
|
0.967
|
0.990
|
OBRAT
|
0.974
|
0.952
|
0.997
|
MARRIED
|
1.820
|
1.208
|
2.744
|
BLACK
|
|
|
|
HISPAN
|
|
|
|
|
|
|
OR
|
2.5 %
|
97.5 %
|
|
(Intercept)
|
0.347
|
0.014
|
5.933
|
GDLIN
|
56.519
|
22.228
|
168.802
|
LOANPRC
|
1.003
|
0.981
|
1.034
|
OBRAT
|
0.963
|
0.908
|
1.019
|
MARRIED
|
1.136
|
0.446
|
2.844
|
BLACK
|
|
|
|
HISPAN
|
|
|
|
|
|
|
OR
|
2.5 %
|
97.5 %
|
|
(Intercept)
|
154.881
|
1.327
|
30,628.190
|
GDLIN
|
22.846
|
5.732
|
121.709
|
LOANPRC
|
0.973
|
0.929
|
1.012
|
OBRAT
|
0.900
|
0.828
|
0.967
|
MARRIED
|
0.662
|
0.156
|
2.382
|
BLACK
|
|
|
|
HISPAN
|
|
|
|
|
Likelihood Ratio Test
##
## Model 1: APPROVE ~ GDLIN + LOANPRC + OBRAT + MARRIED + BLACK + HISPAN
## Model 2: APPROVE ~ MALE + GDLIN + LOANPRC + OBRAT + MARRIED + BLACK +
## HISPAN
##
## L.R. Chisq d.f. P
## 0.05313217 1.00000000 0.81770003
LR.s.1 <- glm2$deviance - glm1$deviance
LR.s.2 <- -2*logLik(glm2)[1] - (-2*logLik(glm1)[1])
pchisq(LR.s.1, 1, lower.tail = FALSE)
## [1] 0.8177
pchisq(LR.s.1, 2, lower.tail = FALSE)
## [1] 0.9737837
## Logistic Regression Model
##
## lrm(formula = glm2)
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 1969 LR chi2 515.97 R2 0.437 C 0.846
## 0 244 d.f. 6 g 1.252 Dxy 0.692
## 1 1725 Pr(> chi2) <0.0001 gr 3.498 gamma 0.693
## max |deriv| 8e-10 gp 0.147 tau-a 0.150
## Brier 0.066
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept 1.3424 0.5667 2.37 0.0179
## GDLIN 3.7214 0.2170 17.15 <0.0001
## LOANPRC -0.0168 0.0051 -3.31 0.0009
## OBRAT -0.0341 0.0103 -3.31 0.0009
## MARRIED 0.4609 0.1810 2.55 0.0109
## BLACK -0.8114 0.2395 -3.39 0.0007
## HISPAN -0.8973 0.3104 -2.89 0.0038
##
## Logistic Regression Model
##
## lrm(formula = glm1)
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 1969 LR chi2 516.03 R2 0.437 C 0.846
## 0 244 d.f. 7 g 1.253 Dxy 0.693
## 1 1725 Pr(> chi2) <0.0001 gr 3.501 gamma 0.693
## max |deriv| 8e-10 gp 0.147 tau-a 0.150
## Brier 0.066
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept 1.3815 0.5917 2.34 0.0195
## MALE -0.0539 0.2346 -0.23 0.8181
## GDLIN 3.7193 0.2172 17.13 <0.0001
## LOANPRC -0.0168 0.0051 -3.31 0.0009
## OBRAT -0.0341 0.0103 -3.31 0.0009
## MARRIED 0.4757 0.1920 2.48 0.0132
## BLACK -0.8157 0.2402 -3.40 0.0007
## HISPAN -0.9000 0.3106 -2.90 0.0038
##
Visulizing Model Fit Analysis
- Model with male and married committed seems to be the best fit.
#FUll Model
par(mfrow=c(2,2)) # init 4 charts in 1 panel
plot(glm1)
#Ommited Gender
par(mfrow=c(2,2)) # init 4 charts in 1 panel
plot(glm2)
#Ommited Gender and Married
par(mfrow=c(2,2)) # init 4 charts in 1 panel
plot(glm3)
Correctly classified observations
###Correctly classified observations
#all the same
stargazer(mean((glm1$fitted.values>=0.5)==dats$APPROVE),
mean((glm2$fitted.values>=0.5)==dats$APPROVE),
mean((glm3$fitted.values>=0.5)==dats$APPROVE), type = "text")
===== 0.924 —–
===== 0.924 —–
===== 0.924 —–
Confusion matrix
Same outcome
###Confusion matrix count
#FUll Model
RP=sum((glm1$fitted.values>=0.5)==dats$APPROVE & dats$APPROVE==1)
FP=sum((glm1$fitted.values>=0.5)!=dats$APPROVE & dats$APPROVE==0)
RN=sum((glm1$fitted.values>=0.5)==dats$APPROVE & dats$APPROVE==0)
FN=sum((glm1$fitted.values>=0.5)!=dats$APPROVE & dats$APPROVE==1)
confMat1<-matrix(c(RP,FP,FN,RN),ncol = 2)
colnames(confMat1)<-c("Pred Approved","Pred not Approved")
rownames(confMat1)<-c("Real Approved","Real not Approved")
#Ommited Gender
RP=sum((glm2$fitted.values>=0.5)==dats$APPROVE & dats$APPROVE==1)
FP=sum((glm2$fitted.values>=0.5)!=dats$APPROVE & dats$APPROVE==0)
RN=sum((glm2$fitted.values>=0.5)==dats$APPROVE & dats$APPROVE==0)
FN=sum((glm2$fitted.values>=0.5)!=dats$APPROVE & dats$APPROVE==1)
confMat2<-matrix(c(RP,FP,FN,RN),ncol = 2)
colnames(confMat2)<-c("Pred Approved","Pred not Approved")
rownames(confMat2)<-c("Real Approved","Real not Approved")
#Ommited Gender and Married
RP=sum((glm3$fitted.values>=0.5)==dats$APPROVE & dats$APPROVE==1)
FP=sum((glm3$fitted.values>=0.5)!=dats$APPROVE & dats$APPROVE==0)
RN=sum((glm3$fitted.values>=0.5)==dats$APPROVE & dats$APPROVE==0)
FN=sum((glm3$fitted.values>=0.5)!=dats$APPROVE & dats$APPROVE==1)
confMat3<-matrix(c(RP,FP,FN,RN),ncol = 2)
colnames(confMat3)<-c("Pred Approved","Pred not Approved")
rownames(confMat3)<-c("Real Approved","Real not Approved")
###Confusion matrix count for the 3 Models
kable(
rbind(
confMat1,
confMat2,
confMat3))
|
Pred Approved
|
Pred not Approved
|
Real Approved
|
1686
|
39
|
Real not Approved
|
111
|
133
|
Real Approved
|
1686
|
39
|
Real not Approved
|
111
|
133
|
Real Approved
|
1686
|
39
|
Real not Approved
|
111
|
133
|
###Confusion matrix proportion
#FUll Model
RPR=RP/sum(dats$APPROVE==1)*100
FPR=FP/sum(dats$APPROVE==0)*100
RNR=RN/sum(dats$APPROVE==0)*100
FNR=FN/sum(dats$APPROVE==1)*100
confMat1<-matrix(c(RPR,FPR,FNR,RNR),ncol = 2)
colnames(confMat1)<-c("Pred Approved","Pred not Approved")
rownames(confMat1)<-c("Real Approved","Real not Approved")
#Ommited Gender
RPR=RP/sum(dats$APPROVE==1)*100
FPR=FP/sum(dats$APPROVE==0)*100
RNR=RN/sum(dats$APPROVE==0)*100
FNR=FN/sum(dats$APPROVE==1)*100
confMat2<-matrix(c(RPR,FPR,FNR,RNR),ncol = 2)
colnames(confMat2)<-c("Pred Approved","Pred not Approved")
rownames(confMat2)<-c("Real Approved","Real not Approved")
#Ommited Gender and Married
RPR=RP/sum(dats$APPROVE==1)*100
FPR=FP/sum(dats$APPROVE==0)*100
RNR=RN/sum(dats$APPROVE==0)*100
FNR=FN/sum(dats$APPROVE==1)*100
confMat3<-matrix(c(RPR,FPR,FNR,RNR),ncol = 2)
colnames(confMat3)<-c("Pred Approved","Pred not Approved")
rownames(confMat3)<-c("Real Approved","Real not Approved")
###Confusion matrix proportion
kable(
rbind(
confMat1,
confMat2,
confMat3))
|
Pred Approved
|
Pred not Approved
|
Real Approved
|
97.73913
|
2.26087
|
Real not Approved
|
45.49180
|
54.50820
|
Real Approved
|
97.73913
|
2.26087
|
Real not Approved
|
45.49180
|
54.50820
|
Real Approved
|
97.73913
|
2.26087
|
Real not Approved
|
45.49180
|
54.50820
|
Prototypical Analysis
Both Probit and Logit prediction shows that Hispanics and Blacks are less likely to get approval to loans when compared to whites
#Estimate Logit Model
LogitModel = glm(APPROVE ~ OBRAT + BLACK + HISPAN, data = dats,
family = "binomial")
summary(LogitModel)
##
## Call:
## glm(formula = APPROVE ~ OBRAT + BLACK + HISPAN, family = "binomial",
## data = dats)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7549 0.3498 0.4226 0.4828 1.6930
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.164374 0.306121 13.604 < 2e-16 ***
## OBRAT -0.056052 0.008426 -6.652 2.89e-11 ***
## BLACK -1.458365 0.178007 -8.193 2.55e-16 ***
## HISPAN -1.086268 0.245317 -4.428 9.51e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1475.4 on 1968 degrees of freedom
## Residual deviance: 1347.0 on 1965 degrees of freedom
## AIC: 1355
##
## Number of Fisher Scoring iterations: 5
#Generate Odds Ratios
exp(coef(LogitModel))
## (Intercept) OBRAT BLACK HISPAN
## 64.3523600 0.9454897 0.2326164 0.3374738
#Define prototypical loan applicants (you will need more than 3)
prototype1 <- data.frame(OBRAT=mean(dats$OBRAT),BLACK = 1, HISPAN = 0)
prototype2 <- data.frame(OBRAT=mean(dats$OBRAT),BLACK = 0, HISPAN = 1)
prototype3 <- data.frame(OBRAT=mean(dats$OBRAT),BLACK = 0, HISPAN = 0)
#Predict probabilities for prototypical individuals
prototype1$predictedprob <- predict (LogitModel, newdata = prototype1, type ="response")
prototype2$predictedprob <- predict (LogitModel, newdata = prototype2, type ="response")
prototype3$predictedprob <- predict (LogitModel, newdata = prototype3, type ="response")
#logit Probability
kable(rbind( prototype1,
prototype2,
prototype3))
OBRAT
|
BLACK
|
HISPAN
|
predictedprob
|
32.39172
|
1
|
0
|
0.7089684
|
32.39172
|
0
|
1
|
0.7794520
|
32.39172
|
0
|
0
|
0.9128343
|
#Estimate Probit Model
ProbitModel = glm(APPROVE ~ OBRAT + BLACK + HISPAN, data = dats,
family = "binomial" (link = "probit"))
summary(ProbitModel)
##
## Call:
## glm(formula = APPROVE ~ OBRAT + BLACK + HISPAN, family = binomial(link = "probit"),
## data = dats)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8082 0.3507 0.4264 0.4875 1.4748
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.264355 0.161312 14.037 < 2e-16 ***
## OBRAT -0.028262 0.004583 -6.166 7.00e-10 ***
## BLACK -0.823087 0.103822 -7.928 2.23e-15 ***
## HISPAN -0.588310 0.140902 -4.175 2.98e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1475.4 on 1968 degrees of freedom
## Residual deviance: 1350.3 on 1965 degrees of freedom
## AIC: 1358.3
##
## Number of Fisher Scoring iterations: 5
#Predict probabilities for prototypical individuals
prototype1$predictedprob <- predict (ProbitModel, newdata = prototype1, type ="response")
prototype2$predictedprob <- predict (ProbitModel, newdata = prototype2, type ="response")
prototype3$predictedprob <- predict (ProbitModel, newdata = prototype3, type ="response")
# Probit Probability
kable(rbind(prototype1,
prototype2,
prototype3))
OBRAT
|
BLACK
|
HISPAN
|
predictedprob
|
32.39172
|
1
|
0
|
0.7004903
|
32.39172
|
0
|
1
|
0.7765483
|
32.39172
|
0
|
0
|
0.9113151
|
#Impose appropriate sample selection criteria here
MLDsubsample <- subset(dats, LOANPRC >= 50 & MALE != "." & MARRIED != "." & (GDLIN == 1 | GDLIN == 0))
#Estimate Logit Model (I am not necessarily recommending this model -- this is purely illustrative)
LogitModel <- glm(APPROVE ~ OBRAT + GDLIN + LOANPRC + MALE + MARRIED + BLACK + HISPAN, data = MLDsubsample,
family = "binomial")
summary(LogitModel)
##
## Call:
## glm(formula = APPROVE ~ OBRAT + GDLIN + LOANPRC + MALE + MARRIED +
## BLACK + HISPAN, family = "binomial", data = MLDsubsample)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8093 0.2653 0.3238 0.3836 2.3226
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.348569 0.633915 2.127 0.033390 *
## OBRAT -0.033061 0.010452 -3.163 0.001561 **
## GDLIN 3.708162 0.217682 17.035 < 2e-16 ***
## LOANPRC -0.016723 0.005637 -2.967 0.003009 **
## MALE -0.068301 0.237847 -0.287 0.773987
## MARRIED 0.473623 0.193889 2.443 0.014576 *
## BLACK -0.820291 0.241013 -3.404 0.000665 ***
## HISPAN -0.812085 0.316951 -2.562 0.010402 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1421.09 on 1812 degrees of freedom
## Residual deviance: 931.16 on 1805 degrees of freedom
## AIC: 947.16
##
## Number of Fisher Scoring iterations: 5
#Generate Log-Likelihood
logLik(LogitModel)
## 'log Lik.' -465.5778 (df=8)