Executive Summary

This is an econometric analysis of the earnings premiums for educational attainment, by gender, for roughly 30,000 full-time workers. It uses data from the 2011 American Community Survey (ACS) to answer two questions:

  1. How do the earnings of full-time workers vary at different levels of educational attainment?
  2. How does the earnings premium for educational attainment vary by gender?

Code header

# Course: ECON 5300
# Title: Team Project on ACS Labor Market
# Purpose: Data wrangling
# Data: ACS Data for LMA Project-2.csv
# Date: Jan 11, 2018
# Author: Afsar Ali
# The workspace is intentionally NOT wiped here. rm(list = ls()) only removes
# objects (attached packages, attached data frames and options survive), so it
# does not give a clean session, and it destroys the workspace of anyone who
# sources this file. Run the analysis in a fresh R session instead.

# The tidyverse package contains ggplot2, tibble, tidyr, readr, purrr, and dplyr among others
library(tidyverse)
# The gridExtra package contains grid.arrange function used to combine plots
#library(gridExtra)
library(GGally)
#library(knitr)
#library(htmlTable)
#library(kableExtra)
#library(stringr)
# Package used for making interactive plots
library(plotly)
library(car)
library(ggplot2)
library(stargazer)
library(car)
library(olsrr)
library(gvlma)
library(MASS)
#library(flexdashboard)

# Read the raw ACS extract; the first row of the CSV holds the column names
datd <- read.csv(file = "ACS Data for LMA Project-2.csv", header = TRUE)

# Build the analysis sample in a single filter() call. Comma-separated
# conditions are combined with AND, so this keeps exactly the rows that pass
# every earnings, hours, weeks-worked and race restriction below.
dat <- filter(
  datd,
  Earnings.Past.12.Months >= 10000,
  Usual.Weekly.Hours >= 35,
  Worked.40..Weeks.During.Past.12.Months == 1,
  Biracial == 0,
  Hawaiian.or.Pacific.Islander == 0,
  Other.Race == 0,
  American.Indian.or.Native.American == 0
)

# Drop the unwanted columns (selected by position in the raw file)
dat <- dat[, -c(15, 18, 20, 21, 22, 25, 26)]

#attach the file for use
# NOTE(review): attach() puts the columns of dat on the search path. Later
# calls such as hist(Age) and plot_ly(x = Race.Ethnicity) rely on this to
# resolve bare column names. attach() works on a copy, so columns added to dat
# afterwards (e.g. dat$Age.Squared) are not reachable as bare names.
attach(dat)
# Compact overview of the wrangled data: one line per column with its type
# and first few values
glimpse(dat)
## Observations: 30,149
## Variables: 19
## $ Age                       <int> 58, 45, 54, 44, 48, 47, 58, 60, 53, ...
## $ Earnings.Past.12.Months   <int> 43500, 23200, 61000, 13000, 30500, 2...
## $ Usual.Weekly.Hours        <int> 37, 35, 40, 40, 45, 56, 50, 39, 40, ...
## $ Female                    <int> 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, ...
## $ Married                   <int> 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, ...
## $ No.High.School.Degree     <int> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, ...
## $ High.School.Degree.or.GED <int> 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, ...
## $ Some.College              <int> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...
## $ Associates.Degree         <int> 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ Bachelors.Degree          <int> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...
## $ Masters.Degree            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ Professional.Degree       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ Doctorate                 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ Educational.Attainment    <fct> Associates Degree, High School Degre...
## $ White                     <int> 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, ...
## $ Black                     <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
## $ Asian                     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...
## $ Hispanic                  <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...
## $ Race.Ethnicity            <fct> White, Hispanic, White, White, Black...
# Per-column summary: min/quartiles/mean/max for numeric columns and level
# counts for the factor columns
summary(dat)
##       Age        Earnings.Past.12.Months Usual.Weekly.Hours
##  Min.   :18.00   Min.   : 10000          Min.   :35.00     
##  1st Qu.:33.00   1st Qu.: 28000          1st Qu.:40.00     
##  Median :44.00   Median : 42000          Median :40.00     
##  Mean   :43.24   Mean   : 56241          Mean   :43.89     
##  3rd Qu.:53.00   3rd Qu.: 65000          3rd Qu.:45.00     
##  Max.   :64.00   Max.   :577000          Max.   :99.00     
##                                                            
##      Female          Married       No.High.School.Degree
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00000      
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000      
##  Median :0.0000   Median :1.0000   Median :0.00000      
##  Mean   :0.4457   Mean   :0.6274   Mean   :0.07025      
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.00000      
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.00000      
##                                                         
##  High.School.Degree.or.GED  Some.College    Associates.Degree
##  Min.   :0.0000            Min.   :0.0000   Min.   :0.0000   
##  1st Qu.:0.0000            1st Qu.:0.0000   1st Qu.:0.0000   
##  Median :0.0000            Median :0.0000   Median :0.0000   
##  Mean   :0.2484            Mean   :0.2195   Mean   :0.0938   
##  3rd Qu.:0.0000            3rd Qu.:0.0000   3rd Qu.:0.0000   
##  Max.   :1.0000            Max.   :1.0000   Max.   :1.0000   
##                                                              
##  Bachelors.Degree Masters.Degree    Professional.Degree   Doctorate      
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.00000     Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000     1st Qu.:0.00000  
##  Median :0.0000   Median :0.00000   Median :0.00000     Median :0.00000  
##  Mean   :0.2274   Mean   :0.09914   Mean   :0.02554     Mean   :0.01589  
##  3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.00000     3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.00000     Max.   :1.00000  
##                                                                          
##            Educational.Attainment     White            Black        
##  High School Degree   :7490       Min.   :0.0000   Min.   :0.00000  
##  Bachelors Degree     :6857       1st Qu.:0.0000   1st Qu.:0.00000  
##  Some College         :6618       Median :1.0000   Median :0.00000  
##  Masters Degree       :2989       Mean   :0.7276   Mean   :0.09221  
##  Associates Degree    :2828       3rd Qu.:1.0000   3rd Qu.:0.00000  
##  No High School Degree:2118       Max.   :1.0000   Max.   :1.00000  
##  (Other)              :1249                                         
##      Asian            Hispanic       Race.Ethnicity 
##  Min.   :0.00000   Min.   :0.0000   White   :21937  
##  1st Qu.:0.00000   1st Qu.:0.0000   Hispanic: 3788  
##  Median :0.00000   Median :0.0000   Black   : 2780  
##  Mean   :0.05453   Mean   :0.1256   Asian   : 1644  
##  3rd Qu.:0.00000   3rd Qu.:0.0000   Biracial:    0  
##  Max.   :1.00000   Max.   :1.0000   Hawaiian:    0  
##                                     (Other) :    0

Data Summary Analysis

# Split the sample into four race-by-gender groups.
# White and Female are 0/1 indicators, so each row lands in exactly one group.
white_male <- filter(dat, White == 1, Female == 0)

white_female <- filter(dat, White == 1, Female == 1)

nonwhite_male <- filter(dat, White == 0, Female == 0)

nonwhite_female <- filter(dat, White == 0, Female == 1)

#Number of observations in each group (row counts, not average earnings; they sum to 30,149):
#White_male - 12336
#Nonwhite_male - 4376
#White_female - 9601
#Nonwhite_female - 3836

Histogram of some selected Variables

- Age: There do not appear to be any outliers.
- Race: Mostly White.
- Education: Varies; High School Degree is the most common level. Consider omitting No High School Degree so that it serves as the reference (baseline) dummy variable.

# Histograms of selected variables.
# Every column is looked up explicitly in dat via with(), so the plots do not
# depend on the attach()ed snapshot of the data. with() keeps the bare column
# name in the call, so default titles and axis labels are unchanged.
with(dat, hist(Age))

# Most common age is between 45 and 55.
# Very few under 20.
# Between 25 and 55 is fairly evenly represented.
with(dat, hist(Earnings.Past.12.Months, breaks = seq(0, 600000, by = 25000)))

# Highly right skewed.
# Majority of occurrences are under 100,000.
# Most common frequency is between 25,000 and 50,000.
# Highest earners are making about 577,000 (the maximum reported by summary(dat)).
with(dat, hist(Usual.Weekly.Hours, breaks = seq(35, 100, by = 1)))

# Highly right skewed.
# Most common frequency is 40 hours.
# Largest amount of hours worked in a week is 99.
with(dat, hist(Female)) #Histogram of gender

# There are more males than females in the data set, but not by a huge amount.
with(dat, hist(Married)) #Histogram of married

# There are many more married than unmarried workers in the data set.
with(dat, plot_ly(x = Race.Ethnicity,
                  type = "histogram"))
[Plotly histogram of Race.Ethnicity. x-axis categories: Asian, Biracial, Black, Hawaiian, Hispanic, Native American, Other Race, White; y-axis ticks: 0, 5k, 10k, 15k, 20k]
# Interactive histogram of educational attainment; the column is resolved
# from dat explicitly rather than through attach()
with(dat, plot_ly(x = Educational.Attainment,
                  type = "histogram"))
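[Plotly histogram of Educational.Attainment. x-axis categories: Associates Degree, Bachelors Degree, Doctorate, High School Degree, Masters Degree, No High School Degree, Professional Degree, Some College; y-axis ticks: 0, 1000, 2000, 3000, 4000, 5000, 6000, 7000]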

Bar Plots of categorical data

# Horizontal bar chart: number of observations per educational attainment level
ggplot(dat, aes(x = Educational.Attainment)) +
  geom_bar() +
  coord_flip()

# Horizontal bar chart: number of observations per race/ethnicity category
ggplot(dat, aes(x = Race.Ethnicity)) +
  geom_bar() +
  coord_flip()

Descriptive Stats

# HTML table of descriptive statistics (N, mean, SD, min, quartiles, max),
# rounded to 2 decimals; the rendered table lists only the numeric columns
stargazer(dat, type = "html", title="Descriptive statistics", digits=2)
Descriptive statistics
Statistic N Mean St. Dev. Min Pctl(25) Pctl(75) Max
Age 30,149 43.24 11.70 18 33 53 64
Earnings.Past.12.Months 30,149 56,240.57 53,739.72 10,000 28,000 65,000 577,000
Usual.Weekly.Hours 30,149 43.89 8.08 35 40 45 99
Female 30,149 0.45 0.50 0 0 1 1
Married 30,149 0.63 0.48 0 0 1 1
No.High.School.Degree 30,149 0.07 0.26 0 0 0 1
High.School.Degree.or.GED 30,149 0.25 0.43 0 0 0 1
Some.College 30,149 0.22 0.41 0 0 0 1
Associates.Degree 30,149 0.09 0.29 0 0 0 1
Bachelors.Degree 30,149 0.23 0.42 0 0 0 1
Masters.Degree 30,149 0.10 0.30 0 0 0 1
Professional.Degree 30,149 0.03 0.16 0 0 0 1
Doctorate 30,149 0.02 0.13 0 0 0 1
White 30,149 0.73 0.45 0 0 1 1
Black 30,149 0.09 0.29 0 0 0 1
Asian 30,149 0.05 0.23 0 0 0 1
Hispanic 30,149 0.13 0.33 0 0 0 1

Model Analysis

# ANOVA: compare nested specifications of earnings in levels.
# NOTE(review): baseMod, mod1 and mod2 use the same formula, so the second and
# third rows of the anova table show 0 additional degrees of freedom.
baseMod <- lm(
  Earnings.Past.12.Months ~ Age + Female + Married +
    Educational.Attainment + Race.Ethnicity,
  data = dat
)
mod1 <- lm(
  Earnings.Past.12.Months ~ Age + Female + Married +
    Educational.Attainment + Race.Ethnicity,
  data = dat
)
mod2 <- lm(
  Earnings.Past.12.Months ~ Age + Female + Married +
    Educational.Attainment + Race.Ethnicity,
  data = dat
)
# Drops the race/ethnicity factor
mod3 <- lm(
  Earnings.Past.12.Months ~ Age + Female + Married + Educational.Attainment,
  data = dat
)
# Drops educational attainment as well
mod4 <- lm(
  Earnings.Past.12.Months ~ Age + Female + Married,
  data = dat
)
anova(baseMod, mod1, mod2, mod3, mod4)
##   Res.Df          RSS Df     Sum of Sq          F
## 1  30135 6.486861e+13 NA            NA         NA
## 2  30135 6.486861e+13  0  0.000000e+00         NA
## 3  30135 6.486861e+13  0  0.000000e+00         NA
## 4  30138 6.507534e+13 -3 -2.067300e+11   32.01245
## 5  30145 8.091997e+13 -7 -1.584464e+13 1051.52800
# Full coefficient tables for the models above are left disabled:
# summary(baseMod)
# summary(mod1)
# summary(mod2)
# summary(mod3)
# summary(mod4)
# NOTE(review): the 2x2 layout set here is not reset in this chunk, so it stays
# in effect for subsequent base-graphics plots.
par(mfrow=c(2,2)) # init 4 charts in 1 panel
# Standard lm diagnostic plots for the levels model
plot(baseMod)

# Interactive histogram of log earnings; the column is resolved from dat
# explicitly rather than through attach()
with(dat, plot_ly(x = log(Earnings.Past.12.Months),
                  type = "histogram"))
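[Plotly histogram of log(Earnings.Past.12.Months). x-axis ticks: 9.5, 10, 10.5, 11, 11.5, 12, 12.5, 13; y-axis ticks: 0, 200, 400, 600, 800, 1000, 1200, 1400, 1600]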
# Add a quadratic age term to account for the downward trend at higher ages
dat[["Age.Squared"]] <- dat[["Age"]]^2
# Log-earnings specification; looks much better than the levels model.
# White and No.High.School.Degree are left out of the formula and so act as
# the reference categories.
# NOTE(review): Female is not part of this specification.
mod5 <- lm(
  log(Earnings.Past.12.Months) ~ Age + Age.Squared + Married +
    Black + Asian + Hispanic +
    High.School.Degree.or.GED + Some.College + Associates.Degree +
    Bachelors.Degree + Masters.Degree + Professional.Degree + Doctorate,
  data = dat
)

summary(mod5)
## 
## Call:
## lm(formula = log(Earnings.Past.12.Months) ~ Age + Age.Squared + 
##     Married + Black + Asian + Hispanic + High.School.Degree.or.GED + 
##     Some.College + Associates.Degree + Bachelors.Degree + Masters.Degree + 
##     Professional.Degree + Doctorate, data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.53610 -0.35382 -0.00938  0.33934  2.84393 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                8.490e+00  4.456e-02 190.521   <2e-16 ***
## Age                        7.200e-02  2.133e-03  33.754   <2e-16 ***
## Age.Squared               -7.097e-04  2.481e-05 -28.605   <2e-16 ***
## Married                    1.323e-01  7.000e-03  18.906   <2e-16 ***
## Black                     -1.572e-01  1.134e-02 -13.860   <2e-16 ***
## Asian                     -2.286e-02  1.433e-02  -1.595    0.111    
## Hispanic                  -1.299e-01  1.037e-02 -12.528   <2e-16 ***
## High.School.Degree.or.GED  1.842e-01  1.411e-02  13.062   <2e-16 ***
## Some.College               3.366e-01  1.435e-02  23.464   <2e-16 ***
## Associates.Degree          4.001e-01  1.645e-02  24.329   <2e-16 ***
## Bachelors.Degree           7.020e-01  1.446e-02  48.531   <2e-16 ***
## Masters.Degree             8.536e-01  1.637e-02  52.152   <2e-16 ***
## Professional.Degree        1.298e+00  2.378e-02  54.601   <2e-16 ***
## Doctorate                  1.016e+00  2.853e-02  35.611   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5559 on 30135 degrees of freedom
## Multiple R-squared:  0.3214, Adjusted R-squared:  0.3211 
## F-statistic:  1098 on 13 and 30135 DF,  p-value: < 2.2e-16
par(mfrow=c(2,2)) # init 4 charts in 1 panel
# Standard lm diagnostic plots for the log-earnings model
plot(mod5)

# Disabled attempt that did not work:
#qplot(mod3, geom="histogram")

Model 5 testing

## Breusch-Pagan Test

# Heteroskedasticity test on mod5. Called with the lmtest:: prefix because
# library(lmtest) is only loaded further down in the file.
lmtest::bptest(mod5)  # Breusch-Pagan test
## 
##  studentized Breusch-Pagan test
## 
## data:  mod5
## BP = 679.5, df = 13, p-value < 2.2e-16
    #studentized Breusch-Pagan test

#NCV Test
# Score test for non-constant error variance; as the printed output shows, the
# variance is modelled as a function of the fitted values

car::ncvTest(mod5)  # non-constant variance score test
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 808.9328, Df = 1, p = < 2.22e-16

Heteroskedasticity Test

#Use independent variables of the model and perform multiple tests
# F test for heteroskedasticity (H0: variance is homogeneous)

ols_test_f(mod5, rhs = TRUE, multiple = TRUE)
## 
##  F Test for Heteroskedasticity
##  -----------------------------
##  Ho: Variance is homogenous
##  Ha: Variance is not homogenous
## 
##  Variables: Age Age.Squared Married Black Asian Hispanic High.School.Degree.or.GED Some.College Associates.Degree Bachelors.Degree Masters.Degree Professional.Degree Doctorate 
## 
##         Test Summary          
##  -----------------------------
##  Num DF     =    13 
##  Den DF     =    30135 
##  F          =    53.450 
##  Prob > F   =    1.981424e-138

Outliers Test

# Assessing Outliers
# Reports the observations with the largest studentized residuals
outlierTest(mod5) # Bonferroni p-value for most extreme obs
##       rstudent unadjusted p-value Bonferonni p
## 29413 5.118939         3.0915e-07    0.0093204
## 13560 4.940577         7.8309e-07    0.0236090
## 4365  4.827338         1.3905e-06    0.0419210
# The printed vector below holds the row labels of the flagged points
qqPlot(mod5, main="QQ Plot") #qq plot for studentized resid 

## [1] 13560 29413
leveragePlots(mod5) # leverage plots

Influential Test

# Influential Observations
# Cook's D plot
# Flag observations with Cook's D > 4/(n-k-1), where n is the number of
# observations and k the number of predictors. df.residual(mod5) is exactly
# n-k-1 (the intercept is already counted among the coefficients), so it is
# used directly. The earlier hand-computed form subtracted 2 more and gave
# 4/(n-k-3).
cutoff <- 4 / df.residual(mod5)
plot(mod5, which = 4)
# cook.levels only affects the residual-vs-leverage panels of plot.lm, so the
# threshold is drawn explicitly on the Cook's distance plot.
abline(h = cutoff, lty = 2, col = "red")

# Influence Plot 
influencePlot(mod5, id.method="identify", main="Influence Plot", sub="Circle size is proportial to Cook's Distance" )
##         StudRes          Hat        CookD
## 13560  4.940577 0.0008971591 1.564405e-03
## 19229  2.843030 0.0029051617 1.681771e-03
## 20011 -4.009537 0.0021605852 2.485157e-03
## 23442 -3.729216 0.0021239131 2.113398e-03
## 24518  0.131816 0.0028381943 3.532636e-06
## 29413  5.118939 0.0003412766 6.384451e-04

Residuals Test

# Normality of Residuals
# qq plot for studentized resid
# (same call as in the outlier section, repeated here for the normality check)
qqPlot(mod5, main="QQ Plot")

## [1] 13560 29413
# Density-scaled histogram of the studentized residuals of mod5, with the
# standard normal density drawn on top for comparison
sresid <- studres(mod5)
hist(sresid, freq = FALSE, main = "Distribution of Studentized Residuals")
# 40 evenly spaced points spanning the observed residual range
xfit <- seq(min(sresid), max(sresid), length.out = 40)
yfit <- dnorm(xfit)
lines(xfit, yfit)

#### Homoscedasticity Test

# Evaluate homoscedasticity
# non-constant error variance test
# (repeats the car::ncvTest(mod5) call made earlier; the output is identical)
ncvTest(mod5)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 808.9328, Df = 1, p = < 2.22e-16
# plot studentized residuals vs. fitted values 
# (also prints a suggested power transformation)
spreadLevelPlot(mod5)

## 
## Suggested power transformation:  -1.711617

Collinearity Test

# Evaluate Collinearity
# NOTE: Age.Squared is computed as Age^2, so those two terms are strongly
# correlated by construction and their VIFs stand apart from the rest.
vif(mod5) # variance inflation factors 
##                       Age               Age.Squared 
##                 60.799851                 60.137210 
##                   Married                     Black 
##                  1.117507                  1.050817 
##                     Asian                  Hispanic 
##                  1.033464                  1.152416 
## High.School.Degree.or.GED              Some.College 
##                  3.624201                  3.439634 
##         Associates.Degree          Bachelors.Degree 
##                  2.243221                  3.586620 
##            Masters.Degree       Professional.Degree 
##                  2.334136                  1.372574 
##                 Doctorate 
##                  1.241500
# Rule-of-thumb flag: TRUE for every term whose sqrt(VIF) exceeds 2
sqrt(vif(mod5)) > 2 # problem?
##                       Age               Age.Squared 
##                      TRUE                      TRUE 
##                   Married                     Black 
##                     FALSE                     FALSE 
##                     Asian                  Hispanic 
##                     FALSE                     FALSE 
## High.School.Degree.or.GED              Some.College 
##                     FALSE                     FALSE 
##         Associates.Degree          Bachelors.Degree 
##                     FALSE                     FALSE 
##            Masters.Degree       Professional.Degree 
##                     FALSE                     FALSE 
##                 Doctorate 
##                     FALSE
# Evaluate Nonlinearity
# component + residual plot 
# (one panel per term of mod5)
crPlots(mod5)

# Ceres plots 
# Disabled because the call did not work:
#ceresPlots(mod5)

Autocorrelation Test

# Test for Autocorrelated Errors
# Durbin-Watson test on the residuals of mod5 (alternative hypothesis: rho != 0)
# NOTE(review): the statistic depends on the row order of dat; confirm that the
# ordering is meaningful for this data before interpreting the result.
durbinWatsonTest(mod5)
##  lag Autocorrelation D-W Statistic p-value
##    1      0.02698786      1.945985       0
##  Alternative hypothesis: rho != 0

Global Model Test

# Global test of model assumptions
# gvlma() from the gvlma package performs a global validation of linear model
# assumptions, plus separate evaluations of skewness, kurtosis, link function
# and heteroscedasticity.
gvmodel <- gvlma(mod5) 
# Prints the regular lm summary followed by the assumption assessment table
summary(gvmodel)
## 
## Call:
## lm(formula = log(Earnings.Past.12.Months) ~ Age + Age.Squared + 
##     Married + Black + Asian + Hispanic + High.School.Degree.or.GED + 
##     Some.College + Associates.Degree + Bachelors.Degree + Masters.Degree + 
##     Professional.Degree + Doctorate, data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.53610 -0.35382 -0.00938  0.33934  2.84393 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                8.490e+00  4.456e-02 190.521   <2e-16 ***
## Age                        7.200e-02  2.133e-03  33.754   <2e-16 ***
## Age.Squared               -7.097e-04  2.481e-05 -28.605   <2e-16 ***
## Married                    1.323e-01  7.000e-03  18.906   <2e-16 ***
## Black                     -1.572e-01  1.134e-02 -13.860   <2e-16 ***
## Asian                     -2.286e-02  1.433e-02  -1.595    0.111    
## Hispanic                  -1.299e-01  1.037e-02 -12.528   <2e-16 ***
## High.School.Degree.or.GED  1.842e-01  1.411e-02  13.062   <2e-16 ***
## Some.College               3.366e-01  1.435e-02  23.464   <2e-16 ***
## Associates.Degree          4.001e-01  1.645e-02  24.329   <2e-16 ***
## Bachelors.Degree           7.020e-01  1.446e-02  48.531   <2e-16 ***
## Masters.Degree             8.536e-01  1.637e-02  52.152   <2e-16 ***
## Professional.Degree        1.298e+00  2.378e-02  54.601   <2e-16 ***
## Doctorate                  1.016e+00  2.853e-02  35.611   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5559 on 30135 degrees of freedom
## Multiple R-squared:  0.3214, Adjusted R-squared:  0.3211 
## F-statistic:  1098 on 13 and 30135 DF,  p-value: < 2.2e-16
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = mod5) 
## 
##                        Value p-value                   Decision
## Global Stat        1439.5143  0.0000 Assumptions NOT satisfied!
## Skewness            267.2908  0.0000 Assumptions NOT satisfied!
## Kurtosis           1006.9427  0.0000 Assumptions NOT satisfied!
## Link Function       165.1786  0.0000 Assumptions NOT satisfied!
## Heteroscedasticity    0.1022  0.7492    Assumptions acceptable.

Test Other Models

# Nested log-earnings specifications, adding controls block by block:
#   fit1: gender and the age profile
#   fit2: + marital status
#   fit3: + race/ethnicity indicators
#   fit4: fit2 + education indicators
#   fit5: education and race/ethnicity indicators together
# NOTE(review): fit3 and fit5 include all four race indicators (White, Black,
# Asian, Hispanic) together with an intercept, and the rendered table reports
# no estimate for Hispanic in those columns. Confirm the intended reference
# group; the other models in this file leave White out instead.
fit1 <- lm(
  log(Earnings.Past.12.Months) ~ Female + Age + Age.Squared,
  data = dat
)
fit2 <- lm(
  log(Earnings.Past.12.Months) ~ Female + Age + Age.Squared + Married,
  data = dat
)
fit3 <- lm(
  log(Earnings.Past.12.Months) ~ Female + Age + Age.Squared + Married +
    White + Black + Asian + Hispanic,
  data = dat
)
fit4 <- lm(
  log(Earnings.Past.12.Months) ~ Female + Age + Age.Squared + Married +
    High.School.Degree.or.GED + Some.College + Associates.Degree +
    Bachelors.Degree + Masters.Degree + Professional.Degree + Doctorate,
  data = dat
)
fit5 <- lm(
  log(Earnings.Past.12.Months) ~ Female + Age + Age.Squared + Married +
    High.School.Degree.or.GED + Some.College + Associates.Degree +
    Bachelors.Degree + Masters.Degree + Professional.Degree + Doctorate +
    White + Black + Asian + Hispanic,
  data = dat
)


#summary(fit) # show results

# Side-by-side HTML regression table for the five specifications
stargazer(fit1, fit2, fit3, fit4, fit5, align = TRUE, type = "html")
Dependent variable:
log(Earnings.Past.12.Months)
(1) (2) (3) (4) (5)
Female -0.255*** -0.239*** -0.238*** -0.285*** -0.281***
(0.007) (0.007) (0.007) (0.006) (0.006)
Age 0.092*** 0.082*** 0.082*** 0.072*** 0.073***
(0.002) (0.002) (0.002) (0.002) (0.002)
Age.Squared -0.001*** -0.001*** -0.001*** -0.001*** -0.001***
(0.00003) (0.00003) (0.00003) (0.00002) (0.00002)
Married 0.167*** 0.143*** 0.112*** 0.101***
(0.008) (0.008) (0.007) (0.007)
White 0.319*** 0.128***
(0.011) (0.010)
Black 0.115*** 0.003
(0.016) (0.014)
Asian 0.401*** 0.111***
(0.018) (0.016)
Hispanic
High.School.Degree.or.GED 0.248*** 0.206***
(0.013) (0.014)
Some.College 0.417*** 0.374***
(0.014) (0.014)
Associates.Degree 0.505*** 0.456***
(0.016) (0.016)
Bachelors.Degree 0.803*** 0.747***
(0.013) (0.014)
Masters.Degree 0.965*** 0.912***
(0.015) (0.016)
Professional.Degree 1.379*** 1.324***
(0.023) (0.023)
Doctorate 1.088*** 1.031***
(0.027) (0.028)
Constant 8.668*** 8.792*** 8.559*** 8.501*** 8.442***
(0.048) (0.048) (0.048) (0.043) (0.043)
Observations 30,149 30,149 30,149 30,149 30,149
R2 0.125 0.138 0.168 0.358 0.363
Adjusted R2 0.125 0.138 0.168 0.357 0.363
Residual Std. Error 0.631 (df = 30145) 0.626 (df = 30144) 0.615 (df = 30141) 0.541 (df = 30137) 0.539 (df = 30134)
F Statistic 1,435.689*** (df = 3; 30145) 1,205.886*** (df = 4; 30144) 870.324*** (df = 7; 30141) 1,525.437*** (df = 11; 30137) 1,226.889*** (df = 14; 30134)
Note: p<0.1; p<0.05; p<0.01

Effect of Education on Earnings - Male VS Female

# Work on a copy of the wrangled data for the male/female comparison
data <- dat
# data <-
#   data0 %>%
#     select(Earnings.Past.12.Months, Female, Age, Married, High.School.Degree.or.GED,
#          Some.College, Associates.Degree, Bachelors.Degree, Masters.Degree,
#          Professional.Degree, Doctorate, White, Black, Asian, Hispanic)
data[["Age.Squared"]] <- data[["Age"]]^2

#2 tables male and female
# NOTE(review): these four fits overwrite the earlier fit1-fit4 objects and
# are not referenced again in the part of the file visible here.
fit1 <- lm(
  log(Earnings.Past.12.Months) ~ Female + Age,
  data = data
)
fit2 <- lm(
  log(Earnings.Past.12.Months) ~ Female + Age + Married,
  data = data
)
fit3 <- lm(
  log(Earnings.Past.12.Months) ~ Female + Age + Married +
    High.School.Degree.or.GED + Some.College + Associates.Degree +
    Bachelors.Degree + Masters.Degree + Professional.Degree + Doctorate,
  data = data
)
fit4 <- lm(
  log(Earnings.Past.12.Months) ~ Female + Age + Married +
    White + Black + Asian + Hispanic,
  data = data
)

# Gender-specific samples (positive earnings only, so the log is defined)
male <- subset(data, Earnings.Past.12.Months > 0 & Female == 0)
female <- subset(data, Earnings.Past.12.Months > 0 & Female == 1)

# Same log-earnings specification estimated separately for men ...
model1 <- lm(
  log(Earnings.Past.12.Months) ~ Age + Age.Squared + Married +
    High.School.Degree.or.GED + Some.College + Associates.Degree +
    Bachelors.Degree + Masters.Degree + Professional.Degree + Doctorate +
    Black + Asian + Hispanic,
  data = male
)

# ... and for women
model2 <- lm(
  log(Earnings.Past.12.Months) ~ Age + Age.Squared + Married +
    High.School.Degree.or.GED + Some.College + Associates.Degree +
    Bachelors.Degree + Masters.Degree + Professional.Degree + Doctorate +
    Black + Asian + Hispanic,
  data = female
)

# Column (1) is the male model, column (2) the female model
stargazer(
  model1, model2,
  type = "html",
  title = "Effect of Education on Earnings - Male VS Female"
)
Effect of Education on Earnings - Male VS Female
Dependent variable:
log(Earnings.Past.12.Months)
(1) (2)
Age 0.078*** 0.063***
(0.003) (0.003)
Age.Squared -0.001*** -0.001***
(0.00003) (0.00003)
Married 0.177*** 0.021**
(0.010) (0.009)
High.School.Degree.or.GED 0.206*** 0.202***
(0.018) (0.022)
Some.College 0.366*** 0.374***
(0.018) (0.022)
Associates.Degree 0.419*** 0.488***
(0.022) (0.024)
Bachelors.Degree 0.720*** 0.763***
(0.019) (0.022)
Masters.Degree 0.899*** 0.920***
(0.022) (0.024)
Professional.Degree 1.321*** 1.304***
(0.031) (0.035)
Doctorate 0.941*** 1.161***
(0.036) (0.043)
Black -0.199*** -0.070***
(0.017) (0.014)
Asian -0.068*** 0.045**
(0.020) (0.019)
Hispanic -0.175*** -0.070***
(0.014) (0.014)
Constant 8.435*** 8.501***
(0.060) (0.061)
Observations 16,712 13,437
R2 0.357 0.329
Adjusted R2 0.357 0.329
Residual Std. Error 0.564 (df = 16698) 0.500 (df = 13423)
F Statistic 714.692*** (df = 13; 16698) 507.198*** (df = 13; 13423)
Note: p<0.1; p<0.05; p<0.01

Robust standard errors test

#Conduct BPG Test
# lmtest is loaded here so bptest() can be called without a package prefix
library(lmtest)

# Breusch-Pagan test on the male-sample model
bptest(model1)
## 
##  studentized Breusch-Pagan test
## 
## data:  model1
## BP = 482.63, df = 13, p-value < 2.2e-16
# Refit of the male-sample specification (same formula and data as model1),
# stored under its own name so the full coefficient table can be printed
bptestequation1 <- lm(
  log(Earnings.Past.12.Months) ~ Age + Age.Squared + Married +
    High.School.Degree.or.GED + Some.College + Associates.Degree +
    Bachelors.Degree + Masters.Degree + Professional.Degree + Doctorate +
    Black + Asian + Hispanic,
  data = male
)
summary(bptestequation1)
## 
## Call:
## lm(formula = log(Earnings.Past.12.Months) ~ Age + Age.Squared + 
##     Married + High.School.Degree.or.GED + Some.College + Associates.Degree + 
##     Bachelors.Degree + Masters.Degree + Professional.Degree + 
##     Doctorate + Black + Asian + Hispanic, data = male)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.44928 -0.34816 -0.00211  0.32827  2.76717 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                8.435e+00  6.000e-02 140.580  < 2e-16 ***
## Age                        7.815e-02  2.898e-03  26.966  < 2e-16 ***
## Age.Squared               -7.736e-04  3.369e-05 -22.963  < 2e-16 ***
## Married                    1.773e-01  9.954e-03  17.810  < 2e-16 ***
## High.School.Degree.or.GED  2.056e-01  1.779e-02  11.555  < 2e-16 ***
## Some.College               3.660e-01  1.839e-02  19.897  < 2e-16 ***
## Associates.Degree          4.195e-01  2.204e-02  19.036  < 2e-16 ***
## Bachelors.Degree           7.203e-01  1.866e-02  38.592  < 2e-16 ***
## Masters.Degree             8.991e-01  2.195e-02  40.959  < 2e-16 ***
## Professional.Degree        1.321e+00  3.080e-02  42.894  < 2e-16 ***
## Doctorate                  9.413e-01  3.628e-02  25.947  < 2e-16 ***
## Black                     -1.990e-01  1.713e-02 -11.617  < 2e-16 ***
## Asian                     -6.808e-02  1.965e-02  -3.465 0.000531 ***
## Hispanic                  -1.745e-01  1.387e-02 -12.587  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5637 on 16698 degrees of freedom
## Multiple R-squared:  0.3575, Adjusted R-squared:  0.357 
## F-statistic: 714.7 on 13 and 16698 DF,  p-value: < 2.2e-16
# Breusch-Pagan test on the female-sample model
bptest(model2)
## 
##  studentized Breusch-Pagan test
## 
## data:  model2
## BP = 261.23, df = 13, p-value < 2.2e-16
# Refit of the female-sample specification (same formula and data as model2),
# stored under its own name so the full coefficient table can be printed
bptestequation2 <- lm(
  log(Earnings.Past.12.Months) ~ Age + Age.Squared + Married +
    High.School.Degree.or.GED + Some.College + Associates.Degree +
    Bachelors.Degree + Masters.Degree + Professional.Degree + Doctorate +
    Black + Asian + Hispanic,
  data = female
)
summary(bptestequation2)
## 
## Call:
## lm(formula = log(Earnings.Past.12.Months) ~ Age + Age.Squared + 
##     Married + High.School.Degree.or.GED + Some.College + Associates.Degree + 
##     Bachelors.Degree + Masters.Degree + Professional.Degree + 
##     Doctorate + Black + Asian + Hispanic, data = female)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.27042 -0.31358 -0.00757  0.30965  2.80738 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                8.501e+00  6.133e-02 138.596  < 2e-16 ***
## Age                        6.344e-02  2.885e-03  21.993  < 2e-16 ***
## Age.Squared               -6.079e-04  3.357e-05 -18.109  < 2e-16 ***
## Married                    2.146e-02  9.109e-03   2.356   0.0185 *  
## High.School.Degree.or.GED  2.019e-01  2.155e-02   9.370  < 2e-16 ***
## Some.College               3.738e-01  2.156e-02  17.334  < 2e-16 ***
## Associates.Degree          4.876e-01  2.362e-02  20.640  < 2e-16 ***
## Bachelors.Degree           7.634e-01  2.163e-02  35.298  < 2e-16 ***
## Masters.Degree             9.200e-01  2.353e-02  39.103  < 2e-16 ***
## Professional.Degree        1.304e+00  3.463e-02  37.655  < 2e-16 ***
## Doctorate                  1.161e+00  4.278e-02  27.146  < 2e-16 ***
## Black                     -6.998e-02  1.386e-02  -5.051 4.46e-07 ***
## Asian                      4.468e-02  1.923e-02   2.324   0.0201 *  
## Hispanic                  -7.028e-02  1.433e-02  -4.905 9.45e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4998 on 13423 degrees of freedom
## Multiple R-squared:  0.3294, Adjusted R-squared:  0.3288 
## F-statistic: 507.2 on 13 and 13423 DF,  p-value: < 2.2e-16
#Install the Sandwich Package (only need to do once)
library(sandwich)

# Heteroskedasticity-consistent covariance matrix of model1's coefficient
# estimates, printed in full. type = "HC" is White's estimator.
sandwich::vcovHC(model1, type = "HC")
##                             (Intercept)           Age   Age.Squared
## (Intercept)                3.312940e-03 -1.560587e-04  1.794279e-06
## Age                       -1.560587e-04  8.183999e-06 -9.603317e-08
## Age.Squared                1.794279e-06 -9.603317e-08  1.147828e-09
## Married                    8.891925e-05 -6.417011e-06  6.278239e-08
## High.School.Degree.or.GED -2.181476e-04 -1.887838e-07  4.981505e-09
## Some.College              -2.427131e-04  6.658479e-07 -2.248424e-09
## Associates.Degree         -2.052714e-04 -1.340413e-06  2.167623e-08
## Bachelors.Degree          -2.264834e-04 -5.613305e-07  1.490452e-08
## Masters.Degree            -1.954955e-04 -1.494549e-06  2.236430e-08
## Professional.Degree       -1.986267e-04 -1.315726e-06  1.538624e-08
## Doctorate                 -1.376020e-04 -4.371636e-06  5.407927e-08
## Black                     -1.750736e-05 -1.494467e-06  1.683085e-08
## Asian                      7.001157e-06 -2.268240e-06  3.003105e-08
## Hispanic                  -1.220009e-04  1.492156e-07  7.438570e-09
##                                 Married High.School.Degree.or.GED
## (Intercept)                8.891925e-05             -2.181476e-04
## Age                       -6.417011e-06             -1.887838e-07
## Age.Squared                6.278239e-08              4.981505e-09
## Married                    9.770800e-05             -1.282818e-06
## High.School.Degree.or.GED -1.282818e-06              2.632160e-04
## Some.College              -4.932940e-06              2.089817e-04
## Associates.Degree         -6.650048e-06              2.098084e-04
## Bachelors.Degree          -1.006439e-05              2.123174e-04
## Masters.Degree            -1.309318e-05              2.103961e-04
## Professional.Degree       -1.365821e-05              2.159196e-04
## Doctorate                 -1.007374e-05              2.115895e-04
## Black                      2.026946e-05              7.702493e-06
## Asian                      4.143261e-07              1.580588e-05
## Hispanic                   1.579005e-06              6.842506e-05
##                            Some.College Associates.Degree Bachelors.Degree
## (Intercept)               -2.427131e-04     -2.052714e-04    -2.264834e-04
## Age                        6.658479e-07     -1.340413e-06    -5.613305e-07
## Age.Squared               -2.248424e-09      2.167623e-08     1.490452e-08
## Married                   -4.932940e-06     -6.650048e-06    -1.006439e-05
## High.School.Degree.or.GED  2.089817e-04      2.098084e-04     2.123174e-04
## Some.College               2.895308e-04      2.140031e-04     2.172886e-04
## Associates.Degree          2.140031e-04      4.070893e-04     2.190791e-04
## Bachelors.Degree           2.172886e-04      2.190791e-04     3.291619e-04
## Masters.Degree             2.151084e-04      2.172318e-04     2.226192e-04
## Professional.Degree        2.202783e-04      2.230758e-04     2.279391e-04
## Doctorate                  2.162843e-04      2.192455e-04     2.265779e-04
## Black                      3.960488e-06      1.477978e-05     1.567659e-05
## Asian                      1.352263e-05      1.378340e-05    -5.221528e-06
## Hispanic                   7.814225e-05      8.025560e-05     8.799707e-05
##                           Masters.Degree Professional.Degree     Doctorate
## (Intercept)                -1.954955e-04       -1.986267e-04 -1.376020e-04
## Age                        -1.494549e-06       -1.315726e-06 -4.371636e-06
## Age.Squared                 2.236430e-08        1.538624e-08  5.407927e-08
## Married                    -1.309318e-05       -1.365821e-05 -1.007374e-05
## High.School.Degree.or.GED   2.103961e-04        2.159196e-04  2.115895e-04
## Some.College                2.151084e-04        2.202783e-04  2.162843e-04
## Associates.Degree           2.172318e-04        2.230758e-04  2.192455e-04
## Bachelors.Degree            2.226192e-04        2.279391e-04  2.265779e-04
## Masters.Degree              5.177529e-04        2.291721e-04  2.322228e-04
## Professional.Degree         2.291721e-04        1.652627e-03  2.355982e-04
## Doctorate                   2.322228e-04        2.355982e-04  1.704308e-03
## Black                       7.779597e-06        2.202375e-05  5.697640e-06
## Asian                      -3.684893e-05       -1.248134e-05 -8.974407e-05
## Hispanic                    8.290181e-05        9.759826e-05  8.781607e-05
##                                   Black         Asian      Hispanic
## (Intercept)               -1.750736e-05  7.001157e-06 -1.220009e-04
## Age                       -1.494467e-06 -2.268240e-06  1.492156e-07
## Age.Squared                1.683085e-08  3.003105e-08  7.438570e-09
## Married                    2.026946e-05  4.143261e-07  1.579005e-06
## High.School.Degree.or.GED  7.702493e-06  1.580588e-05  6.842506e-05
## Some.College               3.960488e-06  1.352263e-05  7.814225e-05
## Associates.Degree          1.477978e-05  1.378340e-05  8.025560e-05
## Bachelors.Degree           1.567659e-05 -5.221528e-06  8.799707e-05
## Masters.Degree             7.779597e-06 -3.684893e-05  8.290181e-05
## Professional.Degree        2.202375e-05 -1.248134e-05  9.759826e-05
## Doctorate                  5.697640e-06 -8.974407e-05  8.781607e-05
## Black                      2.633947e-04  2.462559e-05  3.141652e-05
## Asian                      2.462559e-05  4.721117e-04  2.498310e-05
## Hispanic                   3.141652e-05  2.498310e-05  1.814340e-04
# Heteroskedasticity-consistent covariance matrix of model2's coefficient
# estimates; the diagonal elements are the variances of the parameter estimates.
sandwich::vcovHC(model2, type = "HC")
##                             (Intercept)           Age   Age.Squared
## (Intercept)                3.331548e-03 -1.478401e-04  1.692814e-06
## Age                       -1.478401e-04  7.659493e-06 -8.969081e-08
## Age.Squared                1.692814e-06 -8.969081e-08  1.068651e-09
## Married                    4.740838e-05 -4.221221e-06  4.373322e-08
## High.School.Degree.or.GED -3.777156e-04  1.175286e-06 -1.219050e-08
## Some.College              -4.111537e-04  2.371700e-06 -2.261180e-08
## Associates.Degree         -3.301934e-04 -1.847297e-06  2.756351e-08
## Bachelors.Degree          -3.880590e-04  4.457625e-07  5.266194e-09
## Masters.Degree            -3.620264e-04 -3.691869e-07  1.019549e-08
## Professional.Degree       -3.452546e-04 -1.417352e-06  2.530270e-08
## Doctorate                 -2.861851e-04 -4.737283e-06  6.420934e-08
## Black                      2.768371e-05 -3.337598e-06  3.859468e-08
## Asian                     -1.833780e-05 -2.322989e-06  2.985196e-08
## Hispanic                  -1.004579e-04 -5.486354e-07  1.424384e-08
##                                 Married High.School.Degree.or.GED
## (Intercept)                4.740838e-05             -3.777156e-04
## Age                       -4.221221e-06              1.175286e-06
## Age.Squared                4.373322e-08             -1.219050e-08
## Married                    8.167724e-05             -3.521090e-06
## High.School.Degree.or.GED -3.521090e-06              4.141168e-04
## Some.College               5.630770e-07              3.443227e-04
## Associates.Degree         -5.905552e-06              3.462857e-04
## Bachelors.Degree          -2.112128e-06              3.465555e-04
## Masters.Degree            -7.037804e-06              3.488479e-04
## Professional.Degree        1.162999e-06              3.425308e-04
## Doctorate                 -1.347654e-06              3.490443e-04
## Black                      2.038535e-05             -8.591835e-07
## Asian                      1.918065e-07              3.328186e-05
## Hispanic                   6.721087e-06              6.066474e-05
##                            Some.College Associates.Degree Bachelors.Degree
## (Intercept)               -4.111537e-04     -3.301934e-04    -3.880590e-04
## Age                        2.371700e-06     -1.847297e-06     4.457625e-07
## Age.Squared               -2.261180e-08      2.756351e-08     5.266194e-09
## Married                    5.630770e-07     -5.905552e-06    -2.112128e-06
## High.School.Degree.or.GED  3.443227e-04      3.462857e-04     3.465555e-04
## Some.College               4.243227e-04      3.467080e-04     3.480696e-04
## Associates.Degree          3.467080e-04      5.198689e-04     3.509143e-04
## Bachelors.Degree           3.480696e-04      3.509143e-04     4.432070e-04
## Masters.Degree             3.490622e-04      3.530814e-04     3.537356e-04
## Professional.Degree        3.427173e-04      3.468691e-04     3.505663e-04
## Doctorate                  3.495016e-04      3.547127e-04     3.555483e-04
## Black                     -4.884873e-06      4.696054e-07     5.231572e-06
## Asian                      3.657587e-05      4.105793e-05     2.715959e-05
## Hispanic                   6.168919e-05      6.835086e-05     7.312921e-05
##                           Masters.Degree Professional.Degree     Doctorate
## (Intercept)                -3.620264e-04       -3.452546e-04 -2.861851e-04
## Age                        -3.691869e-07       -1.417352e-06 -4.737283e-06
## Age.Squared                 1.019549e-08        2.530270e-08  6.420934e-08
## Married                    -7.037804e-06        1.162999e-06 -1.347654e-06
## High.School.Degree.or.GED   3.488479e-04        3.425308e-04  3.490443e-04
## Some.College                3.490622e-04        3.427173e-04  3.495016e-04
## Associates.Degree           3.530814e-04        3.468691e-04  3.547127e-04
## Bachelors.Degree            3.537356e-04        3.505663e-04  3.555483e-04
## Masters.Degree              5.027368e-04        3.507918e-04  3.577949e-04
## Professional.Degree         3.507918e-04        1.823331e-03  3.534760e-04
## Doctorate                   3.577949e-04        3.534760e-04  2.394017e-03
## Black                       2.212451e-06        1.506868e-05 -2.395324e-06
## Asian                       3.447709e-05       -2.329794e-05  2.300081e-05
## Hispanic                    7.707167e-05        7.230336e-05  8.356505e-05
##                                   Black         Asian      Hispanic
## (Intercept)                2.768371e-05 -1.833780e-05 -1.004579e-04
## Age                       -3.337598e-06 -2.322989e-06 -5.486354e-07
## Age.Squared                3.859468e-08  2.985196e-08  1.424384e-08
## Married                    2.038535e-05  1.918065e-07  6.721087e-06
## High.School.Degree.or.GED -8.591835e-07  3.328186e-05  6.066474e-05
## Some.College              -4.884873e-06  3.657587e-05  6.168919e-05
## Associates.Degree          4.696054e-07  4.105793e-05  6.835086e-05
## Bachelors.Degree           5.231572e-06  2.715959e-05  7.312921e-05
## Masters.Degree             2.212451e-06  3.447709e-05  7.707167e-05
## Professional.Degree        1.506868e-05 -2.329794e-05  7.230336e-05
## Doctorate                 -2.395324e-06  2.300081e-05  8.356505e-05
## Black                      1.687806e-04  2.668072e-05  2.967916e-05
## Asian                      2.668072e-05  4.632557e-04  3.264720e-05
## Hispanic                   2.967916e-05  3.264720e-05  1.853689e-04
# Robust standard errors for model1: the square roots of the diagonal
# (the coefficient variances) of the heteroskedasticity-consistent covariance
# matrix. The named vector is stored and then printed.
sandwich_se1 <- sqrt(diag(vcovHC(model1, type = "HC")))
sandwich_se1
##               (Intercept)                       Age 
##              5.755814e-02              2.860769e-03 
##               Age.Squared                   Married 
##              3.387961e-05              9.884736e-03 
## High.School.Degree.or.GED              Some.College 
##              1.622393e-02              1.701560e-02 
##         Associates.Degree          Bachelors.Degree 
##              2.017645e-02              1.814282e-02 
##            Masters.Degree       Professional.Degree 
##              2.275418e-02              4.065252e-02 
##                 Doctorate                     Black 
##              4.128326e-02              1.622944e-02 
##                     Asian                  Hispanic 
##              2.172813e-02              1.346975e-02
# Robust standard errors for model2: the square roots of the diagonal
# (the coefficient variances) of the heteroskedasticity-consistent covariance
# matrix. The named vector is stored and then printed.
sandwich_se2 <- sqrt(diag(vcovHC(model2, type = "HC")))
sandwich_se2
##               (Intercept)                       Age 
##              5.771956e-02              2.767579e-03 
##               Age.Squared                   Married 
##              3.269023e-05              9.037546e-03 
## High.School.Degree.or.GED              Some.College 
##              2.034986e-02              2.059909e-02 
##         Associates.Degree          Bachelors.Degree 
##              2.280063e-02              2.105248e-02 
##            Masters.Degree       Professional.Degree 
##              2.242179e-02              4.270048e-02 
##                 Doctorate                     Black 
##              4.892870e-02              1.299156e-02 
##                     Asian                  Hispanic 
##              2.152337e-02              1.361502e-02
#Estimate Logarithmic Model with Age in Quadratic Form
#LogEarnings.Equation = lm(log(Earnings.Past.12.Months, base = exp(1)) ~ Age + I(Age*Age) + Female 
#                          + Asian + White + Hispanic + Black + High.School.Degree.or.GED + 
#                            Some.College + Associates.Degree + Bachelors.Degree + Masters.Degree + 
#                            Professional.Degree + Doctorate, data=employedwithpay)

#summary(LogEarnings.Equation)