Executive Summary
This is an econometric analysis of the earnings premiums for educational attainment, by gender, for roughly 30,000 full-time workers. It uses data from the 2011 American Community Survey (ACS) to answer two questions:
- How do the earnings of full-time workers vary at different levels of educational attainment?
- How does the earnings premium for educational attainment vary by gender?
Code header
# Course: ECON 5300
# Title: Team Project on ACS Labor Market
# Purpose: Data wrangling
# Data: ACS Data for LMA Project-2.csv
# Date: Jan 11, 2018
# Author: Afsar Ali
rm(list=ls(all=TRUE)) # Clear all data in environment
# The tidyverse package contains ggplot2, tibble, tidyr, readr, purrr, and dplyr among others
library(tidyverse)
# The gridExtra package contains grid.arrange function used to combine plots
#library(gridExtra)
library(GGally)
#library(knitr)
#library(htmlTable)
#library(kableExtra)
#library(stringr)
# Package used for making interactive plots
library(plotly)
library(car)
library(ggplot2)
library(stargazer)
library(car)
library(olsrr)
library(gvlma)
library(MASS)
#library(flexdashboard)
# Load data
# Raw ACS extract; the working sample `dat` is derived from it below.
datd <- read.csv("ACS Data for LMA Project-2.csv", header = TRUE)

# Wrangle data
# Keep workers earning at least 10,000 who usually work 35+ hours a week and
# worked 40+ weeks in the past 12 months, and drop the Biracial,
# Hawaiian/Pacific Islander, Other Race and American Indian groups.
# Conditions given to one filter() call are combined with AND, which is the
# same as chaining one filter() per condition.
dat <-
  datd %>%
  filter(
    Earnings.Past.12.Months >= 10000,
    Usual.Weekly.Hours >= 35,
    Worked.40..Weeks.During.Past.12.Months == 1,
    Biracial == 0,
    Hawaiian.or.Pacific.Islander == 0,
    Other.Race == 0,
    American.Indian.or.Native.American == 0
  )

# Keep only the analysis variables, selected by name instead of by position
# (the script previously dropped positions 15, 18, 20, 21, 22, 25 and 26), so a
# reordered or extended CSV cannot silently drop the wrong columns.
# These are the 19 columns, in order, that glimpse(dat) reports after the
# positional drop.
# NOTE(review): confirm against the raw CSV header that no other column is
# expected downstream.
keep_cols <- c(
  "Age", "Earnings.Past.12.Months", "Usual.Weekly.Hours",
  "Female", "Married",
  "No.High.School.Degree", "High.School.Degree.or.GED", "Some.College",
  "Associates.Degree", "Bachelors.Degree", "Masters.Degree",
  "Professional.Degree", "Doctorate", "Educational.Attainment",
  "White", "Black", "Asian", "Hispanic", "Race.Ethnicity"
)
dat <- dat[, keep_cols]

# attach() is kept because later plotting calls refer to bare column names.
# It exposes a snapshot of dat: columns added to dat afterwards are not
# visible through the attached copy.
attach(dat)
glimpse(dat)
## Observations: 30,149
## Variables: 19
## $ Age <int> 58, 45, 54, 44, 48, 47, 58, 60, 53, ...
## $ Earnings.Past.12.Months <int> 43500, 23200, 61000, 13000, 30500, 2...
## $ Usual.Weekly.Hours <int> 37, 35, 40, 40, 45, 56, 50, 39, 40, ...
## $ Female <int> 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, ...
## $ Married <int> 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, ...
## $ No.High.School.Degree <int> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, ...
## $ High.School.Degree.or.GED <int> 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, ...
## $ Some.College <int> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...
## $ Associates.Degree <int> 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ Bachelors.Degree <int> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...
## $ Masters.Degree <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ Professional.Degree <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ Doctorate <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ Educational.Attainment <fct> Associates Degree, High School Degre...
## $ White <int> 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, ...
## $ Black <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
## $ Asian <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...
## $ Hispanic <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...
## $ Race.Ethnicity <fct> White, Hispanic, White, White, Black...
summary(dat)
## Age Earnings.Past.12.Months Usual.Weekly.Hours
## Min. :18.00 Min. : 10000 Min. :35.00
## 1st Qu.:33.00 1st Qu.: 28000 1st Qu.:40.00
## Median :44.00 Median : 42000 Median :40.00
## Mean :43.24 Mean : 56241 Mean :43.89
## 3rd Qu.:53.00 3rd Qu.: 65000 3rd Qu.:45.00
## Max. :64.00 Max. :577000 Max. :99.00
##
## Female Married No.High.School.Degree
## Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :1.0000 Median :0.00000
## Mean :0.4457 Mean :0.6274 Mean :0.07025
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.0000 Max. :1.00000
##
## High.School.Degree.or.GED Some.College Associates.Degree
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.2484 Mean :0.2195 Mean :0.0938
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
##
## Bachelors.Degree Masters.Degree Professional.Degree Doctorate
## Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.0000 Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.2274 Mean :0.09914 Mean :0.02554 Mean :0.01589
## 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.00000 Max. :1.00000 Max. :1.00000
##
## Educational.Attainment White Black
## High School Degree :7490 Min. :0.0000 Min. :0.00000
## Bachelors Degree :6857 1st Qu.:0.0000 1st Qu.:0.00000
## Some College :6618 Median :1.0000 Median :0.00000
## Masters Degree :2989 Mean :0.7276 Mean :0.09221
## Associates Degree :2828 3rd Qu.:1.0000 3rd Qu.:0.00000
## No High School Degree:2118 Max. :1.0000 Max. :1.00000
## (Other) :1249
## Asian Hispanic Race.Ethnicity
## Min. :0.00000 Min. :0.0000 White :21937
## 1st Qu.:0.00000 1st Qu.:0.0000 Hispanic: 3788
## Median :0.00000 Median :0.0000 Black : 2780
## Mean :0.05453 Mean :0.1256 Asian : 1644
## 3rd Qu.:0.00000 3rd Qu.:0.0000 Biracial: 0
## Max. :1.00000 Max. :1.0000 Hawaiian: 0
## (Other) : 0
Data Summary Analysis
- About 30,000 entries of the original 65,000 remain.
- Median and mean age are close at 44 and 43 respectively, indicating that age might be somewhat normally distributed.
- Median earnings are 42,000 while mean earnings are about 56,000, indicating right skewness.
- Median weekly hours worked is 40 while mean is almost 44, indicating some right skewness. Some people must be working very long weeks.
# Split the working sample into four race-by-gender subsets.
# White and Female are 0/1 indicators; "nonwhite" means White == 0.
white_male      <- filter(dat, White == 1, Female == 0)
white_female    <- filter(dat, White == 1, Female == 1)
nonwhite_male   <- filter(dat, White == 0, Female == 0)
nonwhite_female <- filter(dat, White == 0, Female == 1)

# Groups, listed by the original author in order of highest average earnings.
# The figure beside each group is its row count (the four add up to the
# 30,149 rows of dat), not an earnings amount:
#   white_male      - 12336
#   nonwhite_male   -  4376
#   white_female    -  9601
#   nonwhite_female -  3836
Histogram of some selected Variables
- Age: Doesn’t look like there are any outliers.
- Race: Mostly White.
- Education: Varies, mostly High School. Maybe omit No High School as the dummy variable.
# Univariate distributions of selected variables.
# Columns are read from dat explicitly instead of through the attach()ed
# search path, so each plot states its data source. main/xlab are set to the
# bare column name, which is what hist() derived from the unqualified calls.
hist(dat$Age, main = "Histogram of Age", xlab = "Age")
# Most common age is between 45 and 55.
# Very few under 20.
# Between 25 and 55 is fairly evenly represented.
hist(
  dat$Earnings.Past.12.Months,
  breaks = seq(0, 600000, by = 25000),
  main = "Histogram of Earnings.Past.12.Months",
  xlab = "Earnings.Past.12.Months"
)
# Highly right skewed.
# Majority of occurrences are under 100,000.
# Most common frequency is between 25,000 and 50,000.
# Highest earners are above 550,000 (the sample maximum is 577,000).
hist(
  dat$Usual.Weekly.Hours,
  breaks = seq(35, 100, by = 1),
  main = "Histogram of Usual.Weekly.Hours",
  xlab = "Usual.Weekly.Hours"
)
# Highly right skewed.
# Most common frequency is 40 hours.
# Largest amount of hours worked in a week is 99.
hist(dat$Female, main = "Histogram of Female", xlab = "Female") # Histogram of gender
# There are more males than females in the data set, but not by a huge amount.
hist(dat$Married, main = "Histogram of Married", xlab = "Married") # Histogram of married
# There are many more married than unmarried in the data set.
plot_ly(x = dat$Race.Ethnicity,
        type = "histogram")
plot_ly(x = dat$Educational.Attainment,
        type = "histogram")
Bar Plots of categorical data
- Most common frequency is High School Degree.
- Some college and Bachelor’s degree also exhibit high frequency.
- Least common is Doctorate, followed by Professional Degree.
# Horizontal bar chart: number of rows in dat per Educational.Attainment level.
ggplot(dat, aes(x = Educational.Attainment)) +
  geom_bar() +
  coord_flip()
- Most common frequency by far is white.
- Hispanic, black, and Asian show some amount of common frequency.
- All other races show too little data points to be useful.
- May want to filter out all ethnicity besides White, Hispanic, Black, and Asian if research question has to do with Race/Ethnicity.
# Horizontal bar chart: number of rows in dat per Race.Ethnicity level.
ggplot(dat, aes(x = Race.Ethnicity)) +
  geom_bar() +
  coord_flip()
Descriptive Stats
stargazer(dat, type = "html", title="Descriptive statistics", digits=2)
Statistic | N | Mean | St. Dev. | Min | Pctl(25) | Pctl(75) | Max |
Age | 30,149 | 43.24 | 11.70 | 18 | 33 | 53 | 64 |
Earnings.Past.12.Months | 30,149 | 56,240.57 | 53,739.72 | 10,000 | 28,000 | 65,000 | 577,000 |
Usual.Weekly.Hours | 30,149 | 43.89 | 8.08 | 35 | 40 | 45 | 99 |
Female | 30,149 | 0.45 | 0.50 | 0 | 0 | 1 | 1 |
Married | 30,149 | 0.63 | 0.48 | 0 | 0 | 1 | 1 |
No.High.School.Degree | 30,149 | 0.07 | 0.26 | 0 | 0 | 0 | 1 |
High.School.Degree.or.GED | 30,149 | 0.25 | 0.43 | 0 | 0 | 0 | 1 |
Some.College | 30,149 | 0.22 | 0.41 | 0 | 0 | 0 | 1 |
Associates.Degree | 30,149 | 0.09 | 0.29 | 0 | 0 | 0 | 1 |
Bachelors.Degree | 30,149 | 0.23 | 0.42 | 0 | 0 | 0 | 1 |
Masters.Degree | 30,149 | 0.10 | 0.30 | 0 | 0 | 0 | 1 |
Professional.Degree | 30,149 | 0.03 | 0.16 | 0 | 0 | 0 | 1 |
Doctorate | 30,149 | 0.02 | 0.13 | 0 | 0 | 0 | 1 |
White | 30,149 | 0.73 | 0.45 | 0 | 0 | 1 | 1 |
Black | 30,149 | 0.09 | 0.29 | 0 | 0 | 0 | 1 |
Asian | 30,149 | 0.05 | 0.23 | 0 | 0 | 0 | 1 |
Hispanic | 30,149 | 0.13 | 0.33 | 0 | 0 | 0 | 1 |
Model Analysis
- Without Race, the model looks unchanged
- Need to use log
# ANOVA
# Compare level-earnings specifications from the fullest (baseMod) down to the
# smallest (mod4) with sequential F tests.
baseMod <- lm(
  Earnings.Past.12.Months ~ Age + Female + Married +
    Educational.Attainment + Race.Ethnicity,
  dat
)
# mod1 and mod2 use exactly the same formula and data as baseMod, so the fitted
# object is reused instead of refitting the same regression twice more. The
# names are kept because anova() below still lists them; their rows
# necessarily show Df = 0 and no F statistic.
mod1 <- baseMod
mod2 <- baseMod
# Drop Race.Ethnicity.
mod3 <- lm(
  Earnings.Past.12.Months ~ Age + Female + Married + Educational.Attainment,
  dat
)
# Additionally drop Educational.Attainment.
mod4 <- lm(Earnings.Past.12.Months ~ Age + Female + Married, dat)
anova(baseMod, mod1, mod2, mod3, mod4)
Res.Df <dbl> | RSS <dbl> | Df <dbl> | Sum of Sq <dbl> | F <dbl> | ||
---|---|---|---|---|---|---|
1 | 30135 | 6.486861e+13 | NA | NA | NA | |
2 | 30135 | 6.486861e+13 | 0 | 0.000000e+00 | NA | |
3 | 30135 | 6.486861e+13 | 0 | 0.000000e+00 | NA | |
4 | 30138 | 6.507534e+13 | -3 | -2.067300e+11 | 32.01245 | |
5 | 30145 | 8.091997e+13 | -7 | -1.584464e+13 | 1051.52800 |
# summary(baseMod)
# summary(mod1)
# summary(mod2)
# summary(mod3)
# summary(mod4)
# Standard lm diagnostic plots for the level-earnings model, four to a panel.
par(mfrow = c(2, 2)) # init 4 charts in 1 panel
plot(baseMod)
# Histogram of log earnings, to judge whether the log scale tames the right
# skew. The column is read from dat directly rather than via attach().
plot_ly(x = log(dat$Earnings.Past.12.Months),
        type = "histogram")
# Add a squared age term so the model can capture a non-linear (quadratic)
# age-earnings profile.
dat$Age.Squared <- dat$Age^2

# Log-earnings model: looks much better than the level specification.
# No.High.School.Degree and White are the omitted reference categories.
mod5 <- lm(
  log(Earnings.Past.12.Months) ~ Age + Age.Squared + Married +
    Black + Asian + Hispanic +
    High.School.Degree.or.GED + Some.College + Associates.Degree +
    Bachelors.Degree + Masters.Degree + Professional.Degree + Doctorate,
  data = dat
)
summary(mod5)
##
## Call:
## lm(formula = log(Earnings.Past.12.Months) ~ Age + Age.Squared +
## Married + Black + Asian + Hispanic + High.School.Degree.or.GED +
## Some.College + Associates.Degree + Bachelors.Degree + Masters.Degree +
## Professional.Degree + Doctorate, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.53610 -0.35382 -0.00938 0.33934 2.84393
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.490e+00 4.456e-02 190.521 <2e-16 ***
## Age 7.200e-02 2.133e-03 33.754 <2e-16 ***
## Age.Squared -7.097e-04 2.481e-05 -28.605 <2e-16 ***
## Married 1.323e-01 7.000e-03 18.906 <2e-16 ***
## Black -1.572e-01 1.134e-02 -13.860 <2e-16 ***
## Asian -2.286e-02 1.433e-02 -1.595 0.111
## Hispanic -1.299e-01 1.037e-02 -12.528 <2e-16 ***
## High.School.Degree.or.GED 1.842e-01 1.411e-02 13.062 <2e-16 ***
## Some.College 3.366e-01 1.435e-02 23.464 <2e-16 ***
## Associates.Degree 4.001e-01 1.645e-02 24.329 <2e-16 ***
## Bachelors.Degree 7.020e-01 1.446e-02 48.531 <2e-16 ***
## Masters.Degree 8.536e-01 1.637e-02 52.152 <2e-16 ***
## Professional.Degree 1.298e+00 2.378e-02 54.601 <2e-16 ***
## Doctorate 1.016e+00 2.853e-02 35.611 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5559 on 30135 degrees of freedom
## Multiple R-squared: 0.3214, Adjusted R-squared: 0.3211
## F-statistic: 1098 on 13 and 30135 DF, p-value: < 2.2e-16
par(mfrow=c(2,2)) # init 4 charts in 1 panel
plot(mod5)
#qplot(mod3, geom="histogram") doesnt work
Model 5 testing
## Breusch-Pagan Test
lmtest::bptest(mod5) # Breusch-Pagan test
##
## studentized Breusch-Pagan test
##
## data: mod5
## BP = 679.5, df = 13, p-value < 2.2e-16
#studentized Breusch-Pagan test
#NCV Test
car::ncvTest(mod5) # Breusch-Pagan test
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 808.9328, Df = 1, p = < 2.22e-16
Heteroskedasticity Test
#Use independent variables of the model and perform multiple tests
ols_test_f(mod5, rhs = TRUE, multiple = TRUE)
##
## F Test for Heteroskedasticity
## -----------------------------
## Ho: Variance is homogenous
## Ha: Variance is not homogenous
##
## Variables: Age Age.Squared Married Black Asian Hispanic High.School.Degree.or.GED Some.College Associates.Degree Bachelors.Degree Masters.Degree Professional.Degree Doctorate
##
## Test Summary
## -----------------------------
## Num DF = 13
## Den DF = 30135
## F = 53.450
## Prob > F = 1.981424e-138
Outliers Test
# Assessing Outliers
outlierTest(mod5) # Bonferonni p-value for most extreme obs
## rstudent unadjusted p-value Bonferonni p
## 29413 5.118939 3.0915e-07 0.0093204
## 13560 4.940577 7.8309e-07 0.0236090
## 4365 4.827338 1.3905e-06 0.0419210
qqPlot(mod5, main="QQ Plot") #qq plot for studentized resid
## [1] 13560 29413
leveragePlots(mod5) # leverage plots
Influential Test
# Influential Observations
# Cook's D plot
# Identify D values > 4 / (n - k - 1), where n is the number of observations
# used in the fit and k the number of predictors. df.residual() is exactly
# n - k - 1 for this model. The earlier expression
# nrow(dat) - length(coefficients) - 2 counted the intercept as a predictor
# and subtracted 2 more, giving n - k - 3.
cutoff <- 4 / df.residual(mod5)
plot(mod5, which = 4, cook.levels = cutoff)
# Influence Plot
influencePlot(mod5, id.method="identify", main="Influence Plot", sub="Circle size is proportial to Cook's Distance" )
StudRes <dbl> | Hat <dbl> | CookD <dbl> | ||
---|---|---|---|---|
13560 | 4.940577 | 0.0008971591 | 1.564405e-03 | |
19229 | 2.843030 | 0.0029051617 | 1.681771e-03 | |
20011 | -4.009537 | 0.0021605852 | 2.485157e-03 | |
23442 | -3.729216 | 0.0021239131 | 2.113398e-03 | |
24518 | 0.131816 | 0.0028381943 | 3.532636e-06 | |
29413 | 5.118939 | 0.0003412766 | 6.384451e-04 |
Residuals Test
# Normality of Residuals
# qq plot for studentized resid
qqPlot(mod5, main="QQ Plot")
## [1] 13560 29413
# Distribution of studentized residuals, with a standard normal density
# overlaid for visual comparison.
sresid <- studres(mod5)
hist(sresid, freq = FALSE,
     main = "Distribution of Studentized Residuals")
# 40 evenly spaced points across the residual range; length.out is spelled
# out rather than relying on partial matching of `length`.
xfit <- seq(min(sresid), max(sresid), length.out = 40)
yfit <- dnorm(xfit)
lines(xfit, yfit)
#### Homoscedasticity Test
# Evaluate homoscedasticity
# non-constant error variance test
ncvTest(mod5)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 808.9328, Df = 1, p = < 2.22e-16
# plot studentized residuals vs. fitted values
spreadLevelPlot(mod5)
##
## Suggested power transformation: -1.711617
Collinearity Test
# Evaluate Collinearity
vif(mod5) # variance inflation factors
## Age Age.Squared
## 60.799851 60.137210
## Married Black
## 1.117507 1.050817
## Asian Hispanic
## 1.033464 1.152416
## High.School.Degree.or.GED Some.College
## 3.624201 3.439634
## Associates.Degree Bachelors.Degree
## 2.243221 3.586620
## Masters.Degree Professional.Degree
## 2.334136 1.372574
## Doctorate
## 1.241500
sqrt(vif(mod5)) > 2 # problem?
## Age Age.Squared
## TRUE TRUE
## Married Black
## FALSE FALSE
## Asian Hispanic
## FALSE FALSE
## High.School.Degree.or.GED Some.College
## FALSE FALSE
## Associates.Degree Bachelors.Degree
## FALSE FALSE
## Masters.Degree Professional.Degree
## FALSE FALSE
## Doctorate
## FALSE
# Evaluate Nonlinearity
# component + residual plot
crPlots(mod5)
# Ceres plots
#ceresPlots(mod5) #didnt work
Global Model Test
# Global test of model assumptions
#The gvlma( ) function in the gvlma package performs a global validation of linear model assumptions as well as separate evaluations of skewness, kurtosis, and heteroscedasticity.
# Global test of model assumptions
gvmodel <- gvlma(mod5)
summary(gvmodel)
##
## Call:
## lm(formula = log(Earnings.Past.12.Months) ~ Age + Age.Squared +
## Married + Black + Asian + Hispanic + High.School.Degree.or.GED +
## Some.College + Associates.Degree + Bachelors.Degree + Masters.Degree +
## Professional.Degree + Doctorate, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.53610 -0.35382 -0.00938 0.33934 2.84393
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.490e+00 4.456e-02 190.521 <2e-16 ***
## Age 7.200e-02 2.133e-03 33.754 <2e-16 ***
## Age.Squared -7.097e-04 2.481e-05 -28.605 <2e-16 ***
## Married 1.323e-01 7.000e-03 18.906 <2e-16 ***
## Black -1.572e-01 1.134e-02 -13.860 <2e-16 ***
## Asian -2.286e-02 1.433e-02 -1.595 0.111
## Hispanic -1.299e-01 1.037e-02 -12.528 <2e-16 ***
## High.School.Degree.or.GED 1.842e-01 1.411e-02 13.062 <2e-16 ***
## Some.College 3.366e-01 1.435e-02 23.464 <2e-16 ***
## Associates.Degree 4.001e-01 1.645e-02 24.329 <2e-16 ***
## Bachelors.Degree 7.020e-01 1.446e-02 48.531 <2e-16 ***
## Masters.Degree 8.536e-01 1.637e-02 52.152 <2e-16 ***
## Professional.Degree 1.298e+00 2.378e-02 54.601 <2e-16 ***
## Doctorate 1.016e+00 2.853e-02 35.611 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5559 on 30135 degrees of freedom
## Multiple R-squared: 0.3214, Adjusted R-squared: 0.3211
## F-statistic: 1098 on 13 and 30135 DF, p-value: < 2.2e-16
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = mod5)
##
## Value p-value Decision
## Global Stat 1439.5143 0.0000 Assumptions NOT satisfied!
## Skewness 267.2908 0.0000 Assumptions NOT satisfied!
## Kurtosis 1006.9427 0.0000 Assumptions NOT satisfied!
## Link Function 165.1786 0.0000 Assumptions NOT satisfied!
## Heteroscedasticity 0.1022 0.7492 Assumptions acceptable.
Test Other Models
# Log-earnings specifications built up step by step:
#   fit1: gender + quadratic age
#   fit2: + marital status
#   fit3: + race/ethnicity
#   fit4: fit2 + education
#   fit5: fit2 + education + race/ethnicity
# Race/ethnicity enters as Black + Asian + Hispanic with White as the omitted
# reference group, matching mod5. The sample is restricted to these four
# groups, so including all four dummies makes them sum to 1 on every row,
# which is perfectly collinear with the intercept: lm() then drops one
# coefficient (Hispanic showed up blank in the table). Fit and R-squared are
# unchanged; race coefficients are now read relative to White.
fit1 <- lm(log(Earnings.Past.12.Months) ~ Female + Age + Age.Squared,
           data = dat)
fit2 <- lm(log(Earnings.Past.12.Months) ~ Female + Age + Age.Squared + Married,
           data = dat)
fit3 <- lm(log(Earnings.Past.12.Months) ~ Female + Age + Age.Squared + Married +
             Black + Asian + Hispanic,
           data = dat)
fit4 <- lm(log(Earnings.Past.12.Months) ~ Female + Age + Age.Squared + Married +
             High.School.Degree.or.GED + Some.College + Associates.Degree +
             Bachelors.Degree + Masters.Degree + Professional.Degree + Doctorate,
           data = dat)
fit5 <- lm(log(Earnings.Past.12.Months) ~ Female + Age + Age.Squared + Married +
             High.School.Degree.or.GED + Some.College + Associates.Degree +
             Bachelors.Degree + Masters.Degree + Professional.Degree + Doctorate +
             Black + Asian + Hispanic,
           data = dat)
#summary(fit) # show results
stargazer(fit1, fit2, fit3, fit4, fit5, align = TRUE, type = "html")
Dependent variable: | |||||
log(Earnings.Past.12.Months) | |||||
(1) | (2) | (3) | (4) | (5) | |
Female | -0.255*** | -0.239*** | -0.238*** | -0.285*** | -0.281*** |
(0.007) | (0.007) | (0.007) | (0.006) | (0.006) | |
Age | 0.092*** | 0.082*** | 0.082*** | 0.072*** | 0.073*** |
(0.002) | (0.002) | (0.002) | (0.002) | (0.002) | |
Age.Squared | -0.001*** | -0.001*** | -0.001*** | -0.001*** | -0.001*** |
(0.00003) | (0.00003) | (0.00003) | (0.00002) | (0.00002) | |
Married | 0.167*** | 0.143*** | 0.112*** | 0.101*** | |
(0.008) | (0.008) | (0.007) | (0.007) | ||
White | 0.319*** | 0.128*** | |||
(0.011) | (0.010) | ||||
Black | 0.115*** | 0.003 | |||
(0.016) | (0.014) | ||||
Asian | 0.401*** | 0.111*** | |||
(0.018) | (0.016) | ||||
Hispanic | |||||
High.School.Degree.or.GED | 0.248*** | 0.206*** | |||
(0.013) | (0.014) | ||||
Some.College | 0.417*** | 0.374*** | |||
(0.014) | (0.014) | ||||
Associates.Degree | 0.505*** | 0.456*** | |||
(0.016) | (0.016) | ||||
Bachelors.Degree | 0.803*** | 0.747*** | |||
(0.013) | (0.014) | ||||
Masters.Degree | 0.965*** | 0.912*** | |||
(0.015) | (0.016) | ||||
Professional.Degree | 1.379*** | 1.324*** | |||
(0.023) | (0.023) | ||||
Doctorate | 1.088*** | 1.031*** | |||
(0.027) | (0.028) | ||||
Constant | 8.668*** | 8.792*** | 8.559*** | 8.501*** | 8.442*** |
(0.048) | (0.048) | (0.048) | (0.043) | (0.043) | |
Observations | 30,149 | 30,149 | 30,149 | 30,149 | 30,149 |
R2 | 0.125 | 0.138 | 0.168 | 0.358 | 0.363 |
Adjusted R2 | 0.125 | 0.138 | 0.168 | 0.357 | 0.363 |
Residual Std. Error | 0.631 (df = 30145) | 0.626 (df = 30144) | 0.615 (df = 30141) | 0.541 (df = 30137) | 0.539 (df = 30134) |
F Statistic | 1,435.689*** (df = 3; 30145) | 1,205.886*** (df = 4; 30144) | 870.324*** (df = 7; 30141) | 1,525.437*** (df = 11; 30137) | 1,226.889*** (df = 14; 30134) |
Note: | p<0.1; p<0.05; p<0.01 |
Effect of Education on Earnings - Male VS Female
# Work on a copy of the analysis sample for the male/female comparison.
data <- dat
# data <-
# data0 %>%
# select(Earnings.Past.12.Months, Female, Age, Married, High.School.Degree.or.GED,
# Some.College, Associates.Degree, Bachelors.Degree, Masters.Degree,
# Professional.Degree, Doctorate, White, Black, Asian, Hispanic)
# Quadratic age term (recomputed here so this section does not depend on an
# earlier one having added it to dat).
data$Age.Squared <- data$Age^2

# Pooled specifications with a Female dummy. These reassign fit1..fit4 and are
# not passed to the stargazer() call below.
fit1 <- lm(log(Earnings.Past.12.Months) ~ Female + Age, data = data)
fit2 <- lm(log(Earnings.Past.12.Months) ~ Female + Age + Married, data = data)
fit3 <- lm(log(Earnings.Past.12.Months) ~ Female + Age + Married +
             High.School.Degree.or.GED + Some.College + Associates.Degree +
             Bachelors.Degree + Masters.Degree + Professional.Degree + Doctorate,
           data = data)
# White is the omitted reference group: with all four race dummies the columns
# sum to 1 on every row and lm() has to drop one of them.
fit4 <- lm(log(Earnings.Past.12.Months) ~ Female + Age + Married +
             Black + Asian + Hispanic,
           data = data)

# 2 tables: male and female
# Split by gender and fit the same specification to each half, so every
# coefficient (including the education premiums) can differ between them.
male <- subset(data, Earnings.Past.12.Months > 0 & Female == 0)
female <- subset(data, Earnings.Past.12.Months > 0 & Female == 1)
model1 <- lm(log(Earnings.Past.12.Months) ~ Age + Age.Squared + Married +
               High.School.Degree.or.GED + Some.College + Associates.Degree +
               Bachelors.Degree + Masters.Degree + Professional.Degree + Doctorate +
               Black + Asian + Hispanic,
             data = male)
model2 <- lm(log(Earnings.Past.12.Months) ~ Age + Age.Squared + Married +
               High.School.Degree.or.GED + Some.College + Associates.Degree +
               Bachelors.Degree + Masters.Degree + Professional.Degree + Doctorate +
               Black + Asian + Hispanic,
             data = female)
# Column (1) is the male model, column (2) the female model.
stargazer(model1, model2, type="html", title="Effect of Education on Earnings - Male VS Female")
Dependent variable: | ||
log(Earnings.Past.12.Months) | ||
(1) | (2) | |
Age | 0.078*** | 0.063*** |
(0.003) | (0.003) | |
Age.Squared | -0.001*** | -0.001*** |
(0.00003) | (0.00003) | |
Married | 0.177*** | 0.021** |
(0.010) | (0.009) | |
High.School.Degree.or.GED | 0.206*** | 0.202*** |
(0.018) | (0.022) | |
Some.College | 0.366*** | 0.374*** |
(0.018) | (0.022) | |
Associates.Degree | 0.419*** | 0.488*** |
(0.022) | (0.024) | |
Bachelors.Degree | 0.720*** | 0.763*** |
(0.019) | (0.022) | |
Masters.Degree | 0.899*** | 0.920*** |
(0.022) | (0.024) | |
Professional.Degree | 1.321*** | 1.304*** |
(0.031) | (0.035) | |
Doctorate | 0.941*** | 1.161*** |
(0.036) | (0.043) | |
Black | -0.199*** | -0.070*** |
(0.017) | (0.014) | |
Asian | -0.068*** | 0.045** |
(0.020) | (0.019) | |
Hispanic | -0.175*** | -0.070*** |
(0.014) | (0.014) | |
Constant | 8.435*** | 8.501*** |
(0.060) | (0.061) | |
Observations | 16,712 | 13,437 |
R2 | 0.357 | 0.329 |
Adjusted R2 | 0.357 | 0.329 |
Residual Std. Error | 0.564 (df = 16698) | 0.500 (df = 13423) |
F Statistic | 714.692*** (df = 13; 16698) | 507.198*** (df = 13; 13423) |
Note: | p<0.1; p<0.05; p<0.01 |
Robust standard errors test
#Conduct BPG Test
library(lmtest)
bptest(model1)
##
## studentized Breusch-Pagan test
##
## data: model1
## BP = 482.63, df = 13, p-value < 2.2e-16
# Male-sample regression printed alongside the Breusch-Pagan result.
# NOTE(review): this is the same formula and data as model1, so it reproduces
# model1's estimates; it is not an auxiliary regression of squared residuals.
bptestequation1 <- lm(log(Earnings.Past.12.Months) ~ Age + Age.Squared + Married +
                        High.School.Degree.or.GED + Some.College + Associates.Degree +
                        Bachelors.Degree + Masters.Degree + Professional.Degree + Doctorate +
                        Black + Asian + Hispanic, data = male)
summary(bptestequation1)
##
## Call:
## lm(formula = log(Earnings.Past.12.Months) ~ Age + Age.Squared +
## Married + High.School.Degree.or.GED + Some.College + Associates.Degree +
## Bachelors.Degree + Masters.Degree + Professional.Degree +
## Doctorate + Black + Asian + Hispanic, data = male)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.44928 -0.34816 -0.00211 0.32827 2.76717
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.435e+00 6.000e-02 140.580 < 2e-16 ***
## Age 7.815e-02 2.898e-03 26.966 < 2e-16 ***
## Age.Squared -7.736e-04 3.369e-05 -22.963 < 2e-16 ***
## Married 1.773e-01 9.954e-03 17.810 < 2e-16 ***
## High.School.Degree.or.GED 2.056e-01 1.779e-02 11.555 < 2e-16 ***
## Some.College 3.660e-01 1.839e-02 19.897 < 2e-16 ***
## Associates.Degree 4.195e-01 2.204e-02 19.036 < 2e-16 ***
## Bachelors.Degree 7.203e-01 1.866e-02 38.592 < 2e-16 ***
## Masters.Degree 8.991e-01 2.195e-02 40.959 < 2e-16 ***
## Professional.Degree 1.321e+00 3.080e-02 42.894 < 2e-16 ***
## Doctorate 9.413e-01 3.628e-02 25.947 < 2e-16 ***
## Black -1.990e-01 1.713e-02 -11.617 < 2e-16 ***
## Asian -6.808e-02 1.965e-02 -3.465 0.000531 ***
## Hispanic -1.745e-01 1.387e-02 -12.587 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5637 on 16698 degrees of freedom
## Multiple R-squared: 0.3575, Adjusted R-squared: 0.357
## F-statistic: 714.7 on 13 and 16698 DF, p-value: < 2.2e-16
bptest(model2)
##
## studentized Breusch-Pagan test
##
## data: model2
## BP = 261.23, df = 13, p-value < 2.2e-16
# Female-sample regression printed alongside the Breusch-Pagan result.
# NOTE(review): this is the same formula and data as model2, so it reproduces
# model2's estimates; it is not an auxiliary regression of squared residuals.
bptestequation2 <- lm(log(Earnings.Past.12.Months) ~ Age + Age.Squared + Married +
                        High.School.Degree.or.GED + Some.College + Associates.Degree +
                        Bachelors.Degree + Masters.Degree + Professional.Degree + Doctorate +
                        Black + Asian + Hispanic, data = female)
summary(bptestequation2)
##
## Call:
## lm(formula = log(Earnings.Past.12.Months) ~ Age + Age.Squared +
## Married + High.School.Degree.or.GED + Some.College + Associates.Degree +
## Bachelors.Degree + Masters.Degree + Professional.Degree +
## Doctorate + Black + Asian + Hispanic, data = female)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.27042 -0.31358 -0.00757 0.30965 2.80738
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.501e+00 6.133e-02 138.596 < 2e-16 ***
## Age 6.344e-02 2.885e-03 21.993 < 2e-16 ***
## Age.Squared -6.079e-04 3.357e-05 -18.109 < 2e-16 ***
## Married 2.146e-02 9.109e-03 2.356 0.0185 *
## High.School.Degree.or.GED 2.019e-01 2.155e-02 9.370 < 2e-16 ***
## Some.College 3.738e-01 2.156e-02 17.334 < 2e-16 ***
## Associates.Degree 4.876e-01 2.362e-02 20.640 < 2e-16 ***
## Bachelors.Degree 7.634e-01 2.163e-02 35.298 < 2e-16 ***
## Masters.Degree 9.200e-01 2.353e-02 39.103 < 2e-16 ***
## Professional.Degree 1.304e+00 3.463e-02 37.655 < 2e-16 ***
## Doctorate 1.161e+00 4.278e-02 27.146 < 2e-16 ***
## Black -6.998e-02 1.386e-02 -5.051 4.46e-07 ***
## Asian 4.468e-02 1.923e-02 2.324 0.0201 *
## Hispanic -7.028e-02 1.433e-02 -4.905 9.45e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4998 on 13423 degrees of freedom
## Multiple R-squared: 0.3294, Adjusted R-squared: 0.3288
## F-statistic: 507.2 on 13 and 13423 DF, p-value: < 2.2e-16
# Load the sandwich package. This only attaches it; installing it
# (install.packages("sandwich")) is a separate one-time step.
library(sandwich)
# Print the variance-covariance matrix of the male model's parameter estimates
# as computed by vcovHC().
# NOTE(review): type = "HC" is documented by sandwich as White's estimator
# (equivalent to "HC0") - confirm this is the intended variant.
vcovHC(model1, type = "HC")
## (Intercept) Age Age.Squared
## (Intercept) 3.312940e-03 -1.560587e-04 1.794279e-06
## Age -1.560587e-04 8.183999e-06 -9.603317e-08
## Age.Squared 1.794279e-06 -9.603317e-08 1.147828e-09
## Married 8.891925e-05 -6.417011e-06 6.278239e-08
## High.School.Degree.or.GED -2.181476e-04 -1.887838e-07 4.981505e-09
## Some.College -2.427131e-04 6.658479e-07 -2.248424e-09
## Associates.Degree -2.052714e-04 -1.340413e-06 2.167623e-08
## Bachelors.Degree -2.264834e-04 -5.613305e-07 1.490452e-08
## Masters.Degree -1.954955e-04 -1.494549e-06 2.236430e-08
## Professional.Degree -1.986267e-04 -1.315726e-06 1.538624e-08
## Doctorate -1.376020e-04 -4.371636e-06 5.407927e-08
## Black -1.750736e-05 -1.494467e-06 1.683085e-08
## Asian 7.001157e-06 -2.268240e-06 3.003105e-08
## Hispanic -1.220009e-04 1.492156e-07 7.438570e-09
## Married High.School.Degree.or.GED
## (Intercept) 8.891925e-05 -2.181476e-04
## Age -6.417011e-06 -1.887838e-07
## Age.Squared 6.278239e-08 4.981505e-09
## Married 9.770800e-05 -1.282818e-06
## High.School.Degree.or.GED -1.282818e-06 2.632160e-04
## Some.College -4.932940e-06 2.089817e-04
## Associates.Degree -6.650048e-06 2.098084e-04
## Bachelors.Degree -1.006439e-05 2.123174e-04
## Masters.Degree -1.309318e-05 2.103961e-04
## Professional.Degree -1.365821e-05 2.159196e-04
## Doctorate -1.007374e-05 2.115895e-04
## Black 2.026946e-05 7.702493e-06
## Asian 4.143261e-07 1.580588e-05
## Hispanic 1.579005e-06 6.842506e-05
## Some.College Associates.Degree Bachelors.Degree
## (Intercept) -2.427131e-04 -2.052714e-04 -2.264834e-04
## Age 6.658479e-07 -1.340413e-06 -5.613305e-07
## Age.Squared -2.248424e-09 2.167623e-08 1.490452e-08
## Married -4.932940e-06 -6.650048e-06 -1.006439e-05
## High.School.Degree.or.GED 2.089817e-04 2.098084e-04 2.123174e-04
## Some.College 2.895308e-04 2.140031e-04 2.172886e-04
## Associates.Degree 2.140031e-04 4.070893e-04 2.190791e-04
## Bachelors.Degree 2.172886e-04 2.190791e-04 3.291619e-04
## Masters.Degree 2.151084e-04 2.172318e-04 2.226192e-04
## Professional.Degree 2.202783e-04 2.230758e-04 2.279391e-04
## Doctorate 2.162843e-04 2.192455e-04 2.265779e-04
## Black 3.960488e-06 1.477978e-05 1.567659e-05
## Asian 1.352263e-05 1.378340e-05 -5.221528e-06
## Hispanic 7.814225e-05 8.025560e-05 8.799707e-05
## Masters.Degree Professional.Degree Doctorate
## (Intercept) -1.954955e-04 -1.986267e-04 -1.376020e-04
## Age -1.494549e-06 -1.315726e-06 -4.371636e-06
## Age.Squared 2.236430e-08 1.538624e-08 5.407927e-08
## Married -1.309318e-05 -1.365821e-05 -1.007374e-05
## High.School.Degree.or.GED 2.103961e-04 2.159196e-04 2.115895e-04
## Some.College 2.151084e-04 2.202783e-04 2.162843e-04
## Associates.Degree 2.172318e-04 2.230758e-04 2.192455e-04
## Bachelors.Degree 2.226192e-04 2.279391e-04 2.265779e-04
## Masters.Degree 5.177529e-04 2.291721e-04 2.322228e-04
## Professional.Degree 2.291721e-04 1.652627e-03 2.355982e-04
## Doctorate 2.322228e-04 2.355982e-04 1.704308e-03
## Black 7.779597e-06 2.202375e-05 5.697640e-06
## Asian -3.684893e-05 -1.248134e-05 -8.974407e-05
## Hispanic 8.290181e-05 9.759826e-05 8.781607e-05
## Black Asian Hispanic
## (Intercept) -1.750736e-05 7.001157e-06 -1.220009e-04
## Age -1.494467e-06 -2.268240e-06 1.492156e-07
## Age.Squared 1.683085e-08 3.003105e-08 7.438570e-09
## Married 2.026946e-05 4.143261e-07 1.579005e-06
## High.School.Degree.or.GED 7.702493e-06 1.580588e-05 6.842506e-05
## Some.College 3.960488e-06 1.352263e-05 7.814225e-05
## Associates.Degree 1.477978e-05 1.378340e-05 8.025560e-05
## Bachelors.Degree 1.567659e-05 -5.221528e-06 8.799707e-05
## Masters.Degree 7.779597e-06 -3.684893e-05 8.290181e-05
## Professional.Degree 2.202375e-05 -1.248134e-05 9.759826e-05
## Doctorate 5.697640e-06 -8.974407e-05 8.781607e-05
## Black 2.633947e-04 2.462559e-05 3.141652e-05
## Asian 2.462559e-05 4.721117e-04 2.498310e-05
## Hispanic 3.141652e-05 2.498310e-05 1.814340e-04
vcovHC(model2, type = "HC")#the diagonal elements are the variances of the parameter estimates
## (Intercept) Age Age.Squared
## (Intercept) 3.331548e-03 -1.478401e-04 1.692814e-06
## Age -1.478401e-04 7.659493e-06 -8.969081e-08
## Age.Squared 1.692814e-06 -8.969081e-08 1.068651e-09
## Married 4.740838e-05 -4.221221e-06 4.373322e-08
## High.School.Degree.or.GED -3.777156e-04 1.175286e-06 -1.219050e-08
## Some.College -4.111537e-04 2.371700e-06 -2.261180e-08
## Associates.Degree -3.301934e-04 -1.847297e-06 2.756351e-08
## Bachelors.Degree -3.880590e-04 4.457625e-07 5.266194e-09
## Masters.Degree -3.620264e-04 -3.691869e-07 1.019549e-08
## Professional.Degree -3.452546e-04 -1.417352e-06 2.530270e-08
## Doctorate -2.861851e-04 -4.737283e-06 6.420934e-08
## Black 2.768371e-05 -3.337598e-06 3.859468e-08
## Asian -1.833780e-05 -2.322989e-06 2.985196e-08
## Hispanic -1.004579e-04 -5.486354e-07 1.424384e-08
## Married High.School.Degree.or.GED
## (Intercept) 4.740838e-05 -3.777156e-04
## Age -4.221221e-06 1.175286e-06
## Age.Squared 4.373322e-08 -1.219050e-08
## Married 8.167724e-05 -3.521090e-06
## High.School.Degree.or.GED -3.521090e-06 4.141168e-04
## Some.College 5.630770e-07 3.443227e-04
## Associates.Degree -5.905552e-06 3.462857e-04
## Bachelors.Degree -2.112128e-06 3.465555e-04
## Masters.Degree -7.037804e-06 3.488479e-04
## Professional.Degree 1.162999e-06 3.425308e-04
## Doctorate -1.347654e-06 3.490443e-04
## Black 2.038535e-05 -8.591835e-07
## Asian 1.918065e-07 3.328186e-05
## Hispanic 6.721087e-06 6.066474e-05
## Some.College Associates.Degree Bachelors.Degree
## (Intercept) -4.111537e-04 -3.301934e-04 -3.880590e-04
## Age 2.371700e-06 -1.847297e-06 4.457625e-07
## Age.Squared -2.261180e-08 2.756351e-08 5.266194e-09
## Married 5.630770e-07 -5.905552e-06 -2.112128e-06
## High.School.Degree.or.GED 3.443227e-04 3.462857e-04 3.465555e-04
## Some.College 4.243227e-04 3.467080e-04 3.480696e-04
## Associates.Degree 3.467080e-04 5.198689e-04 3.509143e-04
## Bachelors.Degree 3.480696e-04 3.509143e-04 4.432070e-04
## Masters.Degree 3.490622e-04 3.530814e-04 3.537356e-04
## Professional.Degree 3.427173e-04 3.468691e-04 3.505663e-04
## Doctorate 3.495016e-04 3.547127e-04 3.555483e-04
## Black -4.884873e-06 4.696054e-07 5.231572e-06
## Asian 3.657587e-05 4.105793e-05 2.715959e-05
## Hispanic 6.168919e-05 6.835086e-05 7.312921e-05
## Masters.Degree Professional.Degree Doctorate
## (Intercept) -3.620264e-04 -3.452546e-04 -2.861851e-04
## Age -3.691869e-07 -1.417352e-06 -4.737283e-06
## Age.Squared 1.019549e-08 2.530270e-08 6.420934e-08
## Married -7.037804e-06 1.162999e-06 -1.347654e-06
## High.School.Degree.or.GED 3.488479e-04 3.425308e-04 3.490443e-04
## Some.College 3.490622e-04 3.427173e-04 3.495016e-04
## Associates.Degree 3.530814e-04 3.468691e-04 3.547127e-04
## Bachelors.Degree 3.537356e-04 3.505663e-04 3.555483e-04
## Masters.Degree 5.027368e-04 3.507918e-04 3.577949e-04
## Professional.Degree 3.507918e-04 1.823331e-03 3.534760e-04
## Doctorate 3.577949e-04 3.534760e-04 2.394017e-03
## Black 2.212451e-06 1.506868e-05 -2.395324e-06
## Asian 3.447709e-05 -2.329794e-05 2.300081e-05
## Hispanic 7.707167e-05 7.230336e-05 8.356505e-05
## Black Asian Hispanic
## (Intercept) 2.768371e-05 -1.833780e-05 -1.004579e-04
## Age -3.337598e-06 -2.322989e-06 -5.486354e-07
## Age.Squared 3.859468e-08 2.985196e-08 1.424384e-08
## Married 2.038535e-05 1.918065e-07 6.721087e-06
## High.School.Degree.or.GED -8.591835e-07 3.328186e-05 6.066474e-05
## Some.College -4.884873e-06 3.657587e-05 6.168919e-05
## Associates.Degree 4.696054e-07 4.105793e-05 6.835086e-05
## Bachelors.Degree 5.231572e-06 2.715959e-05 7.312921e-05
## Masters.Degree 2.212451e-06 3.447709e-05 7.707167e-05
## Professional.Degree 1.506868e-05 -2.329794e-05 7.230336e-05
## Doctorate -2.395324e-06 2.300081e-05 8.356505e-05
## Black 1.687806e-04 2.668072e-05 2.967916e-05
## Asian 2.668072e-05 4.632557e-04 3.264720e-05
## Hispanic 2.967916e-05 3.264720e-05 1.853689e-04
# Robust standard errors for the male model: the square root of each diagonal
# element (the variances) of the vcovHC() matrix, printed on screen.
sandwich_se1 <- diag(vcovHC(model1, type = "HC"))^0.5
sandwich_se1
## (Intercept) Age
## 5.755814e-02 2.860769e-03
## Age.Squared Married
## 3.387961e-05 9.884736e-03
## High.School.Degree.or.GED Some.College
## 1.622393e-02 1.701560e-02
## Associates.Degree Bachelors.Degree
## 2.017645e-02 1.814282e-02
## Masters.Degree Professional.Degree
## 2.275418e-02 4.065252e-02
## Doctorate Black
## 4.128326e-02 1.622944e-02
## Asian Hispanic
## 2.172813e-02 1.346975e-02
# Robust standard errors for the female model: the square root of each diagonal
# element (the variances) of the vcovHC() matrix, printed on screen.
sandwich_se2 <- diag(vcovHC(model2, type = "HC"))^0.5
sandwich_se2
## (Intercept) Age
## 5.771956e-02 2.767579e-03
## Age.Squared Married
## 3.269023e-05 9.037546e-03
## High.School.Degree.or.GED Some.College
## 2.034986e-02 2.059909e-02
## Associates.Degree Bachelors.Degree
## 2.280063e-02 2.105248e-02
## Masters.Degree Professional.Degree
## 2.242179e-02 4.270048e-02
## Doctorate Black
## 4.892870e-02 1.299156e-02
## Asian Hispanic
## 2.152337e-02 1.361502e-02
#Estimate Logarithmic Model with Age in Quadratic Form
#LogEarnings.Equation = lm(log(Earnings.Past.12.Months, base = exp(1)) ~ Age + I(Age*Age) + Female
# + Asian + White + Hispanic + Black + High.School.Degree.or.GED +
# Some.College + Associates.Degree + Bachelors.Degree + Masters.Degree +
# Professional.Degree + Doctorate, data=employedwithpay)
#summary(LogEarnings.Equation)