Code header
# Course: BUAN 5210
# Title: Final Project EDA
# Purpose: Intial EDA for the Final Project
# Date: Dec 1st, 2017
# Author: Afsar Ali
# Clear working environment
rm(list=ls(all=TRUE))
# The tidyverse package contains ggplot2, tibble, tidyr, readr, purr, and dplyr among others
library(tidyverse)
# The gridExtra package contains grid.arrange function used to combine plots
library(gridExtra)
# The GGally package contains ggpairs which is a custom correlation graph for ggplot2
library(GGally)
# Load gridExtra so can plot more than one graph with grid.arrange
library(gridExtra)
#load other packages
library(plotly)
library(psych)
library(knitr)
library(htmlTable)
#Load and Review data
dat <- read.csv("ECON5100_project_data.csv", header = TRUE)
dat <- mutate(dat, DRatio = (DLTT/AT)) #Calculating long term debt ratio
#Make a subset of relevent data
dat1 <- select(dat, "AT","NI", "CH", "DLTT", "PRCH_C", "PRCL_C", "DRatio", "TDC2", "SALARY", "BONUS", "OTHCOMP", "AGE", "GENDER")
dat1<- mutate (dat1, ageg = cut(AGE, 6)) #grouping age by 6
datm <- dat1 %>% filter(GENDER =="MALE") #nale only
datf <- dat1 %>% filter(GENDER =="FEMALE") #female only
#attach the file for use
attach(dat1)
glimpse(dat)
## Observations: 8,300
## Variables: 99
## $ GVKEY <int> 21542, 21542, 21542, 21542, 21542, 215...
## $ CO_PER_ROL <int> 38455, 38456, 38457, 38459, 52715, 547...
## $ YEAR <int> 2016, 2016, 2016, 2016, 2016, 2016, 20...
## $ TICKER <fct> AAON, AAON, AAON, AAON, AAON, AAON, AB...
## $ CUSIP <fct> 00036020, 00036020, 00036020, 00036020...
## $ EXECID <int> 36415, 36416, 36417, 36419, 49230, 510...
## $ EXEC_FNAME <fct> Norman, Robert, Kathy, Scott, Sam, Gar...
## $ EXEC_FULLNAME <fct> Norman H. Asbjornson, Robert G. Fergus...
## $ EXEC_LNAME <fct> Asbjornson, Fergus, Sheffield, Asbjorn...
## $ EXEC_MNAME <fct> H., G., I., M., J., D., P., Hlavinka, ...
## $ GENDER <fct> MALE, MALE, FEMALE, MALE, MALE, MALE, ...
## $ NAMEPREFIX <fct> Mr., Mr., Ms., Mr., Mr., Mr., Mr., Ms....
## $ PAGE <int> 80, 75, 63, 47, 39, 56, 59, 52, 54, 44...
## $ BECAMECEO <fct> 1/1/1989, , , , , , , , 3/31/2015, , ,...
## $ EXECRANK <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ JOINED_CO <fct> , , , , , , , , , , , 1/1/1973, , 3/1/...
## $ LEFTCO <fct> , , , , , , , , , , , , 5/7/2001, , , ...
## $ LEFTOFC <fct> , , , , , , , , , , , , 5/7/2001, , , ...
## $ PCEO <fct> CEO, , , , , , , , CEO, , , CEO, , , ,...
## $ PCFO <fct> , , , CFO, , , , , , CFO, , , , , , CF...
## $ REASON <fct> , , , , , , , , , , , , RETIRED, , , ,...
## $ REJOIN <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ RELEFT <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ TITLE <fct> Chairman of the Board and Chief Execut...
## $ AGE <int> 80, 75, 63, 47, 39, 56, 59, 52, 54, 44...
## $ ALLOTHPD <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ ALLOTHTOT <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ BONUS <dbl> -8.715, 6.192, 0.000, 0.000, 6.436, 0....
## $ CEOANN <fct> CEO, , , , , , , , CEO, , , CEO, , , ,...
## $ CFOANN <fct> , , , CFO, , , , , , CFO, , , , , , CF...
## $ CHG_CTRL_PYMT <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.0...
## $ COMMENT <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ DEFER_BALANCE_TOT <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.0...
## $ DEFER_CONTRIB_CO_TOT <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.0...
## $ DEFER_CONTRIB_EXEC_TOT <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.0...
## $ DEFER_EARNINGS_TOT <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.0...
## $ DEFER_RPT_AS_COMP_TOT <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ DEFER_WITHDR_TOT <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.0...
## $ EIP_UNEARN_NUM <dbl> 102.719, 11.510, 16.514, 16.505, 12.32...
## $ EIP_UNEARN_VAL <dbl> 3394.862, 380.405, 545.787, 545.490, 4...
## $ EXECDIR <int> 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1,...
## $ EXECRANKANN <int> 1, 5, 2, 3, 4, 6, 2, 4, 1, 5, 3, 1, 4,...
## $ INTERLOCK <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ LTIP <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ NONEQ_INCENT <dbl> 359.521, 68.797, 85.898, 85.891, 71.51...
## $ OLD_DATAFMT_FLAG <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ OPTION_AWARDS <dbl> 320.497, 30.047, 65.167, 65.167, 41.36...
## $ OPTION_AWARDS_BLK_VALUE <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ OPTION_AWARDS_FV <dbl> 733.690, 68.668, 113.368, 113.368, 81....
## $ OPTION_AWARDS_NUM <dbl> 73.310, 6.930, 11.485, 11.485, 8.240, ...
## $ OPTION_AWARDS_RPT_VALUE <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ OPT_EXER_NUM <dbl> 0.000, 0.000, 22.000, 0.000, 12.050, 0...
## $ OPT_EXER_VAL <dbl> 0.000, 0.000, 522.728, 0.000, 214.078,...
## $ OPT_UNEX_EXER_EST_VAL <dbl> 72.655, 272.082, 1475.336, 120.486, 0....
## $ OPT_UNEX_EXER_NUM <dbl> 7.664, 11.570, 57.711, 5.461, 0.000, 0...
## $ OPT_UNEX_UNEXER_EST_VAL <dbl> 715.048, 133.702, 225.357, 225.357, 19...
## $ OPT_UNEX_UNEXER_NUM <dbl> 65.646, 8.860, 15.024, 15.024, 11.940,...
## $ OTHANN <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ OTHCOMP <dbl> 92.977, 24.150, 22.473, 32.135, 25.951...
## $ PENSION_CHG <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.0...
## $ PENSION_PYMTS_TOT <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ PENSION_VALUE_TOT <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.0...
## $ REPRICE <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ RET_YRS <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ RSTKGRNT <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ RSTKVYRS <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ SALARY <dbl> 480.960, 197.220, 246.240, 246.220, 20...
## $ SAL_PCT <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, NA,...
## $ SHROWN_EXCL_OPTS <dbl> 9753.256, 19.725, 13.644, 1386.240, 10...
## $ SHROWN_EXCL_OPTS_PCT <dbl> 18.528, 0.037, 0.026, 2.633, 0.020, 0....
## $ SHROWN_TOT <dbl> 9775.582, 35.381, 58.152, 1393.998, 15...
## $ SHROWN_TOT_PCT <dbl> 18.570, 0.067, 0.110, 2.648, 0.029, 0....
## $ SHRS_VEST_NUM <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.0...
## $ SHRS_VEST_VAL <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.0...
## $ STOCK_AWARDS <dbl> 782.817, 60.135, 130.609, 130.609, 82....
## $ STOCK_AWARDS_FV <dbl> 2833.765, 293.901, 413.851, 413.557, 3...
## $ STOCK_UNVEST_NUM <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.0...
## $ STOCK_UNVEST_VAL <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.0...
## $ TDC1 <dbl> 4492.198, 658.928, 881.830, 891.171, 6...
## $ TDC1_PCT <dbl> 28.146, 13.513, 24.696, 25.870, 20.450...
## $ TDC2 <dbl> 3758.508, 590.260, 1291.190, 777.803, ...
## $ TDC2_PCT <dbl> 21.543, 8.931, 95.937, 17.883, 54.013,...
## $ TERM_PYMT <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.0...
## $ TITLEANN <fct> Chairman of the Board and Chief Execut...
## $ TOTAL_ALT1 <dbl> 4492.198, 658.928, 881.830, 891.171, 6...
## $ TOTAL_ALT1_PCT <dbl> 28.146, 13.513, 24.696, 25.870, 20.450...
## $ TOTAL_ALT2 <dbl> 924.743, 296.359, 877.339, 364.246, 52...
## $ TOTAL_ALT2_PCT <dbl> -4.885, -3.810, 133.496, -67.252, -74....
## $ TOTAL_CURR <dbl> 472.245, 203.412, 246.240, 246.220, 21...
## $ TOTAL_CURR_PCT <dbl> -11.236, -1.730, -4.721, -4.721, -1.72...
## $ TOTAL_SEC <dbl> 2028.057, 386.541, 550.387, 560.022, 4...
## $ TOTAL_SEC_PCT <dbl> -14.579, -9.653, 4.799, 6.407, -1.359,...
## $ AT <dbl> 256.530, 256.530, 256.530, 256.530, 25...
## $ CH <dbl> 24.153, 24.153, 24.153, 24.153, 24.153...
## $ DLTT <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 268.3, 2...
## $ NI <dbl> 53.376, 53.376, 53.376, 53.376, 53.376...
## $ PRCH_C <dbl> 33.9000, 33.9000, 33.9000, 33.9000, 33...
## $ PRCL_C <dbl> 19.06, 19.06, 19.06, 19.06, 19.06, 19....
## $ DRatio <dbl> 0.00000000, 0.00000000, 0.00000000, 0....
summary(dat)
## GVKEY CO_PER_ROL YEAR TICKER
## Min. : 1045 Min. : 79 Min. :2016 ETR : 10
## 1st Qu.: 8444 1st Qu.:37770 1st Qu.:2016 ENDP : 9
## Median : 23083 Median :48168 Median :2016 WLTW : 9
## Mean : 54156 Mean :44293 Mean :2016 AAP : 8
## 3rd Qu.: 66446 3rd Qu.:52516 3rd Qu.:2016 AIG : 8
## Max. :316056 Max. :55541 Max. :2016 ARNC : 8
## (Other):8248
## CUSIP EXECID EXEC_FNAME
## 29364G10: 10 Min. : 13 John : 364
## G3040110: 9 1st Qu.:32995 Michael: 340
## G9662910: 9 Median :43632 David : 295
## 00508X20: 8 Mean :39663 Robert : 263
## 00751Y10: 8 3rd Qu.:48445 James : 244
## 02687478: 8 Max. :51697 Thomas : 205
## (Other) :8248 (Other):6589
## EXEC_FULLNAME EXEC_LNAME EXEC_MNAME
## Ajay J. Sabherwal : 2 Smith : 59 :1113
## Alan B. Miller : 2 Johnson : 31 J. : 824
## Amin J. Khoury : 2 Miller : 31 A. : 682
## Barry Diller : 2 Jones : 24 M. : 556
## Barry M. Smith : 2 Brown : 21 L. : 412
## Charles A. Mathis, CPA: 2 Williams: 20 D. : 373
## (Other) :8288 (Other) :8114 (Other):4340
## GENDER NAMEPREFIX PAGE BECAMECEO
## FEMALE: 508 Mr. :7247 Min. :29.00 :6373
## MALE :7792 Ms. : 817 1st Qu.:49.00 1/1/2013: 29
## Dr. : 222 Median :54.00 1/1/2015: 24
## : 7 Mean :54.23 1/1/2017: 24
## Pro : 2 3rd Qu.:59.00 1/1/2016: 21
## Cap : 1 Max. :96.00 1/1/2012: 20
## (Other): 4 NA's :60 (Other) :1809
## EXECRANK JOINED_CO LEFTCO LEFTOFC
## Min. :1.000 :7537 :8276 :7846
## 1st Qu.:3.000 1/1/1986: 15 1/1/1999 : 1 12/31/2016: 20
## Median :5.000 1/1/1980: 14 1/1/2000 : 1 3/1/2017 : 11
## Mean :4.356 1/1/1992: 13 1/1/2006 : 1 1/1/2017 : 8
## 3rd Qu.:5.000 1/1/1995: 13 1/1/2007 : 1 10/1/2016 : 7
## Max. :9.000 1/1/1989: 12 1/25/2005: 1 12/31/2015: 6
## NA's :7189 (Other) : 696 (Other) : 19 (Other) : 402
## PCEO PCFO REASON REJOIN RELEFT
## :6713 :6721 :8277 Mode:logical Mode:logical
## CEO:1587 CFO:1579 RESIGNED: 10 NA's:8300 NA's:8300
## RETIRED : 8
## UNKNOWN : 5
##
##
##
## TITLE
## Chief Executive Officer, President and Director : 530
## Chief Financial Officer and Executive Vice President: 468
## Chief Financial Officer and Senior Vice President : 232
## Chief Executive Officer and Director : 183
## Chairman and Chief Executive Officer : 173
## Chief Financial Officer : 168
## (Other) :6546
## AGE ALLOTHPD ALLOTHTOT BONUS
## Min. :29.00 Mode:logical Mode:logical Min. : -8.71
## 1st Qu.:49.00 NA's:8300 NA's:8300 1st Qu.: 0.00
## Median :54.00 Median : 0.00
## Mean :54.23 Mean : 107.53
## 3rd Qu.:59.00 3rd Qu.: 0.00
## Max. :96.00 Max. :32000.00
## NA's :60
## CEOANN CFOANN CHG_CTRL_PYMT COMMENT
## :6713 :6721 Min. : 0.0 Mode:logical
## CEO:1587 CFO:1579 1st Qu.: 969.4 NA's:8300
## Median : 3374.2
## Mean : 6800.4
## 3rd Qu.: 8090.6
## Max. :520877.8
##
## DEFER_BALANCE_TOT DEFER_CONTRIB_CO_TOT DEFER_CONTRIB_EXEC_TOT
## Min. : 0.0 Min. : -5.691 Min. : 0.00
## 1st Qu.: 0.0 1st Qu.: 0.000 1st Qu.: 0.00
## Median : 0.0 Median : 0.000 Median : 0.00
## Mean : 1167.8 Mean : 42.137 Mean : 99.95
## 3rd Qu.: 519.1 3rd Qu.: 19.468 3rd Qu.: 17.80
## Max. :232879.1 Max. :13405.253 Max. :35028.01
##
## DEFER_EARNINGS_TOT DEFER_RPT_AS_COMP_TOT DEFER_WITHDR_TOT
## Min. :-19442.88 Min. : 0.000 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.00
## Median : 0.00 Median : 0.000 Median : 0.00
## Mean : 107.14 Mean : 2.313 Mean : 67.45
## 3rd Qu.: 28.42 3rd Qu.: 0.000 3rd Qu.: 0.00
## Max. : 39213.68 Max. :1234.283 Max. :71144.81
##
## EIP_UNEARN_NUM EIP_UNEARN_VAL EXECDIR EXECRANKANN
## Min. : 0.00 Min. : 0.0 Min. :0.0000 Min. : 1.000
## 1st Qu.: 0.00 1st Qu.: 0.0 1st Qu.:0.0000 1st Qu.: 2.000
## Median : 11.97 Median : 445.9 Median :0.0000 Median : 3.000
## Mean : 56.08 Mean : 2112.4 Mean :0.2563 Mean : 3.184
## 3rd Qu.: 49.72 3rd Qu.: 2024.7 3rd Qu.:1.0000 3rd Qu.: 4.000
## Max. :9240.00 Max. :157794.0 Max. :1.0000 Max. :10.000
## NA's :49
## INTERLOCK LTIP NONEQ_INCENT OLD_DATAFMT_FLAG
## Min. :0 Mode:logical Min. : 0.00 Min. :0
## 1st Qu.:0 NA's:8300 1st Qu.: 65.23 1st Qu.:0
## Median :0 Median : 314.27 Median :0
## Mean :0 Mean : 621.99 Mean :0
## 3rd Qu.:0 3rd Qu.: 730.43 3rd Qu.:0
## Max. :0 Max. :46999.10 Max. :0
##
## OPTION_AWARDS OPTION_AWARDS_BLK_VALUE OPTION_AWARDS_FV
## Min. : 0.0 Mode:logical Min. : 0.0
## 1st Qu.: 0.0 NA's:8300 1st Qu.: 0.0
## Median : 0.0 Median : 0.0
## Mean : 387.8 Mean : 387.9
## 3rd Qu.: 310.3 3rd Qu.: 302.4
## Max. :77990.7 Max. :77990.7
##
## OPTION_AWARDS_NUM OPTION_AWARDS_RPT_VALUE OPT_EXER_NUM
## Min. : 0.00 Mode:logical Min. : 0.00
## 1st Qu.: 0.00 NA's:8300 1st Qu.: 0.00
## Median : 0.00 Median : 0.00
## Mean : 37.92 Mean : 28.61
## 3rd Qu.: 28.75 3rd Qu.: 5.62
## Max. :10805.60 Max. :3547.09
##
## OPT_EXER_VAL OPT_UNEX_EXER_EST_VAL OPT_UNEX_EXER_NUM
## Min. : 0.0 Min. : 0.0 Min. : 0.00
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.00
## Median : 0.0 Median : 0.0 Median : 12.50
## Mean : 826.5 Mean : 3259.2 Mean : 144.28
## 3rd Qu.: 81.5 3rd Qu.: 992.1 3rd Qu.: 96.72
## Max. :156626.4 Max. :608378.1 Max. :12320.05
##
## OPT_UNEX_UNEXER_EST_VAL OPT_UNEX_UNEXER_NUM OTHANN
## Min. : 0.0 Min. : 0.000 Mode:logical
## 1st Qu.: 0.0 1st Qu.: 0.000 NA's:8300
## Median : 0.0 Median : 6.091
## Mean : 868.5 Mean : 83.203
## 3rd Qu.: 434.1 3rd Qu.: 68.016
## Max. :273459.5 Max. :13279.313
##
## OTHCOMP PENSION_CHG PENSION_PYMTS_TOT
## Min. : -758.69 Min. :-5340.86 Min. : 0.00
## 1st Qu.: 14.94 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 40.15 Median : 0.00 Median : 0.00
## Mean : 191.82 Mean : 141.21 Mean : 29.92
## 3rd Qu.: 110.37 3rd Qu.: 12.83 3rd Qu.: 0.00
## Max. :58410.15 Max. :12890.42 Max. :50437.34
##
## PENSION_VALUE_TOT REPRICE RET_YRS RSTKGRNT
## Min. : 0.00 Min. :0 Mode:logical Mode:logical
## 1st Qu.: 0.00 1st Qu.:0 NA's:8300 NA's:8300
## Median : 0.00 Median :0
## Mean : 1085.24 Mean :0
## 3rd Qu.: 53.49 3rd Qu.:0
## Max. :98934.00 Max. :0
##
## RSTKVYRS SALARY SAL_PCT SHROWN_EXCL_OPTS
## Mode:logical Min. : 0.0 Min. : -100.000 Min. : 0.0
## NA's:8300 1st Qu.: 350.0 1st Qu.: 0.000 1st Qu.: 17.5
## Median : 475.0 Median : 2.932 Median : 54.6
## Mean : 555.7 Mean : 14.736 Mean : 569.8
## 3rd Qu.: 680.1 3rd Qu.: 7.048 3rd Qu.: 162.2
## Max. :8303.2 Max. :12898.267 Max. :409513.0
## NA's :1221 NA's :189
## SHROWN_EXCL_OPTS_PCT SHROWN_TOT SHROWN_TOT_PCT
## Min. : 0.0000 Min. : 0.0 Min. : 0.0000
## 1st Qu.: 0.0160 1st Qu.: 27.3 1st Qu.: 0.0300
## Median : 0.0600 Median : 84.0 Median : 0.0980
## Mean : 0.5051 Mean : 668.8 Mean : 0.6218
## 3rd Qu.: 0.1870 3rd Qu.: 248.1 3rd Qu.: 0.2922
## Max. :68.9730 Max. :409513.0 Max. :68.9730
## NA's :234 NA's :189 NA's :560
## SHRS_VEST_NUM SHRS_VEST_VAL STOCK_AWARDS
## Min. : 0.000 Min. : 0.00 Min. : 0.0
## 1st Qu.: 2.378 1st Qu.: 64.96 1st Qu.: 254.8
## Median : 11.500 Median : 362.28 Median : 716.0
## Mean : 35.498 Mean : 1430.03 Mean : 1574.3
## 3rd Qu.: 35.514 3rd Qu.: 1252.80 3rd Qu.: 1731.5
## Max. :2999.040 Max. :180095.53 Max. :242449.1
##
## STOCK_AWARDS_FV STOCK_UNVEST_NUM STOCK_UNVEST_VAL TDC1
## Min. : 0.0 Min. : 0.00 Min. : 0.0 Min. : 0
## 1st Qu.: 200.0 1st Qu.: 3.81 1st Qu.: 140.5 1st Qu.: 1079
## Median : 673.4 Median : 21.33 Median : 788.2 Median : 2031
## Mean : 1550.5 Mean : 63.81 Mean : 2473.9 Mean : 3418
## 3rd Qu.: 1693.5 3rd Qu.: 62.04 3rd Qu.: 2256.2 3rd Qu.: 3946
## Max. :242449.1 Max. :13388.24 Max. :520877.8 Max. :243877
##
## TDC1_PCT TDC2 TDC2_PCT TERM_PYMT
## Min. : -100.000 Min. : 0 Min. : -100.000 Min. : -2485
## 1st Qu.: -10.784 1st Qu.: 1048 1st Qu.: -16.489 1st Qu.: 0
## Median : 4.378 Median : 2028 Median : 5.749 Median : 802
## Mean : 23.199 Mean : 3856 Mean : 33.930 Mean : 3364
## 3rd Qu.: 23.810 3rd Qu.: 4110 3rd Qu.: 36.848 3rd Qu.: 3057
## Max. :20286.228 Max. :243877 Max. :20286.228 Max. :330233
## NA's :1496 NA's :1495
## TITLEANN
## Chief Executive Officer, President and Director : 530
## Chief Financial Officer and Executive Vice President: 468
## Chief Financial Officer and Senior Vice President : 232
## Chief Executive Officer and Director : 183
## Chairman and Chief Executive Officer : 173
## Chief Financial Officer : 168
## (Other) :6546
## TOTAL_ALT1 TOTAL_ALT1_PCT TOTAL_ALT2
## Min. : -5064 Min. : -250.665 Min. : 0.0
## 1st Qu.: 1116 1st Qu.: -10.301 1st Qu.: 906.2
## Median : 2101 Median : 5.499 Median : 1759.8
## Mean : 3557 Mean : 24.560 Mean : 3874.8
## 3rd Qu.: 4105 3rd Qu.: 26.647 3rd Qu.: 3878.2
## Max. :243877 Max. :20286.228 Max. :219593.1
## NA's :1496
## TOTAL_ALT2_PCT TOTAL_CURR TOTAL_CURR_PCT TOTAL_SEC
## Min. :-100.000 Min. : 0.0 Min. :-100.000 Min. : -5064
## 1st Qu.: -17.501 1st Qu.: 372.4 1st Qu.: 0.000 1st Qu.: 1134
## Median : 8.246 Median : 507.5 Median : 2.776 Median : 2117
## Mean : 32.112 Mean : 663.2 Mean : 10.787 Mean : 3570
## 3rd Qu.: 43.425 3rd Qu.: 750.0 3rd Qu.: 7.513 3rd Qu.: 4116
## Max. :3694.950 Max. :35500.0 Max. :3886.688 Max. :243877
## NA's :1495 NA's :1221 NA's :4
## TOTAL_SEC_PCT AT CH
## Min. : -250.66 Min. : 14 Min. : 0.00
## 1st Qu.: -10.12 1st Qu.: 1174 1st Qu.: 59.35
## Median : 5.72 Median : 3639 Median : 183.50
## Mean : 26.55 Mean : 26872 Mean : 1003.40
## 3rd Qu.: 26.43 3rd Qu.: 11988 3rd Qu.: 603.00
## Max. :20286.23 Max. :3287968 Max. :121711.00
## NA's :1219
## DLTT NI PRCH_C
## Min. : 0 Min. :-6177.00 Min. : 0.4999
## 1st Qu.: 169 1st Qu.: 18.03 1st Qu.: 25.6300
## Median : 790 Median : 104.34 Median : 47.0000
## Mean : 7335 Mean : 603.33 Mean : 64.8202
## 3rd Qu.: 2995 3rd Qu.: 420.00 3rd Qu.: 78.9700
## Max. :3226737 Max. :45687.00 Max. :1845.3731
##
## PRCL_C DRatio
## Min. : 0.012 Min. :0.00000
## 1st Qu.: 14.070 1st Qu.:0.07777
## Median : 28.240 Median :0.23549
## Mean : 40.643 Mean :0.26140
## 3rd Qu.: 49.950 3rd Qu.:0.37706
## Max. :1462.020 Max. :3.00015
##
Histogram of AGE
Mostly middle aged
plot_ly(x = ageg,
type = "histogram")
plot_ly(x = ageg,
type = "histogram",
histnorm = "probability")
Histrogram of Gender
lot of Males (93% is Male) Mostly Middle aged Men
plot_ly(x = GENDER,
type = "histogram")
plot_ly(x = GENDER,
type = "histogram",
histnorm = "probability")
Histogram of other Variables
Nothing Significant From these Histogram
plot_ly(x = CH,
type = "histogram") #Total Cash
plot_ly(x = AT,
type = "histogram") #Assets - Total
plot_ly(x = NI,
type = "histogram") #Net Income (Loss)
plot_ly(x = DLTT,
type = "histogram") #Long-Term Debt - Total
plot_ly(x = PRCH_C,
type = "histogram") #Price High - Annual - Calendar
plot_ly(x = PRCL_C,
type = "histogram") #Price Low - Annual - Calendar
plot_ly(x = DRatio,
type = "histogram") #Debt Ratio
plot_ly(x = SALARY,
type = "histogram") #Salary
plot_ly(x = dat$SHRS_VEST_VAL,
type = "histogram") #Value Realized on Vesting ($)
plot_ly(x = BONUS,
type = "histogram") #The dollar value of a bonus earned by the named executive officer during the fiscal year.
plot_ly(x = OTHCOMP,
type = "histogram") #All Other Compensation ($)
plot_ly(x = TDC2,
type = "histogram") #Total Compensation (Salary + Bonus + Other Annual + Restricted Stock Grants + LTIP Payouts + All Other + Value of Options Exercised).
Histogram of other Variables using log to find significance
Normal Distribution
plot_ly(x = log(CH),
type = "histogram")
plot_ly(x = log(AT),
type = "histogram")
plot_ly(x = log(NI),
type = "histogram")
plot_ly(x = log(DLTT),
type = "histogram")
plot_ly(x = log(PRCH_C),
type = "histogram")
plot_ly(x = log(PRCL_C),
type = "histogram")
plot_ly(x = log(SALARY),
type = "histogram")
plot_ly(x = log(dat$SAL_PCT),
type = "histogram")
plot_ly(x = log(BONUS),
type = "histogram")
plot_ly(x = log(OTHCOMP),
type = "histogram")
plot_ly(x = log(TDC2),
type = "histogram")
plot_ly(x = log(DRatio),
type = "histogram")
Relationships between selected variables
High to Low relationships -AT and DLTT, SALARY, BONUS -NI and TDC2, SALARY, BONUS -CH and SALARY, BONUS, TDC2 -DLTT and SALARY -PRCH_C, PRCH_L and TDC2 -SALARY and AGE
par(mfrow=c(1,1))
pairs.panels(dat1)
-No Significant Difference observed on Male only Data probably due to high number of observation
#observe male data only
pairs.panels(datm)
-AT and Bonus has a huge difference between Male .21 and female .60 -DLTT and Bonus has a huge difference between Male .05 and female .56 -TDC2 and CH has a small difference between Male .25 and female .36 -NI and Stock price has difference between Male and female -All other relationships are high compared to Men Probably due to very low observations
Does female Execs has to perform better then men in average? Does firm have better with female Exec?
#observe only female data
pairs.panels(datf)
#Getting Ideas about graphs
dat1 %>%
select("AT", "NI", "CH","TDC2", "SALARY", "BONUS", "AGE", "GENDER") %>%
ggpairs(dat1)
Compbining Datset
Added age group to my data
hist(dat$AGE)
group_by(dat1, AGE)
summarize(group_by(dat1, AGE), aSalary=mean(SALARY))
Probability Stats
Added age group to my data Review Probability
#Review Probability
htmlTable(as.data.frame(round(prop.table(xtabs(~ GENDER + ageg,
data = dat1 %>%
mutate(AGE = AGE) %>%
mutate(ageg = cut(AGE, 6))), 2), 2)),
caption = "Frequency of Price Difference",
header = paste(c("Brand", "Price Difference", "† Frequency")),
tfoot="† Data from Midterm Project")
Frequency of Price Difference | |||
Brand | Price Difference | †Frequency | |
---|---|---|---|
1 | FEMALE | (28.9,40.2] | 0.04 |
2 | MALE | (28.9,40.2] | 0.96 |
3 | FEMALE | (40.2,51.3] | 0.06 |
4 | MALE | (40.2,51.3] | 0.94 |
5 | FEMALE | (51.3,62.5] | 0.07 |
6 | MALE | (51.3,62.5] | 0.93 |
7 | FEMALE | (62.5,73.7] | 0.05 |
8 | MALE | (62.5,73.7] | 0.95 |
9 | FEMALE | (73.7,84.8] | 0.01 |
10 | MALE | (73.7,84.8] | 0.99 |
11 | FEMALE | (84.8,96.1] | 0 |
12 | MALE | (84.8,96.1] | 1 |
†Data from Midterm Project |
##added new variable ageg (AGE GROUP Based on this)
Female low probability
(d <- as.data.frame(prop.table(xtabs(~ GENDER + ageg,
data = dat1 %>%
mutate (ageg = cut(AGE, 6)) %>% # step 2
mutate (ageg = recode(ageg, "(0,30)" = "30 and under",
"(31,45)" = "31-45",
"(46,65)" = "46-65",
"(66,100)" = "65+"))), 2)) %>%
# 3 and 4. Visualize conditional probability of brand by price difference type
ggplot(aes(x = GENDER, y = Freq, fill = ageg)) +
geom_bar(stat="identity", position = "dodge", color = "black") +
geom_text(aes(label = round(Freq,2)), vjust=1.5, color = "white",
position = position_dodge(0.9), size = 4)+
ggtitle("Women are less likely to be Executives", sub = "Probabbility Grouped by Age")
)
EDA Graphs
Middle age Men has highest number Bonus and Salary
ggplot(dat, aes(x = AGE, y = BONUS, color = GENDER)) +
geom_point() +
ggtitle("Middle age men has highest number of bonus with outliers", sub = "Small number of Female observations")
Over all Observation
grid.arrange(
ggplot(dat, aes(x = AGE, y = SALARY, color = GENDER)) +
geom_point() +
stat_density2d() +
ggtitle("Mostly Middle aged Men", sub = "Small number of Female observations"),
ggplot(dat, aes(x = AGE, y = SALARY, color = GENDER)) +
geom_point() +
stat_density2d(aes(fill =..density..), geom = "raster", contour = FALSE) +
ggtitle("Mostly Middle aged Men", sub = "Heat map, light color indicates more observations"),
ncol = 2
)
grid.arrange(
dat %>%
group_by(GENDER) %>%
summarise(age_mean = mean(AGE)) %>%
ggplot(aes(x = GENDER, y = age_mean, fill = GENDER)) +
geom_bar(stat="identity") +
coord_flip(),
dat %>%
ggplot(aes(x = GENDER, y = AGE, fill = GENDER)) +
geom_boxplot() +
coord_flip() +
guides(fill = FALSE, ylab = FALSE),
ncol = 2
)
Female Compensation in average
As female age they get higher compensation
datf %>%
group_by(ageg) %>%
summarise(Avg_Com = mean(TDC2)) %>%
ggplot(aes(x = ageg, y = Avg_Com, )) +
geom_bar(stat="identity", position = "dodge", color = "black") +
labs(fill = "Gender") +
# Add and improve titles and labels
ggtitle("In Average as female age there Compensation gets Larger") +
ylab("Average Compensation by Thousands") +
xlab("Age Group") +
scale_x_discrete(labels=c("Under 40", "40 to 51", "51 to 62", "62 to 73", "73 to 84", "84 +", "NA")) +
theme_classic() +
theme(axis.ticks.x = element_blank(),
axis.ticks.y = element_blank(),)
datm %>%
group_by(ageg) %>%
summarise(Avg_Com = mean(TDC2)) %>%
ggplot(aes(x = ageg, y = Avg_Com, )) +
geom_bar(stat="identity", position = "dodge", color = "black") +
labs(fill = "Gender") +
# Add and improve titles and labels
ggtitle("In Average as female age there Compensation gets Larger") +
ylab("Average Compensation by Thousands") +
xlab("Age Group") +
scale_x_discrete(labels=c("Under 40", "40 to 51", "51 to 62", "62 to 73", "73 to 84", "84 +", "NA")) +
theme_classic() +
theme(axis.ticks.x = element_blank(),
axis.ticks.y = element_blank(),)
dat1 %>%
group_by( GENDER ) %>%
summarise( count = n(),
proportion = sum(count) / nrow(dat1),
percent = proportion * 100)
dat1 %>%
group_by( AGE ) %>%
ggplot() +
geom_bar(aes(x = GENDER)) +
ggtitle("Middle Age")
Average Compensation by Gender
Lets compare Total Compensation side by side
- We can see that:
- There are no Female Executive over 84
- Female age group has a big compensation jump from 62-73 to 73-84
# Average profit by region and category
dat1 %>%
group_by(ageg, GENDER) %>%
summarise(Avg_Com = mean(TDC2)) %>%
ggplot(aes(x = ageg, y = Avg_Com, fill = reorder(GENDER, desc(Avg_Com)))) +
geom_bar(stat="identity", position = "dodge", color = "black") +
geom_hline(yintercept = 0, color = "black") +
labs(fill = "Gender") +
# Add and improve titles and labels
ggtitle("Middle age female age groups has a big compensation jump") +
ylab("Average Compensation by Thousands") +
xlab("Age Group") +
scale_x_discrete(labels=c("Under 40", "40 to 51", "51 to 62", "62 to 73", "73 to 84", "84 +", "NA")) +
theme_classic() +
theme(axis.ticks.x = element_blank(),
axis.ticks.y = element_blank(),)
Multiple Regression Analysis
Testing Different Models for Significance Analysis
- Most statistically significant
- Have the largest impact
Visualize which variables have coefficients statistically different from zero
attach(dat1)
# ANOVA
baseMod <- lm(TDC2 ~ AGE + BONUS + AT + CH + DLTT + NI + PRCH_C + PRCL_C)
mod1 <- lm(TDC2 ~ AGE + BONUS + AT + CH + DLTT + NI)
mod2 <- lm(TDC2 ~ AGE + BONUS + AT + CH + DLTT)
mod3 <- lm(TDC2 ~ AGE + BONUS + AT + CH)
mod4 <- lm(TDC2 ~ AGE + BONUS + AT)
anova(baseMod, mod1, mod2, mod3, mod4)
summary(baseMod)
##
## Call:
## lm(formula = TDC2 ~ AGE + BONUS + AT + CH + DLTT + NI + PRCH_C +
## PRCL_C)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30887 -2136 -1242 265 229687
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.750e+03 5.491e+02 -3.188 0.00144 **
## AGE 7.941e+01 9.956e+00 7.975 1.72e-15 ***
## BONUS 1.970e+00 1.216e-01 16.206 < 2e-16 ***
## AT -3.835e-03 8.718e-04 -4.400 1.10e-05 ***
## CH 1.714e-01 2.236e-02 7.665 2.00e-14 ***
## DLTT -2.388e-04 1.187e-03 -0.201 0.84048
## NI 7.366e-01 3.893e-02 18.920 < 2e-16 ***
## PRCH_C 7.994e+00 4.572e+00 1.748 0.08042 .
## PRCL_C 1.784e+00 6.457e+00 0.276 0.78228
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6640 on 8231 degrees of freedom
## (60 observations deleted due to missingness)
## Multiple R-squared: 0.1555, Adjusted R-squared: 0.1547
## F-statistic: 189.4 on 8 and 8231 DF, p-value: < 2.2e-16
attach(datm)
# ANOVA
baseMod <- lm(TDC2 ~ AGE + BONUS + AT + CH + DLTT + NI + PRCH_C + PRCL_C)
mod1 <- lm(TDC2 ~ AGE + BONUS + AT + CH + DLTT + NI)
mod2 <- lm(TDC2 ~ AGE + BONUS + AT + CH + DLTT)
mod3 <- lm(TDC2 ~ AGE + BONUS + AT + CH)
mod4 <- lm(TDC2 ~ AGE + BONUS + AT)
anova(baseMod, mod1, mod2, mod3, mod4)
summary(baseMod)
##
## Call:
## lm(formula = TDC2 ~ AGE + BONUS + AT + CH + DLTT + NI + PRCH_C +
## PRCL_C)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30782 -2149 -1234 277 229688
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.936e+03 5.501e+02 -3.519 0.000435 ***
## AGE 8.297e+01 9.968e+00 8.324 < 2e-16 ***
## BONUS 1.926e+00 1.206e-01 15.964 < 2e-16 ***
## AT -2.836e-03 9.237e-04 -3.070 0.002148 **
## CH 1.523e-01 2.223e-02 6.851 7.91e-12 ***
## DLTT -1.089e-03 1.205e-03 -0.904 0.366064
## NI 7.369e-01 4.053e-02 18.179 < 2e-16 ***
## PRCH_C 9.298e+00 4.584e+00 2.028 0.042584 *
## PRCL_C 9.598e-02 6.479e+00 0.015 0.988181
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6497 on 7723 degrees of freedom
## (60 observations deleted due to missingness)
## Multiple R-squared: 0.1563, Adjusted R-squared: 0.1554
## F-statistic: 178.8 on 8 and 7723 DF, p-value: < 2.2e-16
attach(datf)
# ANOVA
baseMod <- lm(TDC2 ~ AGE + BONUS + AT + CH + DLTT + NI + PRCH_C + PRCL_C)
mod1 <- lm(TDC2 ~ AGE + BONUS + AT + CH + DLTT + NI)
mod2 <- lm(TDC2 ~ AGE + BONUS + AT + CH + DLTT)
mod3 <- lm(TDC2 ~ AGE + BONUS + AT + CH)
mod4 <- lm(TDC2 ~ AGE + BONUS + AT)
anova(baseMod, mod1, mod2, mod3, mod4)
summary(baseMod)
##
## Call:
## lm(formula = TDC2 ~ AGE + BONUS + AT + CH + DLTT + NI + PRCH_C +
## PRCL_C)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29760 -1698 -1022 183 156374
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.254e+03 3.230e+03 1.008 0.314
## AGE -2.057e+01 5.946e+01 -0.346 0.730
## BONUS 4.741e+00 9.649e-01 4.913 1.22e-06 ***
## AT -5.897e-03 9.393e-03 -0.628 0.530
## CH 1.410e+00 2.475e-01 5.696 2.10e-08 ***
## DLTT -1.210e-01 8.732e-02 -1.385 0.167
## NI 3.765e-01 1.925e-01 1.956 0.051 .
## PRCH_C -1.083e+01 2.667e+01 -0.406 0.685
## PRCL_C 2.003e+01 3.716e+01 0.539 0.590
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8277 on 499 degrees of freedom
## Multiple R-squared: 0.2103, Adjusted R-squared: 0.1976
## F-statistic: 16.61 on 8 and 499 DF, p-value: < 2.2e-16
Visualization of Data Regression
Age Groups Has most Confident Interval Variance
attach(dat1)
mod <- lm(TDC2 ~ AGE + BONUS + AT + CH + DLTT + NI + PRCH_C + PRCL_C+ OTHCOMP+ ageg)
coe <- summary(mod)$coefficients # get coefficients and related stats
coe_CI <- as.data.frame(cbind(coe[-1, ], confint(mod)[-1, ])) # find and bind CI, remove Int
# Rename results data frame
names(coe_CI) <- c("estimate", "se", "t", "pval","low_CI","high_CI")
htmlTable(round(coe_CI[order(coe_CI$pval, decreasing=FALSE),],3))
estimate | se | t | pval | low_CI | high_CI | |
---|---|---|---|---|---|---|
OTHCOMP | 1.666 | 0.071 | 23.307 | 0 | 1.526 | 1.806 |
NI | 0.698 | 0.038 | 18.474 | 0 | 0.624 | 0.772 |
BONUS | 1.631 | 0.119 | 13.752 | 0 | 1.398 | 1.863 |
CH | 0.157 | 0.022 | 7.254 | 0 | 0.115 | 0.2 |
AT | -0.003 | 0.001 | -3.594 | 0 | -0.005 | -0.001 |
PRCH_C | 9.562 | 4.427 | 2.16 | 0.031 | 0.884 | 18.239 |
AGE | 42.328 | 24.645 | 1.718 | 0.086 | -5.981 | 90.638 |
ageg(62.5,73.7] | 1375.288 | 851.987 | 1.614 | 0.107 | -294.821 | 3045.398 |
ageg(51.3,62.5] | 969.318 | 650.804 | 1.489 | 0.136 | -306.421 | 2245.057 |
ageg(84.8,96.1] | -2276.753 | 1968.756 | -1.156 | 0.248 | -6136.013 | 1582.506 |
ageg(40.2,51.3] | 514.775 | 525.218 | 0.98 | 0.327 | -514.785 | 1544.334 |
DLTT | -0.001 | 0.001 | -0.709 | 0.478 | -0.003 | 0.001 |
ageg(73.7,84.8] | 80.865 | 1308.199 | 0.062 | 0.951 | -2483.535 | 2645.266 |
PRCL_C | -0.259 | 6.251 | -0.041 | 0.967 | -12.513 | 11.994 |
(g1 <- ggplot(coe_CI, aes(x = estimate, y = reorder(row.names(coe_CI),desc(pval)))) +
geom_point(size = 3) +
xlim(min(coe_CI$low_CI), max(coe_CI$high_CI)) +
ylab("Variable") +
xlab("Coefficient") +
theme_bw() +
ggtitle("ALL Data: Age group has different confident intervals") +
geom_segment(aes(yend = reorder(row.names(coe_CI),desc(pval))),
xend = coe_CI$high_CI) +
geom_segment(aes(yend = reorder(row.names(coe_CI),desc(coe_CI$pval))),
xend = coe_CI$low_CI) +
xlab("Coefficient with Confidence Interval") +
geom_vline(xintercept = 0, color = "red")
)
MALE ONLY
attach(datm) #Only Male Data
mod <- lm(TDC2 ~ AGE + BONUS + AT + CH + DLTT + NI + PRCH_C + PRCL_C+ OTHCOMP+ ageg)
coe <- summary(mod)$coefficients # get coefficients and related stats
coe_CI <- as.data.frame(cbind(coe[-1, ], confint(mod)[-1, ])) # find and bind CI, remove Int
# Rename results data frame
names(coe_CI) <- c("estimate", "se", "t", "pval","low_CI","high_CI")
htmlTable(round(coe_CI[order(coe_CI$pval, decreasing=FALSE),],3))
estimate | se | t | pval | low_CI | high_CI | |
---|---|---|---|---|---|---|
OTHCOMP | 1.625 | 0.072 | 22.691 | 0 | 1.485 | 1.766 |
NI | 0.703 | 0.039 | 17.902 | 0 | 0.626 | 0.78 |
BONUS | 1.583 | 0.118 | 13.46 | 0 | 1.353 | 1.814 |
CH | 0.141 | 0.022 | 6.553 | 0 | 0.099 | 0.183 |
AT | -0.002 | 0.001 | -2.485 | 0.013 | -0.004 | 0 |
PRCH_C | 10.633 | 4.436 | 2.397 | 0.017 | 1.938 | 19.328 |
ageg(62.5,73.7] | 1513.737 | 857.505 | 1.765 | 0.078 | -167.207 | 3194.68 |
AGE | 40.677 | 24.844 | 1.637 | 0.102 | -8.024 | 89.378 |
ageg(51.3,62.5] | 1014.902 | 653.229 | 1.554 | 0.12 | -265.604 | 2295.408 |
DLTT | -0.002 | 0.001 | -1.299 | 0.194 | -0.004 | 0.001 |
ageg(84.8,96.1] | -2226.189 | 1952.745 | -1.14 | 0.254 | -6054.1 | 1601.722 |
ageg(40.2,51.3] | 464.158 | 525.652 | 0.883 | 0.377 | -566.262 | 1494.578 |
PRCL_C | -1.665 | 6.268 | -0.266 | 0.791 | -13.953 | 10.623 |
ageg(73.7,84.8] | 200.288 | 1306.506 | 0.153 | 0.878 | -2360.819 | 2761.395 |
(g1 <- ggplot(coe_CI, aes(x = estimate, y = reorder(row.names(coe_CI),desc(pval)))) +
geom_point(size = 3) +
xlim(min(coe_CI$low_CI), max(coe_CI$high_CI)) +
ylab("Variable") +
xlab("Coefficient") +
theme_bw() +
ggtitle("Male only: Age group has different confident intervals") +
geom_segment(aes(yend = reorder(row.names(coe_CI),desc(pval))),
xend = coe_CI$high_CI) +
geom_segment(aes(yend = reorder(row.names(coe_CI),desc(coe_CI$pval))),
xend = coe_CI$low_CI) +
xlab("Coefficient with Confidence Interval") +
geom_vline(xintercept = 0, color = "red")
)
FEMALE ONLY: USING LOG
attach(datf) #Only Female Data
mod <- lm(log(TDC2) ~ AGE + BONUS + AT + CH + DLTT + NI + PRCH_C + PRCL_C+ OTHCOMP+ ageg)
coe <- summary(mod)$coefficients # get coefficients and related stats
coe_CI <- as.data.frame(cbind(coe[-1, ], confint(mod)[-1, ])) # find and bind CI, remove Int
# Rename results data frame
names(coe_CI) <- c("estimate", "se", "t", "pval","low_CI","high_CI")
htmlTable(round(coe_CI[order(coe_CI$pval, decreasing=FALSE),],3))
estimate | se | t | pval | low_CI | high_CI | |
---|---|---|---|---|---|---|
AT | 0 | 0 | -5.408 | 0 | 0 | 0 |
CH | 0 | 0 | 5.063 | 0 | 0 | 0 |
OTHCOMP | 0 | 0 | 4.979 | 0 | 0 | 0 |
DLTT | 0 | 0 | 4.61 | 0 | 0 | 0 |
BONUS | 0 | 0 | 4.372 | 0 | 0 | 0.001 |
AGE | 0.027 | 0.014 | 1.942 | 0.053 | 0 | 0.054 |
NI | 0 | 0 | -1.478 | 0.14 | 0 | 0 |
PRCL_C | 0.005 | 0.004 | 1.373 | 0.17 | -0.002 | 0.013 |
ageg(40.2,51.3] | 0.334 | 0.336 | 0.994 | 0.321 | -0.326 | 0.995 |
ageg(73.7,84.8] | -0.776 | 1.125 | -0.69 | 0.491 | -2.987 | 1.434 |
PRCH_C | -0.002 | 0.003 | -0.595 | 0.552 | -0.007 | 0.004 |
ageg(51.3,62.5] | 0.18 | 0.396 | 0.453 | 0.651 | -0.599 | 0.959 |
ageg(62.5,73.7] | -0.153 | 0.492 | -0.311 | 0.756 | -1.121 | 0.814 |
(g1 <- ggplot(coe_CI, aes(x = estimate, y = reorder(row.names(coe_CI),desc(pval)))) +
geom_point(size = 3) +
xlim(min(coe_CI$low_CI), max(coe_CI$high_CI)) +
ylab("Variable") +
xlab("Coefficient") +
theme_bw() +
ggtitle("Female only: Age group has different confident intervals") +
geom_segment(aes(yend = reorder(row.names(coe_CI),desc(pval))),
xend = coe_CI$high_CI) +
geom_segment(aes(yend = reorder(row.names(coe_CI),desc(coe_CI$pval))),
xend = coe_CI$low_CI) +
xlab("Coefficient with Confidence Interval") +
geom_vline(xintercept = 0, color = "red")
)
Chow test
#== Chow test
chow.test <- local({
m <- 2 # number of restrictions (delta_0 = delta_1 = 0)
n <- dim(dat1)[1]
full.1 <- lm( TDC2 ~ AGE, data=subset(dat, GENDER == "FEMALE")) # female regression
full.2 <- lm( TDC2 ~ AGE, data=subset(dat, GENDER == "MALE")) # male regression
full <- lm(TDC2 ~ AGE + GENDER + GENDER*AGE, data=dat) # full model using dumies
rest <- lm(TDC2 ~ AGE + AGE, data=dat) # restricted model
SSE.1 <- sum( resid(full.1)^2 )
SSE.2 <- sum( resid(full.2)^2 )
SSE.full <- sum( resid(full)^2 )
all.equal(SSE.full, SSE.1 + SSE.2)
SSE.rm <- sum( resid(rest)^2 )
chow <- (( SSE.rm - (SSE.1 + SSE.2)) / m ) / ( (SSE.1 + SSE.2) / (n - 2*m) )
pval.chow <- 1 - pf(chow, m, n-2*m)
out <- list(SSE.1=SSE.1, SSE.2=SSE.2, SSE.rm=SSE.rm, chow=chow, pval.chow=pval.chow)
return(out)
})
sapply(chow.test, cbind)
## SSE.1 SSE.2 SSE.rm chow pval.chow
## 4.325947e+10 3.826649e+11 4.259930e+11 6.685080e-01 5.125002e-01