Code header

# Course: BUAN 5210
# Title: Final Project EDA
# Purpose: Intial EDA for the Final Project
# Date: Dec 1st, 2017
# Author: Afsar Ali
# Clear working environment
rm(list=ls(all=TRUE)) 

# The tidyverse package contains ggplot2, tibble, tidyr, readr, purr, and dplyr among others
library(tidyverse)
# The gridExtra package contains grid.arrange function used to combine plots
library(gridExtra)
# The GGally package contains ggpairs which is a custom correlation graph for ggplot2
library(GGally)
# Load gridExtra so can plot more than one graph with grid.arrange
library(gridExtra)
#load other packages
library(plotly)
library(psych)
library(knitr)
library(htmlTable)
#Load and Review data
dat <- read.csv("ECON5100_project_data.csv", header = TRUE)
dat <- mutate(dat, DRatio = (DLTT/AT)) #Calculating long term debt ratio

#Make a subset of relevent data
dat1 <- select(dat, "AT","NI", "CH", "DLTT", "PRCH_C", "PRCL_C", "DRatio", "TDC2", "SALARY", "BONUS", "OTHCOMP", "AGE", "GENDER")
dat1<-  mutate (dat1, ageg = cut(AGE, 6)) #grouping age by 6
datm <- dat1 %>%  filter(GENDER =="MALE") #nale only 
datf <- dat1 %>%  filter(GENDER =="FEMALE") #female only

#attach the file for use
attach(dat1)

glimpse(dat)
## Observations: 8,300
## Variables: 99
## $ GVKEY                   <int> 21542, 21542, 21542, 21542, 21542, 215...
## $ CO_PER_ROL              <int> 38455, 38456, 38457, 38459, 52715, 547...
## $ YEAR                    <int> 2016, 2016, 2016, 2016, 2016, 2016, 20...
## $ TICKER                  <fct> AAON, AAON, AAON, AAON, AAON, AAON, AB...
## $ CUSIP                   <fct> 00036020, 00036020, 00036020, 00036020...
## $ EXECID                  <int> 36415, 36416, 36417, 36419, 49230, 510...
## $ EXEC_FNAME              <fct> Norman, Robert, Kathy, Scott, Sam, Gar...
## $ EXEC_FULLNAME           <fct> Norman H. Asbjornson, Robert G. Fergus...
## $ EXEC_LNAME              <fct> Asbjornson, Fergus, Sheffield, Asbjorn...
## $ EXEC_MNAME              <fct> H., G., I., M., J., D., P., Hlavinka, ...
## $ GENDER                  <fct> MALE, MALE, FEMALE, MALE, MALE, MALE, ...
## $ NAMEPREFIX              <fct> Mr., Mr., Ms., Mr., Mr., Mr., Mr., Ms....
## $ PAGE                    <int> 80, 75, 63, 47, 39, 56, 59, 52, 54, 44...
## $ BECAMECEO               <fct> 1/1/1989, , , , , , , , 3/31/2015, , ,...
## $ EXECRANK                <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ JOINED_CO               <fct> , , , , , , , , , , , 1/1/1973, , 3/1/...
## $ LEFTCO                  <fct> , , , , , , , , , , , , 5/7/2001, , , ...
## $ LEFTOFC                 <fct> , , , , , , , , , , , , 5/7/2001, , , ...
## $ PCEO                    <fct> CEO, , , , , , , , CEO, , , CEO, , , ,...
## $ PCFO                    <fct> , , , CFO, , , , , , CFO, , , , , , CF...
## $ REASON                  <fct> , , , , , , , , , , , , RETIRED, , , ,...
## $ REJOIN                  <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ RELEFT                  <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ TITLE                   <fct> Chairman of the Board and Chief Execut...
## $ AGE                     <int> 80, 75, 63, 47, 39, 56, 59, 52, 54, 44...
## $ ALLOTHPD                <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ ALLOTHTOT               <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ BONUS                   <dbl> -8.715, 6.192, 0.000, 0.000, 6.436, 0....
## $ CEOANN                  <fct> CEO, , , , , , , , CEO, , , CEO, , , ,...
## $ CFOANN                  <fct> , , , CFO, , , , , , CFO, , , , , , CF...
## $ CHG_CTRL_PYMT           <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.0...
## $ COMMENT                 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ DEFER_BALANCE_TOT       <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.0...
## $ DEFER_CONTRIB_CO_TOT    <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.0...
## $ DEFER_CONTRIB_EXEC_TOT  <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.0...
## $ DEFER_EARNINGS_TOT      <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.0...
## $ DEFER_RPT_AS_COMP_TOT   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ DEFER_WITHDR_TOT        <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.0...
## $ EIP_UNEARN_NUM          <dbl> 102.719, 11.510, 16.514, 16.505, 12.32...
## $ EIP_UNEARN_VAL          <dbl> 3394.862, 380.405, 545.787, 545.490, 4...
## $ EXECDIR                 <int> 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1,...
## $ EXECRANKANN             <int> 1, 5, 2, 3, 4, 6, 2, 4, 1, 5, 3, 1, 4,...
## $ INTERLOCK               <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ LTIP                    <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ NONEQ_INCENT            <dbl> 359.521, 68.797, 85.898, 85.891, 71.51...
## $ OLD_DATAFMT_FLAG        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ OPTION_AWARDS           <dbl> 320.497, 30.047, 65.167, 65.167, 41.36...
## $ OPTION_AWARDS_BLK_VALUE <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ OPTION_AWARDS_FV        <dbl> 733.690, 68.668, 113.368, 113.368, 81....
## $ OPTION_AWARDS_NUM       <dbl> 73.310, 6.930, 11.485, 11.485, 8.240, ...
## $ OPTION_AWARDS_RPT_VALUE <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ OPT_EXER_NUM            <dbl> 0.000, 0.000, 22.000, 0.000, 12.050, 0...
## $ OPT_EXER_VAL            <dbl> 0.000, 0.000, 522.728, 0.000, 214.078,...
## $ OPT_UNEX_EXER_EST_VAL   <dbl> 72.655, 272.082, 1475.336, 120.486, 0....
## $ OPT_UNEX_EXER_NUM       <dbl> 7.664, 11.570, 57.711, 5.461, 0.000, 0...
## $ OPT_UNEX_UNEXER_EST_VAL <dbl> 715.048, 133.702, 225.357, 225.357, 19...
## $ OPT_UNEX_UNEXER_NUM     <dbl> 65.646, 8.860, 15.024, 15.024, 11.940,...
## $ OTHANN                  <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ OTHCOMP                 <dbl> 92.977, 24.150, 22.473, 32.135, 25.951...
## $ PENSION_CHG             <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.0...
## $ PENSION_PYMTS_TOT       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ PENSION_VALUE_TOT       <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.0...
## $ REPRICE                 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ RET_YRS                 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ RSTKGRNT                <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ RSTKVYRS                <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ SALARY                  <dbl> 480.960, 197.220, 246.240, 246.220, 20...
## $ SAL_PCT                 <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, NA,...
## $ SHROWN_EXCL_OPTS        <dbl> 9753.256, 19.725, 13.644, 1386.240, 10...
## $ SHROWN_EXCL_OPTS_PCT    <dbl> 18.528, 0.037, 0.026, 2.633, 0.020, 0....
## $ SHROWN_TOT              <dbl> 9775.582, 35.381, 58.152, 1393.998, 15...
## $ SHROWN_TOT_PCT          <dbl> 18.570, 0.067, 0.110, 2.648, 0.029, 0....
## $ SHRS_VEST_NUM           <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.0...
## $ SHRS_VEST_VAL           <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.0...
## $ STOCK_AWARDS            <dbl> 782.817, 60.135, 130.609, 130.609, 82....
## $ STOCK_AWARDS_FV         <dbl> 2833.765, 293.901, 413.851, 413.557, 3...
## $ STOCK_UNVEST_NUM        <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.0...
## $ STOCK_UNVEST_VAL        <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.0...
## $ TDC1                    <dbl> 4492.198, 658.928, 881.830, 891.171, 6...
## $ TDC1_PCT                <dbl> 28.146, 13.513, 24.696, 25.870, 20.450...
## $ TDC2                    <dbl> 3758.508, 590.260, 1291.190, 777.803, ...
## $ TDC2_PCT                <dbl> 21.543, 8.931, 95.937, 17.883, 54.013,...
## $ TERM_PYMT               <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.0...
## $ TITLEANN                <fct> Chairman of the Board and Chief Execut...
## $ TOTAL_ALT1              <dbl> 4492.198, 658.928, 881.830, 891.171, 6...
## $ TOTAL_ALT1_PCT          <dbl> 28.146, 13.513, 24.696, 25.870, 20.450...
## $ TOTAL_ALT2              <dbl> 924.743, 296.359, 877.339, 364.246, 52...
## $ TOTAL_ALT2_PCT          <dbl> -4.885, -3.810, 133.496, -67.252, -74....
## $ TOTAL_CURR              <dbl> 472.245, 203.412, 246.240, 246.220, 21...
## $ TOTAL_CURR_PCT          <dbl> -11.236, -1.730, -4.721, -4.721, -1.72...
## $ TOTAL_SEC               <dbl> 2028.057, 386.541, 550.387, 560.022, 4...
## $ TOTAL_SEC_PCT           <dbl> -14.579, -9.653, 4.799, 6.407, -1.359,...
## $ AT                      <dbl> 256.530, 256.530, 256.530, 256.530, 25...
## $ CH                      <dbl> 24.153, 24.153, 24.153, 24.153, 24.153...
## $ DLTT                    <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 268.3, 2...
## $ NI                      <dbl> 53.376, 53.376, 53.376, 53.376, 53.376...
## $ PRCH_C                  <dbl> 33.9000, 33.9000, 33.9000, 33.9000, 33...
## $ PRCL_C                  <dbl> 19.06, 19.06, 19.06, 19.06, 19.06, 19....
## $ DRatio                  <dbl> 0.00000000, 0.00000000, 0.00000000, 0....
summary(dat)
##      GVKEY          CO_PER_ROL         YEAR          TICKER    
##  Min.   :  1045   Min.   :   79   Min.   :2016   ETR    :  10  
##  1st Qu.:  8444   1st Qu.:37770   1st Qu.:2016   ENDP   :   9  
##  Median : 23083   Median :48168   Median :2016   WLTW   :   9  
##  Mean   : 54156   Mean   :44293   Mean   :2016   AAP    :   8  
##  3rd Qu.: 66446   3rd Qu.:52516   3rd Qu.:2016   AIG    :   8  
##  Max.   :316056   Max.   :55541   Max.   :2016   ARNC   :   8  
##                                                  (Other):8248  
##       CUSIP          EXECID        EXEC_FNAME  
##  29364G10:  10   Min.   :   13   John   : 364  
##  G3040110:   9   1st Qu.:32995   Michael: 340  
##  G9662910:   9   Median :43632   David  : 295  
##  00508X20:   8   Mean   :39663   Robert : 263  
##  00751Y10:   8   3rd Qu.:48445   James  : 244  
##  02687478:   8   Max.   :51697   Thomas : 205  
##  (Other) :8248                   (Other):6589  
##                 EXEC_FULLNAME     EXEC_LNAME     EXEC_MNAME  
##  Ajay J. Sabherwal     :   2   Smith   :  59          :1113  
##  Alan B. Miller        :   2   Johnson :  31   J.     : 824  
##  Amin J. Khoury        :   2   Miller  :  31   A.     : 682  
##  Barry Diller          :   2   Jones   :  24   M.     : 556  
##  Barry M. Smith        :   2   Brown   :  21   L.     : 412  
##  Charles A. Mathis, CPA:   2   Williams:  20   D.     : 373  
##  (Other)               :8288   (Other) :8114   (Other):4340  
##     GENDER       NAMEPREFIX        PAGE          BECAMECEO   
##  FEMALE: 508   Mr.    :7247   Min.   :29.00           :6373  
##  MALE  :7792   Ms.    : 817   1st Qu.:49.00   1/1/2013:  29  
##                Dr.    : 222   Median :54.00   1/1/2015:  24  
##                       :   7   Mean   :54.23   1/1/2017:  24  
##                Pro    :   2   3rd Qu.:59.00   1/1/2016:  21  
##                Cap    :   1   Max.   :96.00   1/1/2012:  20  
##                (Other):   4   NA's   :60      (Other) :1809  
##     EXECRANK        JOINED_CO          LEFTCO           LEFTOFC    
##  Min.   :1.000           :7537            :8276             :7846  
##  1st Qu.:3.000   1/1/1986:  15   1/1/1999 :   1   12/31/2016:  20  
##  Median :5.000   1/1/1980:  14   1/1/2000 :   1   3/1/2017  :  11  
##  Mean   :4.356   1/1/1992:  13   1/1/2006 :   1   1/1/2017  :   8  
##  3rd Qu.:5.000   1/1/1995:  13   1/1/2007 :   1   10/1/2016 :   7  
##  Max.   :9.000   1/1/1989:  12   1/25/2005:   1   12/31/2015:   6  
##  NA's   :7189    (Other) : 696   (Other)  :  19   (Other)   : 402  
##   PCEO       PCFO           REASON      REJOIN         RELEFT       
##     :6713      :6721           :8277   Mode:logical   Mode:logical  
##  CEO:1587   CFO:1579   RESIGNED:  10   NA's:8300      NA's:8300     
##                        RETIRED :   8                                
##                        UNKNOWN :   5                                
##                                                                     
##                                                                     
##                                                                     
##                                                   TITLE     
##  Chief Executive Officer, President and Director     : 530  
##  Chief Financial Officer and Executive Vice President: 468  
##  Chief Financial Officer and Senior Vice President   : 232  
##  Chief Executive Officer and Director                : 183  
##  Chairman and Chief Executive Officer                : 173  
##  Chief Financial Officer                             : 168  
##  (Other)                                             :6546  
##       AGE        ALLOTHPD       ALLOTHTOT          BONUS         
##  Min.   :29.00   Mode:logical   Mode:logical   Min.   :   -8.71  
##  1st Qu.:49.00   NA's:8300      NA's:8300      1st Qu.:    0.00  
##  Median :54.00                                 Median :    0.00  
##  Mean   :54.23                                 Mean   :  107.53  
##  3rd Qu.:59.00                                 3rd Qu.:    0.00  
##  Max.   :96.00                                 Max.   :32000.00  
##  NA's   :60                                                      
##  CEOANN     CFOANN     CHG_CTRL_PYMT      COMMENT       
##     :6713      :6721   Min.   :     0.0   Mode:logical  
##  CEO:1587   CFO:1579   1st Qu.:   969.4   NA's:8300     
##                        Median :  3374.2                 
##                        Mean   :  6800.4                 
##                        3rd Qu.:  8090.6                 
##                        Max.   :520877.8                 
##                                                         
##  DEFER_BALANCE_TOT  DEFER_CONTRIB_CO_TOT DEFER_CONTRIB_EXEC_TOT
##  Min.   :     0.0   Min.   :   -5.691    Min.   :    0.00      
##  1st Qu.:     0.0   1st Qu.:    0.000    1st Qu.:    0.00      
##  Median :     0.0   Median :    0.000    Median :    0.00      
##  Mean   :  1167.8   Mean   :   42.137    Mean   :   99.95      
##  3rd Qu.:   519.1   3rd Qu.:   19.468    3rd Qu.:   17.80      
##  Max.   :232879.1   Max.   :13405.253    Max.   :35028.01      
##                                                                
##  DEFER_EARNINGS_TOT  DEFER_RPT_AS_COMP_TOT DEFER_WITHDR_TOT  
##  Min.   :-19442.88   Min.   :   0.000      Min.   :    0.00  
##  1st Qu.:     0.00   1st Qu.:   0.000      1st Qu.:    0.00  
##  Median :     0.00   Median :   0.000      Median :    0.00  
##  Mean   :   107.14   Mean   :   2.313      Mean   :   67.45  
##  3rd Qu.:    28.42   3rd Qu.:   0.000      3rd Qu.:    0.00  
##  Max.   : 39213.68   Max.   :1234.283      Max.   :71144.81  
##                                                              
##  EIP_UNEARN_NUM    EIP_UNEARN_VAL        EXECDIR        EXECRANKANN    
##  Min.   :   0.00   Min.   :     0.0   Min.   :0.0000   Min.   : 1.000  
##  1st Qu.:   0.00   1st Qu.:     0.0   1st Qu.:0.0000   1st Qu.: 2.000  
##  Median :  11.97   Median :   445.9   Median :0.0000   Median : 3.000  
##  Mean   :  56.08   Mean   :  2112.4   Mean   :0.2563   Mean   : 3.184  
##  3rd Qu.:  49.72   3rd Qu.:  2024.7   3rd Qu.:1.0000   3rd Qu.: 4.000  
##  Max.   :9240.00   Max.   :157794.0   Max.   :1.0000   Max.   :10.000  
##                                                        NA's   :49      
##    INTERLOCK   LTIP          NONEQ_INCENT      OLD_DATAFMT_FLAG
##  Min.   :0   Mode:logical   Min.   :    0.00   Min.   :0       
##  1st Qu.:0   NA's:8300      1st Qu.:   65.23   1st Qu.:0       
##  Median :0                  Median :  314.27   Median :0       
##  Mean   :0                  Mean   :  621.99   Mean   :0       
##  3rd Qu.:0                  3rd Qu.:  730.43   3rd Qu.:0       
##  Max.   :0                  Max.   :46999.10   Max.   :0       
##                                                                
##  OPTION_AWARDS     OPTION_AWARDS_BLK_VALUE OPTION_AWARDS_FV 
##  Min.   :    0.0   Mode:logical            Min.   :    0.0  
##  1st Qu.:    0.0   NA's:8300               1st Qu.:    0.0  
##  Median :    0.0                           Median :    0.0  
##  Mean   :  387.8                           Mean   :  387.9  
##  3rd Qu.:  310.3                           3rd Qu.:  302.4  
##  Max.   :77990.7                           Max.   :77990.7  
##                                                             
##  OPTION_AWARDS_NUM  OPTION_AWARDS_RPT_VALUE  OPT_EXER_NUM    
##  Min.   :    0.00   Mode:logical            Min.   :   0.00  
##  1st Qu.:    0.00   NA's:8300               1st Qu.:   0.00  
##  Median :    0.00                           Median :   0.00  
##  Mean   :   37.92                           Mean   :  28.61  
##  3rd Qu.:   28.75                           3rd Qu.:   5.62  
##  Max.   :10805.60                           Max.   :3547.09  
##                                                              
##   OPT_EXER_VAL      OPT_UNEX_EXER_EST_VAL OPT_UNEX_EXER_NUM 
##  Min.   :     0.0   Min.   :     0.0      Min.   :    0.00  
##  1st Qu.:     0.0   1st Qu.:     0.0      1st Qu.:    0.00  
##  Median :     0.0   Median :     0.0      Median :   12.50  
##  Mean   :   826.5   Mean   :  3259.2      Mean   :  144.28  
##  3rd Qu.:    81.5   3rd Qu.:   992.1      3rd Qu.:   96.72  
##  Max.   :156626.4   Max.   :608378.1      Max.   :12320.05  
##                                                             
##  OPT_UNEX_UNEXER_EST_VAL OPT_UNEX_UNEXER_NUM  OTHANN       
##  Min.   :     0.0        Min.   :    0.000   Mode:logical  
##  1st Qu.:     0.0        1st Qu.:    0.000   NA's:8300     
##  Median :     0.0        Median :    6.091                 
##  Mean   :   868.5        Mean   :   83.203                 
##  3rd Qu.:   434.1        3rd Qu.:   68.016                 
##  Max.   :273459.5        Max.   :13279.313                 
##                                                            
##     OTHCOMP          PENSION_CHG       PENSION_PYMTS_TOT 
##  Min.   : -758.69   Min.   :-5340.86   Min.   :    0.00  
##  1st Qu.:   14.94   1st Qu.:    0.00   1st Qu.:    0.00  
##  Median :   40.15   Median :    0.00   Median :    0.00  
##  Mean   :  191.82   Mean   :  141.21   Mean   :   29.92  
##  3rd Qu.:  110.37   3rd Qu.:   12.83   3rd Qu.:    0.00  
##  Max.   :58410.15   Max.   :12890.42   Max.   :50437.34  
##                                                          
##  PENSION_VALUE_TOT     REPRICE  RET_YRS        RSTKGRNT      
##  Min.   :    0.00   Min.   :0   Mode:logical   Mode:logical  
##  1st Qu.:    0.00   1st Qu.:0   NA's:8300      NA's:8300     
##  Median :    0.00   Median :0                                
##  Mean   : 1085.24   Mean   :0                                
##  3rd Qu.:   53.49   3rd Qu.:0                                
##  Max.   :98934.00   Max.   :0                                
##                                                              
##  RSTKVYRS           SALARY          SAL_PCT          SHROWN_EXCL_OPTS  
##  Mode:logical   Min.   :   0.0   Min.   : -100.000   Min.   :     0.0  
##  NA's:8300      1st Qu.: 350.0   1st Qu.:    0.000   1st Qu.:    17.5  
##                 Median : 475.0   Median :    2.932   Median :    54.6  
##                 Mean   : 555.7   Mean   :   14.736   Mean   :   569.8  
##                 3rd Qu.: 680.1   3rd Qu.:    7.048   3rd Qu.:   162.2  
##                 Max.   :8303.2   Max.   :12898.267   Max.   :409513.0  
##                                  NA's   :1221        NA's   :189       
##  SHROWN_EXCL_OPTS_PCT   SHROWN_TOT       SHROWN_TOT_PCT   
##  Min.   : 0.0000      Min.   :     0.0   Min.   : 0.0000  
##  1st Qu.: 0.0160      1st Qu.:    27.3   1st Qu.: 0.0300  
##  Median : 0.0600      Median :    84.0   Median : 0.0980  
##  Mean   : 0.5051      Mean   :   668.8   Mean   : 0.6218  
##  3rd Qu.: 0.1870      3rd Qu.:   248.1   3rd Qu.: 0.2922  
##  Max.   :68.9730      Max.   :409513.0   Max.   :68.9730  
##  NA's   :234          NA's   :189        NA's   :560      
##  SHRS_VEST_NUM      SHRS_VEST_VAL        STOCK_AWARDS     
##  Min.   :   0.000   Min.   :     0.00   Min.   :     0.0  
##  1st Qu.:   2.378   1st Qu.:    64.96   1st Qu.:   254.8  
##  Median :  11.500   Median :   362.28   Median :   716.0  
##  Mean   :  35.498   Mean   :  1430.03   Mean   :  1574.3  
##  3rd Qu.:  35.514   3rd Qu.:  1252.80   3rd Qu.:  1731.5  
##  Max.   :2999.040   Max.   :180095.53   Max.   :242449.1  
##                                                           
##  STOCK_AWARDS_FV    STOCK_UNVEST_NUM   STOCK_UNVEST_VAL        TDC1       
##  Min.   :     0.0   Min.   :    0.00   Min.   :     0.0   Min.   :     0  
##  1st Qu.:   200.0   1st Qu.:    3.81   1st Qu.:   140.5   1st Qu.:  1079  
##  Median :   673.4   Median :   21.33   Median :   788.2   Median :  2031  
##  Mean   :  1550.5   Mean   :   63.81   Mean   :  2473.9   Mean   :  3418  
##  3rd Qu.:  1693.5   3rd Qu.:   62.04   3rd Qu.:  2256.2   3rd Qu.:  3946  
##  Max.   :242449.1   Max.   :13388.24   Max.   :520877.8   Max.   :243877  
##                                                                           
##     TDC1_PCT              TDC2           TDC2_PCT           TERM_PYMT     
##  Min.   : -100.000   Min.   :     0   Min.   : -100.000   Min.   : -2485  
##  1st Qu.:  -10.784   1st Qu.:  1048   1st Qu.:  -16.489   1st Qu.:     0  
##  Median :    4.378   Median :  2028   Median :    5.749   Median :   802  
##  Mean   :   23.199   Mean   :  3856   Mean   :   33.930   Mean   :  3364  
##  3rd Qu.:   23.810   3rd Qu.:  4110   3rd Qu.:   36.848   3rd Qu.:  3057  
##  Max.   :20286.228   Max.   :243877   Max.   :20286.228   Max.   :330233  
##  NA's   :1496                         NA's   :1495                        
##                                                  TITLEANN   
##  Chief Executive Officer, President and Director     : 530  
##  Chief Financial Officer and Executive Vice President: 468  
##  Chief Financial Officer and Senior Vice President   : 232  
##  Chief Executive Officer and Director                : 183  
##  Chairman and Chief Executive Officer                : 173  
##  Chief Financial Officer                             : 168  
##  (Other)                                             :6546  
##    TOTAL_ALT1     TOTAL_ALT1_PCT        TOTAL_ALT2      
##  Min.   : -5064   Min.   : -250.665   Min.   :     0.0  
##  1st Qu.:  1116   1st Qu.:  -10.301   1st Qu.:   906.2  
##  Median :  2101   Median :    5.499   Median :  1759.8  
##  Mean   :  3557   Mean   :   24.560   Mean   :  3874.8  
##  3rd Qu.:  4105   3rd Qu.:   26.647   3rd Qu.:  3878.2  
##  Max.   :243877   Max.   :20286.228   Max.   :219593.1  
##                   NA's   :1496                          
##  TOTAL_ALT2_PCT       TOTAL_CURR      TOTAL_CURR_PCT       TOTAL_SEC     
##  Min.   :-100.000   Min.   :    0.0   Min.   :-100.000   Min.   : -5064  
##  1st Qu.: -17.501   1st Qu.:  372.4   1st Qu.:   0.000   1st Qu.:  1134  
##  Median :   8.246   Median :  507.5   Median :   2.776   Median :  2117  
##  Mean   :  32.112   Mean   :  663.2   Mean   :  10.787   Mean   :  3570  
##  3rd Qu.:  43.425   3rd Qu.:  750.0   3rd Qu.:   7.513   3rd Qu.:  4116  
##  Max.   :3694.950   Max.   :35500.0   Max.   :3886.688   Max.   :243877  
##  NA's   :1495                         NA's   :1221       NA's   :4       
##  TOTAL_SEC_PCT            AT                CH           
##  Min.   : -250.66   Min.   :     14   Min.   :     0.00  
##  1st Qu.:  -10.12   1st Qu.:   1174   1st Qu.:    59.35  
##  Median :    5.72   Median :   3639   Median :   183.50  
##  Mean   :   26.55   Mean   :  26872   Mean   :  1003.40  
##  3rd Qu.:   26.43   3rd Qu.:  11988   3rd Qu.:   603.00  
##  Max.   :20286.23   Max.   :3287968   Max.   :121711.00  
##  NA's   :1219                                            
##       DLTT               NI               PRCH_C         
##  Min.   :      0   Min.   :-6177.00   Min.   :   0.4999  
##  1st Qu.:    169   1st Qu.:   18.03   1st Qu.:  25.6300  
##  Median :    790   Median :  104.34   Median :  47.0000  
##  Mean   :   7335   Mean   :  603.33   Mean   :  64.8202  
##  3rd Qu.:   2995   3rd Qu.:  420.00   3rd Qu.:  78.9700  
##  Max.   :3226737   Max.   :45687.00   Max.   :1845.3731  
##                                                          
##      PRCL_C             DRatio       
##  Min.   :   0.012   Min.   :0.00000  
##  1st Qu.:  14.070   1st Qu.:0.07777  
##  Median :  28.240   Median :0.23549  
##  Mean   :  40.643   Mean   :0.26140  
##  3rd Qu.:  49.950   3rd Qu.:0.37706  
##  Max.   :1462.020   Max.   :3.00015  
## 

Histogram of AGE

Mostly middle aged

plot_ly(x = ageg,
             type = "histogram")
plot_ly(x = ageg,
             type = "histogram",
             histnorm = "probability")

Histrogram of Gender

lot of Males (93% is Male) Mostly Middle aged Men

plot_ly(x = GENDER,
             type = "histogram")
plot_ly(x = GENDER,
             type = "histogram",
             histnorm = "probability")

Histogram of other Variables

Nothing Significant From these Histogram

plot_ly(x = CH,
             type = "histogram") #Total Cash
plot_ly(x = AT,
             type = "histogram") #Assets - Total
plot_ly(x = NI,
             type = "histogram") #Net Income (Loss)
plot_ly(x = DLTT,
             type = "histogram") #Long-Term Debt - Total
plot_ly(x = PRCH_C,
             type = "histogram") #Price High - Annual - Calendar
plot_ly(x = PRCL_C,
             type = "histogram") #Price Low - Annual - Calendar
plot_ly(x = DRatio,
             type = "histogram") #Debt Ratio
plot_ly(x = SALARY,
             type = "histogram") #Salary
plot_ly(x = dat$SHRS_VEST_VAL,
             type = "histogram") #Value Realized on Vesting ($)
plot_ly(x = BONUS,
             type = "histogram") #The dollar value of a bonus earned by the named executive officer during the fiscal year.
plot_ly(x = OTHCOMP,
             type = "histogram") #All Other Compensation ($)
plot_ly(x = TDC2,
             type = "histogram") #Total Compensation (Salary + Bonus + Other Annual + Restricted Stock Grants + LTIP Payouts + All Other + Value of Options Exercised).

Histogram of other Variables using log to find significance

Normal Distribution

plot_ly(x = log(CH),
             type = "histogram")
plot_ly(x = log(AT),
             type = "histogram")
plot_ly(x = log(NI),
             type = "histogram")
plot_ly(x = log(DLTT),
             type = "histogram")
plot_ly(x = log(PRCH_C),
             type = "histogram")
plot_ly(x = log(PRCL_C),
             type = "histogram")
plot_ly(x = log(SALARY),
             type = "histogram")
plot_ly(x = log(dat$SAL_PCT),
             type = "histogram")
plot_ly(x = log(BONUS),
             type = "histogram")
plot_ly(x = log(OTHCOMP),
             type = "histogram")
plot_ly(x = log(TDC2),
             type = "histogram")
plot_ly(x = log(DRatio),
             type = "histogram")

Relationships between selected variables

High to Low relationships -AT and DLTT, SALARY, BONUS -NI and TDC2, SALARY, BONUS -CH and SALARY, BONUS, TDC2 -DLTT and SALARY -PRCH_C, PRCH_L and TDC2 -SALARY and AGE

par(mfrow=c(1,1))
pairs.panels(dat1)

-No Significant Difference observed on Male only Data probably due to high number of observation

#observe male data only 
pairs.panels(datm)

-AT and Bonus has a huge difference between Male .21 and female .60 -DLTT and Bonus has a huge difference between Male .05 and female .56 -TDC2 and CH has a small difference between Male .25 and female .36 -NI and Stock price has difference between Male and female -All other relationships are high compared to Men Probably due to very low observations

Does female Execs has to perform better then men in average? Does firm have better with female Exec?

#observe only  female data
pairs.panels(datf)

#Getting Ideas about graphs
dat1 %>%
  select("AT", "NI", "CH","TDC2", "SALARY", "BONUS", "AGE", "GENDER") %>%
ggpairs(dat1)

Compbining Datset

Added age group to my data

hist(dat$AGE)

group_by(dat1, AGE)
summarize(group_by(dat1, AGE), aSalary=mean(SALARY))

Probability Stats

Added age group to my data Review Probability

#Review Probability
htmlTable(as.data.frame(round(prop.table(xtabs(~ GENDER + ageg,  
      data = dat1 %>%
      mutate(AGE = AGE) %>% 
      mutate(ageg = cut(AGE, 6))), 2), 2)),
      caption = "Frequency of Price Difference",
      header = paste(c("Brand", "Price Difference", "&dagger; Frequency")),
      tfoot="&dagger; Data from Midterm Project")
Frequency of Price Difference
Brand Price Difference † Frequency
1 FEMALE (28.9,40.2] 0.04
2 MALE (28.9,40.2] 0.96
3 FEMALE (40.2,51.3] 0.06
4 MALE (40.2,51.3] 0.94
5 FEMALE (51.3,62.5] 0.07
6 MALE (51.3,62.5] 0.93
7 FEMALE (62.5,73.7] 0.05
8 MALE (62.5,73.7] 0.95
9 FEMALE (73.7,84.8] 0.01
10 MALE (73.7,84.8] 0.99
11 FEMALE (84.8,96.1] 0
12 MALE (84.8,96.1] 1
† Data from Midterm Project
##added new variable ageg (AGE GROUP Based on this)

Female low probability

(d <- as.data.frame(prop.table(xtabs(~ GENDER + ageg,  
  data = dat1 %>%
  mutate (ageg = cut(AGE, 6)) %>% # step 2
  mutate (ageg = recode(ageg, "(0,30)" = "30 and under",
                             "(31,45)" = "31-45",
                             "(46,65)" = "46-65",
                             "(66,100)" = "65+"))), 2)) %>%
# 3 and 4. Visualize conditional probability of brand by price difference type
  ggplot(aes(x = GENDER, y = Freq, fill = ageg)) +
  geom_bar(stat="identity", position = "dodge", color = "black") +
  geom_text(aes(label = round(Freq,2)), vjust=1.5, color = "white",
            position = position_dodge(0.9), size = 4)+
  ggtitle("Women are less likely to be Executives", sub = "Probabbility Grouped by Age")
)

EDA Graphs

Middle age Men has highest number Bonus and Salary

ggplot(dat, aes(x = AGE, y = BONUS, color = GENDER)) +
  geom_point() +
  ggtitle("Middle age men has highest number of bonus with outliers", sub = "Small number of Female observations")

Over all Observation

grid.arrange(
ggplot(dat, aes(x = AGE, y = SALARY, color = GENDER)) +
  geom_point() +
  stat_density2d() +
  ggtitle("Mostly Middle aged Men", sub = "Small number of Female observations"),
ggplot(dat, aes(x = AGE, y = SALARY, color = GENDER)) +
  geom_point() +
  stat_density2d(aes(fill =..density..), geom = "raster", contour = FALSE) +
  ggtitle("Mostly Middle aged Men", sub = "Heat map, light color indicates more observations"),
ncol = 2
)

grid.arrange(
dat %>%
  group_by(GENDER) %>%
  summarise(age_mean = mean(AGE)) %>%
  ggplot(aes(x = GENDER, y = age_mean, fill = GENDER)) +
    geom_bar(stat="identity") +
    coord_flip(), 
dat %>%
  ggplot(aes(x = GENDER, y = AGE, fill = GENDER)) +
    geom_boxplot() +
    coord_flip() +
    guides(fill = FALSE, ylab = FALSE),
ncol = 2
)

Female Compensation in average

As female age they get higher compensation

datf %>%
  group_by(ageg) %>%
  summarise(Avg_Com = mean(TDC2)) %>%
  ggplot(aes(x = ageg, y = Avg_Com, )) +
  geom_bar(stat="identity", position = "dodge", color = "black") +
  labs(fill = "Gender") +
# Add and improve titles and labels
    ggtitle("In Average as female age there Compensation gets Larger") + 
    ylab("Average Compensation by Thousands") +
    xlab("Age Group") +
    scale_x_discrete(labels=c("Under 40", "40 to 51", "51 to 62", "62 to 73", "73 to 84", "84 +", "NA")) +
 theme_classic() + 
  theme(axis.ticks.x = element_blank(),
        axis.ticks.y = element_blank(),)

datm %>%
  group_by(ageg) %>%
  summarise(Avg_Com = mean(TDC2)) %>%
  ggplot(aes(x = ageg, y = Avg_Com, )) +
  geom_bar(stat="identity", position = "dodge", color = "black") +
  labs(fill = "Gender") +
# Add and improve titles and labels
    ggtitle("In Average as female age there Compensation gets Larger") + 
    ylab("Average Compensation by Thousands") +
    xlab("Age Group") +
    scale_x_discrete(labels=c("Under 40", "40 to 51", "51 to 62", "62 to 73", "73 to 84", "84 +", "NA")) +
 theme_classic() + 
  theme(axis.ticks.x = element_blank(),
        axis.ticks.y = element_blank(),)

dat1 %>%
  group_by( GENDER ) %>%
  summarise( count = n(),
              proportion = sum(count) / nrow(dat1),
              percent = proportion * 100)
dat1 %>%
  group_by( AGE ) %>%
  ggplot() +
    geom_bar(aes(x = GENDER)) +
    ggtitle("Middle Age")

Average Compensation by Gender

Lets compare Total Compensation side by side

# Average profit by region and category 
dat1 %>%
  group_by(ageg, GENDER) %>%
  summarise(Avg_Com = mean(TDC2)) %>%
  ggplot(aes(x = ageg, y = Avg_Com, fill = reorder(GENDER, desc(Avg_Com)))) +
  geom_bar(stat="identity", position = "dodge", color = "black") +
  geom_hline(yintercept = 0, color = "black") +
  labs(fill = "Gender") +
# Add and improve titles and labels
    ggtitle("Middle age female age groups has a big compensation jump") + 
    ylab("Average Compensation by Thousands") +
    xlab("Age Group") +
    scale_x_discrete(labels=c("Under 40", "40 to 51", "51 to 62", "62 to 73", "73 to 84", "84 +", "NA")) +
 theme_classic() + 
  theme(axis.ticks.x = element_blank(),
        axis.ticks.y = element_blank(),)

Multiple Regression Analysis

Testing Different Models for Significance Analysis

  1. Most statistically significant
  2. Have the largest impact

Visualize which variables have coefficients statistically different from zero

attach(dat1)
# ANOVA
baseMod <- lm(TDC2 ~ AGE + BONUS + AT + CH + DLTT + NI + PRCH_C + PRCL_C)
mod1 <- lm(TDC2 ~ AGE + BONUS + AT + CH + DLTT + NI)
mod2 <- lm(TDC2 ~ AGE + BONUS + AT + CH + DLTT)
mod3 <- lm(TDC2 ~ AGE + BONUS + AT + CH)
mod4 <- lm(TDC2 ~ AGE + BONUS + AT)
anova(baseMod, mod1, mod2, mod3, mod4)
summary(baseMod)
## 
## Call:
## lm(formula = TDC2 ~ AGE + BONUS + AT + CH + DLTT + NI + PRCH_C + 
##     PRCL_C)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -30887  -2136  -1242    265 229687 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.750e+03  5.491e+02  -3.188  0.00144 ** 
## AGE          7.941e+01  9.956e+00   7.975 1.72e-15 ***
## BONUS        1.970e+00  1.216e-01  16.206  < 2e-16 ***
## AT          -3.835e-03  8.718e-04  -4.400 1.10e-05 ***
## CH           1.714e-01  2.236e-02   7.665 2.00e-14 ***
## DLTT        -2.388e-04  1.187e-03  -0.201  0.84048    
## NI           7.366e-01  3.893e-02  18.920  < 2e-16 ***
## PRCH_C       7.994e+00  4.572e+00   1.748  0.08042 .  
## PRCL_C       1.784e+00  6.457e+00   0.276  0.78228    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6640 on 8231 degrees of freedom
##   (60 observations deleted due to missingness)
## Multiple R-squared:  0.1555, Adjusted R-squared:  0.1547 
## F-statistic: 189.4 on 8 and 8231 DF,  p-value: < 2.2e-16
attach(datm)
# ANOVA
baseMod <- lm(TDC2 ~ AGE + BONUS + AT + CH + DLTT + NI + PRCH_C + PRCL_C)
mod1 <- lm(TDC2 ~ AGE + BONUS + AT + CH + DLTT + NI)
mod2 <- lm(TDC2 ~ AGE + BONUS + AT + CH + DLTT)
mod3 <- lm(TDC2 ~ AGE + BONUS + AT + CH)
mod4 <- lm(TDC2 ~ AGE + BONUS + AT)
anova(baseMod, mod1, mod2, mod3, mod4)
summary(baseMod)
## 
## Call:
## lm(formula = TDC2 ~ AGE + BONUS + AT + CH + DLTT + NI + PRCH_C + 
##     PRCL_C)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -30782  -2149  -1234    277 229688 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.936e+03  5.501e+02  -3.519 0.000435 ***
## AGE          8.297e+01  9.968e+00   8.324  < 2e-16 ***
## BONUS        1.926e+00  1.206e-01  15.964  < 2e-16 ***
## AT          -2.836e-03  9.237e-04  -3.070 0.002148 ** 
## CH           1.523e-01  2.223e-02   6.851 7.91e-12 ***
## DLTT        -1.089e-03  1.205e-03  -0.904 0.366064    
## NI           7.369e-01  4.053e-02  18.179  < 2e-16 ***
## PRCH_C       9.298e+00  4.584e+00   2.028 0.042584 *  
## PRCL_C       9.598e-02  6.479e+00   0.015 0.988181    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6497 on 7723 degrees of freedom
##   (60 observations deleted due to missingness)
## Multiple R-squared:  0.1563, Adjusted R-squared:  0.1554 
## F-statistic: 178.8 on 8 and 7723 DF,  p-value: < 2.2e-16
attach(datf)
# ANOVA
baseMod <- lm(TDC2 ~ AGE + BONUS + AT + CH + DLTT + NI + PRCH_C + PRCL_C)
mod1 <- lm(TDC2 ~ AGE + BONUS + AT + CH + DLTT + NI)
mod2 <- lm(TDC2 ~ AGE + BONUS + AT + CH + DLTT)
mod3 <- lm(TDC2 ~ AGE + BONUS + AT + CH)
mod4 <- lm(TDC2 ~ AGE + BONUS + AT)
anova(baseMod, mod1, mod2, mod3, mod4)
summary(baseMod)
## 
## Call:
## lm(formula = TDC2 ~ AGE + BONUS + AT + CH + DLTT + NI + PRCH_C + 
##     PRCL_C)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -29760  -1698  -1022    183 156374 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.254e+03  3.230e+03   1.008    0.314    
## AGE         -2.057e+01  5.946e+01  -0.346    0.730    
## BONUS        4.741e+00  9.649e-01   4.913 1.22e-06 ***
## AT          -5.897e-03  9.393e-03  -0.628    0.530    
## CH           1.410e+00  2.475e-01   5.696 2.10e-08 ***
## DLTT        -1.210e-01  8.732e-02  -1.385    0.167    
## NI           3.765e-01  1.925e-01   1.956    0.051 .  
## PRCH_C      -1.083e+01  2.667e+01  -0.406    0.685    
## PRCL_C       2.003e+01  3.716e+01   0.539    0.590    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8277 on 499 degrees of freedom
## Multiple R-squared:  0.2103, Adjusted R-squared:  0.1976 
## F-statistic: 16.61 on 8 and 499 DF,  p-value: < 2.2e-16

Visualization of Data Regression

Age Groups Has most Confident Interval Variance

attach(dat1)
mod <- lm(TDC2 ~ AGE + BONUS + AT + CH + DLTT + NI + PRCH_C + PRCL_C+ OTHCOMP+ ageg)
coe <- summary(mod)$coefficients # get coefficients and related stats
coe_CI <- as.data.frame(cbind(coe[-1, ], confint(mod)[-1, ])) # find and bind CI, remove Int 

# Rename results data frame
names(coe_CI) <- c("estimate", "se", "t", "pval","low_CI","high_CI") 

htmlTable(round(coe_CI[order(coe_CI$pval, decreasing=FALSE),],3))
estimate se t pval low_CI high_CI
OTHCOMP 1.666 0.071 23.307 0 1.526 1.806
NI 0.698 0.038 18.474 0 0.624 0.772
BONUS 1.631 0.119 13.752 0 1.398 1.863
CH 0.157 0.022 7.254 0 0.115 0.2
AT -0.003 0.001 -3.594 0 -0.005 -0.001
PRCH_C 9.562 4.427 2.16 0.031 0.884 18.239
AGE 42.328 24.645 1.718 0.086 -5.981 90.638
ageg(62.5,73.7] 1375.288 851.987 1.614 0.107 -294.821 3045.398
ageg(51.3,62.5] 969.318 650.804 1.489 0.136 -306.421 2245.057
ageg(84.8,96.1] -2276.753 1968.756 -1.156 0.248 -6136.013 1582.506
ageg(40.2,51.3] 514.775 525.218 0.98 0.327 -514.785 1544.334
DLTT -0.001 0.001 -0.709 0.478 -0.003 0.001
ageg(73.7,84.8] 80.865 1308.199 0.062 0.951 -2483.535 2645.266
PRCL_C -0.259 6.251 -0.041 0.967 -12.513 11.994
(g1 <- ggplot(coe_CI, aes(x = estimate, y = reorder(row.names(coe_CI),desc(pval)))) +
  geom_point(size = 3) +
  xlim(min(coe_CI$low_CI), max(coe_CI$high_CI)) +
  ylab("Variable") +
  xlab("Coefficient") +
  theme_bw() +
  ggtitle("ALL Data: Age group has different confident intervals") +
  geom_segment(aes(yend = reorder(row.names(coe_CI),desc(pval))), 
               xend = coe_CI$high_CI) +
  geom_segment(aes(yend = reorder(row.names(coe_CI),desc(coe_CI$pval))), 
               xend = coe_CI$low_CI) +
   xlab("Coefficient with Confidence Interval") +
   geom_vline(xintercept = 0, color = "red")
)

MALE ONLY

attach(datm) #Only Male Data
mod <- lm(TDC2 ~ AGE + BONUS + AT + CH + DLTT + NI + PRCH_C + PRCL_C+ OTHCOMP+ ageg)
coe <- summary(mod)$coefficients # get coefficients and related stats
coe_CI <- as.data.frame(cbind(coe[-1, ], confint(mod)[-1, ])) # find and bind CI, remove Int 

# Rename results data frame
names(coe_CI) <- c("estimate", "se", "t", "pval","low_CI","high_CI") 

htmlTable(round(coe_CI[order(coe_CI$pval, decreasing=FALSE),],3))
estimate se t pval low_CI high_CI
OTHCOMP 1.625 0.072 22.691 0 1.485 1.766
NI 0.703 0.039 17.902 0 0.626 0.78
BONUS 1.583 0.118 13.46 0 1.353 1.814
CH 0.141 0.022 6.553 0 0.099 0.183
AT -0.002 0.001 -2.485 0.013 -0.004 0
PRCH_C 10.633 4.436 2.397 0.017 1.938 19.328
ageg(62.5,73.7] 1513.737 857.505 1.765 0.078 -167.207 3194.68
AGE 40.677 24.844 1.637 0.102 -8.024 89.378
ageg(51.3,62.5] 1014.902 653.229 1.554 0.12 -265.604 2295.408
DLTT -0.002 0.001 -1.299 0.194 -0.004 0.001
ageg(84.8,96.1] -2226.189 1952.745 -1.14 0.254 -6054.1 1601.722
ageg(40.2,51.3] 464.158 525.652 0.883 0.377 -566.262 1494.578
PRCL_C -1.665 6.268 -0.266 0.791 -13.953 10.623
ageg(73.7,84.8] 200.288 1306.506 0.153 0.878 -2360.819 2761.395
(g1 <- ggplot(coe_CI, aes(x = estimate, y = reorder(row.names(coe_CI),desc(pval)))) +
  geom_point(size = 3) +
  xlim(min(coe_CI$low_CI), max(coe_CI$high_CI)) +
  ylab("Variable") +
  xlab("Coefficient") +
  theme_bw() +
  ggtitle("Male only: Age group has different confident intervals") +
  geom_segment(aes(yend = reorder(row.names(coe_CI),desc(pval))), 
               xend = coe_CI$high_CI) +
  geom_segment(aes(yend = reorder(row.names(coe_CI),desc(coe_CI$pval))), 
               xend = coe_CI$low_CI) +
   xlab("Coefficient with Confidence Interval") +
   geom_vline(xintercept = 0, color = "red")
)

FEMALE ONLY: USING LOG

attach(datf) #Only Female Data
mod <- lm(log(TDC2) ~ AGE + BONUS + AT + CH + DLTT + NI + PRCH_C + PRCL_C+ OTHCOMP+ ageg)
coe <- summary(mod)$coefficients # get coefficients and related stats
coe_CI <- as.data.frame(cbind(coe[-1, ], confint(mod)[-1, ])) # find and bind CI, remove Int 

# Rename results data frame
names(coe_CI) <- c("estimate", "se", "t", "pval","low_CI","high_CI") 

htmlTable(round(coe_CI[order(coe_CI$pval, decreasing=FALSE),],3))
estimate se t pval low_CI high_CI
AT 0 0 -5.408 0 0 0
CH 0 0 5.063 0 0 0
OTHCOMP 0 0 4.979 0 0 0
DLTT 0 0 4.61 0 0 0
BONUS 0 0 4.372 0 0 0.001
AGE 0.027 0.014 1.942 0.053 0 0.054
NI 0 0 -1.478 0.14 0 0
PRCL_C 0.005 0.004 1.373 0.17 -0.002 0.013
ageg(40.2,51.3] 0.334 0.336 0.994 0.321 -0.326 0.995
ageg(73.7,84.8] -0.776 1.125 -0.69 0.491 -2.987 1.434
PRCH_C -0.002 0.003 -0.595 0.552 -0.007 0.004
ageg(51.3,62.5] 0.18 0.396 0.453 0.651 -0.599 0.959
ageg(62.5,73.7] -0.153 0.492 -0.311 0.756 -1.121 0.814
(g1 <- ggplot(coe_CI, aes(x = estimate, y = reorder(row.names(coe_CI),desc(pval)))) +
  geom_point(size = 3) +
  xlim(min(coe_CI$low_CI), max(coe_CI$high_CI)) +
  ylab("Variable") +
  xlab("Coefficient") +
  theme_bw() +
  ggtitle("Female only: Age group has different confident intervals") +
  geom_segment(aes(yend = reorder(row.names(coe_CI),desc(pval))), 
               xend = coe_CI$high_CI) +
  geom_segment(aes(yend = reorder(row.names(coe_CI),desc(coe_CI$pval))), 
               xend = coe_CI$low_CI) +
   xlab("Coefficient with Confidence Interval") +
   geom_vline(xintercept = 0, color = "red")
)

Chow test

#== Chow test
chow.test <- local({
  m <- 2    # number of restrictions (delta_0 = delta_1 = 0)
  n <- dim(dat1)[1]
  full.1 <- lm( TDC2 ~ AGE, data=subset(dat, GENDER == "FEMALE"))  # female regression
  full.2 <- lm( TDC2 ~ AGE, data=subset(dat, GENDER == "MALE"))    # male regression
  full <- lm(TDC2 ~ AGE + GENDER + GENDER*AGE, data=dat)          # full model using dumies
  rest <- lm(TDC2 ~ AGE + AGE, data=dat)                          # restricted model
  SSE.1 <- sum( resid(full.1)^2 )
  SSE.2 <- sum( resid(full.2)^2 )
  SSE.full <- sum( resid(full)^2 )
  all.equal(SSE.full, SSE.1 + SSE.2)
  SSE.rm <- sum( resid(rest)^2 )
  chow <- (( SSE.rm - (SSE.1 + SSE.2)) / m ) / ( (SSE.1 + SSE.2) / (n - 2*m) )
  pval.chow <- 1 - pf(chow, m, n-2*m)
  out <- list(SSE.1=SSE.1, SSE.2=SSE.2, SSE.rm=SSE.rm, chow=chow, pval.chow=pval.chow)
  return(out)
})
sapply(chow.test, cbind)
##        SSE.1        SSE.2       SSE.rm         chow    pval.chow 
## 4.325947e+10 3.826649e+11 4.259930e+11 6.685080e-01 5.125002e-01