require.package("plyr")
require.package("dplyr")
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
dataPath <- "C:/Code/R/IS608-VizAnalytics/FinalProject/Data/"

The loadBirthData2 function, shown below, was used to load the CDC natality data from a local file into R for manipulation.

#####
# FUNCTION: loadBirthData2 
#
#
loadBirthData2 <- function(birthFile)
{
  # Load the Natality data
  birthData <- read.table(birthFile, 
                          header=TRUE, 
                          sep="\t", 
                          fill=TRUE, 
                          stringsAsFactors=FALSE,
                          colClasses=c('character', # Notes
                                       'character', # Year
                                       'character', # Year.Code
                                       'character', # Month
                                       'character', # Month.Code
                                       'character', # State
                                       'character', # State.Code
                                       'character', # Age of Mother
                                       'character', # Age of Mother Code
                                       'numeric'))  # Births
  
  #print(birthData[is.na(birthData$Births),])
  birthDataWoNa <- subset(birthData, !is.na(birthData$Births), 
                          select = - c(Notes))  
  birthDataWoNa <- dplyr::group_by(birthDataWoNa, 
                                   Year, Year.Code, Month, Month.Code, State, State.Code, Age.of.Mother, Age.of.Mother.Code)
  birthDataWoNa <- dplyr::summarise(birthDataWoNa, sum(Births))
  
  return (birthDataWoNa)
}

The loadCensusData function, shown below, was used to load the census population estimates data from a local file into R for manipulation.

#####
# FUNCTION: loadCensusData 
#
#
loadCensusData <- function(file, doReduce = TRUE)
{
  # Load the Natality data
  dataFile <- sprintf("%s/%s", dataPath, file)
  data <- read.table(dataFile, 
                          header=TRUE, 
                          sep=",", 
                          quote="",
                          fill=TRUE, 
                          stringsAsFactors=FALSE,
                          colClasses=c('character')) 
  
  data <- mutate(data, County.Code=paste(STATE, COUNTY, sep=""))
  if(doReduce)
  {
    data <- mutate(data, CENSUS2010POPThousands=as.numeric(CENSUS2010POP) / 1000)
  }
  
  #print(summary(data))
  
  return (data)
}

The loadUnemploymentData function shown below loads the local area unemployment data along with reference datasets for states and periods (U.S. Bureau of Labor Statistics, 2015). These data sets are cleaned and joined together to facilitate usage in the visualization later.

#####
# FUNCTION: loadUnemploymentData 
#
#
loadUnemploymentData <- function()
{
  # Load the Local Area Unemployment data
  dataFile <- sprintf("%s/la.data.3.AllStatesS.txt", dataPath)
  dataTable <- read.table(dataFile, 
                          header=TRUE, 
                          sep="\t", 
                          fill=TRUE, 
                          stringsAsFactors=FALSE,
                          #col.names=c('series_id', 'year', 'period', 'value', 'footnote_codes'),
                          colClasses=c('character', # series_id
                                       'numeric',   # year
                                       'character', # period
                                       'numeric'   # value
                                       #, 'character'  # footnote_codes
                                       ))  
  
  # State IDs
  dataFile <- sprintf("%s/la.state_region_division.txt", dataPath)
  states <- read.table(dataFile, 
                          header=TRUE, 
                          sep="\t", 
                          fill=TRUE, 
                          stringsAsFactors=FALSE,
                          col.names=c('srd_code', 'blank', 'srd_text'),
                          colClasses=c('character',  # srd_code
                                       'character' # srd_text
                                       ))
  
  # format for the series id we want (unemployment rate)
  states$series_id <- paste("LASST", states$srd_code, "0000000000003", sep="") #, 
  print(head(states))
  #print(head(dataTable$series_id))
  
  # Period IDs (months)
  dataFile <- sprintf("%s/la.period.txt", dataPath)
  periods <- read.table(dataFile, 
                          header=TRUE, 
                          sep="\t", 
                          fill=TRUE, 
                          stringsAsFactors=FALSE,
                         # col.names=c('srd_code', 'blank', 'srd_text'),
                          colClasses=c('character', 
                                       'character'
                                       ))
  periods$month <- stringr::str_extract(periods$period, "\\d{2,2}")
  periods <- subset(periods, periods$period != "M13")
  print(periods)
  
  # Only between 2003 and 2013
  cleanerData <- subset(dataTable, 2003 <= dataTable$year &  dataTable$year <= 2013)
  # Trim whitespace off the series_id
  cleanerData$series_id <- stringr::str_trim(cleanerData$series_id)
  # join state names and limit to only the unemployment data we care about.
  cleanerData <- plyr::join(cleanerData, states, by="series_id", type="inner")
  # join month info and limit to only the months (excluding annual average)
  cleanerData <- plyr::join(cleanerData, periods, by="period", type="inner")
  
  return (cleanerData)
}

The following code loads the 2 birth datasets and binds them into a single data frame. Local area unemployment data is also loaded.

birthFile <- sprintf("%s/Natality-2007-2013-YearMonthStateAge.txt", dataPath)
bd0713 <- loadBirthData2(birthFile)

birthFile <- sprintf("%s/Natality-2003-2006-YearMonthStateAge.txt", dataPath)
bd0306 <- loadBirthData2(birthFile)
head(bd0306)
## Source: local data frame [6 x 9]
## Groups: Year, Year.Code, Month, Month.Code, State, State.Code, Age.of.Mother
## 
##   Year Year.Code Month Month.Code   State State.Code Age.of.Mother
## 1 2003      2003 April          4 Alabama         01   15-19 years
## 2 2003      2003 April          4 Alabama         01   20-24 years
## 3 2003      2003 April          4 Alabama         01   25-29 years
## 4 2003      2003 April          4 Alabama         01   30-34 years
## 5 2003      2003 April          4 Alabama         01   35-39 years
## 6 2003      2003 April          4 Alabama         01   40-44 years
## Variables not shown: Age.of.Mother.Code (chr), sum(Births) (dbl)
bd0313 <- rbind(bd0713, bd0306)

ud <- loadUnemploymentData()
##   srd_code blank   srd_text            series_id
## 1       01          Alabama LASST010000000000003
## 2       02           Alaska LASST020000000000003
## 3       04          Arizona LASST040000000000003
## 4       05         Arkansas LASST050000000000003
## 5       06       California LASST060000000000003
## 6       08         Colorado LASST080000000000003
##    period period_abbr period_name month
## 1     M01         JAN     January    01
## 2     M02         FEB    February    02
## 3     M03         MAR       March    03
## 4     M04         APR       April    04
## 5     M05         MAY         May    05
## 6     M06         JUN        June    06
## 7     M07         JUL        July    07
## 8     M08         AUG      August    08
## 9     M09         SEP   September    09
## 10    M10         OCT     October    10
## 11    M11         NOV    November    11
## 12    M12         DEC    December    12
head(ud)
##              series_id year period value footnote_codes srd_code blank
## 1 LASST010000000000003 2003    M01   5.8                      01      
## 2 LASST010000000000003 2003    M02   5.8                      01      
## 3 LASST010000000000003 2003    M03   5.9                      01      
## 4 LASST010000000000003 2003    M04   6.0                      01      
## 5 LASST010000000000003 2003    M05   6.1                      01      
## 6 LASST010000000000003 2003    M06   6.1                      01      
##   srd_text period_abbr period_name month
## 1  Alabama         JAN     January    01
## 2  Alabama         FEB    February    02
## 3  Alabama         MAR       March    03
## 4  Alabama         APR       April    04
## 5  Alabama         MAY         May    05
## 6  Alabama         JUN        June    06

The following code snippet saves the massaged local area unemployment data to a comma separated value (CSV) file for reference and later use.

dataFile <- sprintf("%s/LA-States-2003-2013.csv", dataPath)
bSaveUnempData <- FALSE
if(bSaveUnempData)
{
  write.table(ud, dataFile,sep=",", row.names=FALSE)
}

Here we update column names in the data frames inorder to facilitate the plyr::join operation.

colnames(bd0313) <- c("Year",  "Year.Code", 
                             "Month",   "Month.Code",   "State",    
                             "State Code", "Age.of.Mother", "Age.of.Mother.Code", "Births") # "County",  "County Code",

colnames(ud) <- c("series_id","Year","period","UnemploymentRate",
                  "footnote_codes","srd_code","blank",
                  "State","period_abbr","Month","monthNum")

ubd <- plyr::join(ud, bd0313, by=c("Year", "Month", "State"), type="inner")
head(ubd)
##              series_id Year period UnemploymentRate footnote_codes
## 1 LASST010000000000003 2003    M01              5.8               
## 2 LASST010000000000003 2003    M01              5.8               
## 3 LASST010000000000003 2003    M01              5.8               
## 4 LASST010000000000003 2003    M01              5.8               
## 5 LASST010000000000003 2003    M01              5.8               
## 6 LASST010000000000003 2003    M01              5.8               
##   srd_code blank   State period_abbr   Month monthNum Year.Code Month.Code
## 1       01       Alabama         JAN January       01      2003          1
## 2       01       Alabama         JAN January       01      2003          1
## 3       01       Alabama         JAN January       01      2003          1
## 4       01       Alabama         JAN January       01      2003          1
## 5       01       Alabama         JAN January       01      2003          1
## 6       01       Alabama         JAN January       01      2003          1
##   State Code Age.of.Mother Age.of.Mother.Code Births
## 1         01   15-19 years              15-19    688
## 2         01   20-24 years              20-24   1496
## 3         01   25-29 years              25-29   1341
## 4         01   30-34 years              30-34    905
## 5         01   35-39 years              35-39    347
## 6         01   40-44 years              40-44     68
bSave <- FALSE
if(bSave)
{
  dataFile <- sprintf("%s/LA-Natality-Combinedv2.csv", path)
  write.table(ubd, dataFile,sep=",", row.names=FALSE)  
}

Add Census and Create Birth Rate.

# Transform raw year/month columns into a Date column
ubd <- dplyr::mutate(ubd, 
                        Date = lubridate::parse_date_time(sprintf("%s-%s-01", 
                                                                  Year.Code, 
                                                                  Month.Code), 
                                                          orders="ymd"))

birthsPerYearMth <- aggregate(Births ~  Date, ubd, sum)
birthsPerYearMth <- birthsPerYearMth[order(birthsPerYearMth$Date), ]
doBirthRateCombine <- TRUE
if(doBirthRateCombine)
{
  # Load Census Data
  censusData <- loadCensusData("CO-EST2014-Alldata.csv")
  censusData09 <- loadCensusData("CO-EST2009-ALLDATA.csv", FALSE)
  
  # Melt into long form
  censusLong <- reshape2::melt(censusData, id.vars=c("STNAME", "COUNTY"), variable.name="TYPE")
  censusLong09 <- reshape2::melt(censusData09, id.vars=c("STNAME", "COUNTY"), variable.name="TYPE")
  
  # Extract only the years/columns we need, convert to thousands and parse out a Year column
  #print(head(censusLong))
  censusLongYears <- rbind(censusLong09[censusLong09$TYPE == "POPESTIMATE2003" & censusLong09$COUNTY == "000",],
                           censusLong09[censusLong09$TYPE == "POPESTIMATE2004" & censusLong09$COUNTY == "000",],
                           censusLong09[censusLong09$TYPE == "POPESTIMATE2005" & censusLong09$COUNTY == "000",],
                           censusLong09[censusLong09$TYPE == "POPESTIMATE2006" & censusLong09$COUNTY == "000",],
                           censusLong09[censusLong09$TYPE == "POPESTIMATE2007" & censusLong09$COUNTY == "000",],
                           censusLong09[censusLong09$TYPE == "POPESTIMATE2008" & censusLong09$COUNTY == "000",],
                           censusLong09[censusLong09$TYPE == "POPESTIMATE2009" & censusLong09$COUNTY == "000",],
                           censusLong[censusLong$TYPE == "POPESTIMATE2010" & censusLong$COUNTY == "000",],
                           censusLong[censusLong$TYPE == "POPESTIMATE2011" & censusLong$COUNTY == "000",],
                           censusLong[censusLong$TYPE == "POPESTIMATE2012" & censusLong$COUNTY == "000",],
                           censusLong[censusLong$TYPE == "POPESTIMATE2013" & censusLong$COUNTY == "000",])
  censusLongYears <- mutate(censusLongYears, Thousands=as.numeric(censusLongYears$value) / 1000)
  censusLongYears <- mutate(censusLongYears, Year=as.numeric(stringr::str_sub(TYPE, 12)))
  #print(head(censusLongYears))
  
  #print(censusLongYears[censusLongYears$STNAME == 'Alaska', ])
  
  # Slim down to State, Year, Pop in Thousands.
  censusStateYear <- censusLongYears[,c("STNAME", "Year", "Thousands")]
  colnames(censusStateYear) <- c("State", "Year", "CENSUSPOPEstThousands")
  
  # Aggregate to annual for use later with Age of Mother.
  censusAnnual <- aggregate(CENSUSPOPEstThousands ~ Year, censusStateYear, sum)
  print(head(censusAnnual))
  
  # Join the census data to the combined data frame.
  udbc <- join(ubd, censusStateYear, by=c("State", "Year"))
  
  # State Oriented
  udbcState <- subset(udbc, 
                           select=c("State",
                                    "Year.Code","Month.Code",
                                    "Births",
                                    "UnemploymentRate",
                                    "CENSUSPOPEstThousands")) 
  udbcState <- aggregate(Births ~ State+Year.Code+Month.Code+UnemploymentRate+CENSUSPOPEstThousands, udbcState, sum)
  
  # Birth Rate for State data 
  udbcState <- mutate(udbcState, BirthsPer1000Pop=Births / CENSUSPOPEstThousands)
  
  # Show me
  print(head(udbcState))
  
  # Age Oriented
  udbcAge <- subset(udbc, 
                       select=c(
                                "Year.Code","Month.Code",
                                "Births",
                                "UnemploymentRate",
                                "Age.of.Mother"
                                )
                    )  
  colnames(udbcAge) <- c("Year", "Month", "Births", "UnemploymentRate", "Age.of.Mother")

  udbcAge <- ddply(udbcAge,
                   .(Year,
                     Month,
                     Age.of.Mother),
                   summarise,
                   Births=sum(Births),
                   UnemploymentRate=mean(UnemploymentRate))

  # Join census annuals and revert col names for consistency with State data.
  udbcAge <- join(udbcAge, censusAnnual, by=c("Year"))
  colnames(udbcAge) <- c("Year.Code", 
                         "Month.Code", 
                         "Age.of.Mother", 
                         "Births", 
                         "UnemploymentRate", 
                         "CENSUSPOPEstThousands")

  
  udbcAge <- mutate(udbcAge, BirthsPer1000Pop=Births / CENSUSPOPEstThousands)
  print(head(udbcAge))
  
  # Enable this bSave if you want to emit the massaged data as CSV
  # This was used to pass the data over to a Python script for conversion to 
  # Google Data Table formated JSON files.
  bSave <- FALSE
  if(bSave)
  {
    dataFile <- sprintf("%s/LA-Natality-Census-Combined.csv", dataPath)
    write.table(udbcState, dataFile,sep=",", row.names=FALSE)  
         
    dataFile <- sprintf("%s/LA-Natality-Census-Age-Combined.csv", dataPath)
    write.table(udbcAge, dataFile,sep=",", row.names=FALSE)  
  }  
}
##   Year CENSUSPOPEstThousands
## 1 2003              290326.4
## 2 2004              293045.7
## 3 2005              295753.2
## 4 2006              298593.2
## 5 2007              301579.9
## 6 2008              304374.8
##     State Year.Code Month.Code UnemploymentRate CENSUSPOPEstThousands
## 1 Wyoming      2003         10              4.1               499.189
## 2 Wyoming      2003         11              4.1               499.189
## 3 Wyoming      2003         12              4.1               499.189
## 4 Wyoming      2003          8              4.1               499.189
## 5 Wyoming      2003          9              4.1               499.189
## 6 Wyoming      2003          7              4.2               499.189
##   Births BirthsPer1000Pop
## 1    542         1.085761
## 2    520         1.041690
## 3    549         1.099784
## 4    583         1.167894
## 5    561         1.123823
## 6    614         1.229995
##   Year.Code Month.Code Age.of.Mother Births UnemploymentRate
## 1      2003          1   15-19 years  34928         5.505882
## 2      2003          1   20-24 years  84714         5.505882
## 3      2003          1   25-29 years  86521         5.505882
## 4      2003          1   30-34 years  76967         5.505882
## 5      2003          1   35-39 years  37346         5.505882
## 6      2003          1   40-44 years   8234         5.528000
##   CENSUSPOPEstThousands BirthsPer1000Pop
## 1              290326.4       0.12030597
## 2              290326.4       0.29178881
## 3              290326.4       0.29801284
## 4              290326.4       0.26510505
## 5              290326.4       0.12863452
## 6              290326.4       0.02836118

Finally, convert to JSON for easlier use in by Google Charts

The PanGviz Python package is used to convert the LA-Natality-xyz.csv files listed above into Google Data Table formated JSON.