require.package("plyr")
require.package("dplyr")
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
dataPath <- "C:/Code/R/IS608-VizAnalytics/FinalProject/Data/"
The loadBirthData2
function, shown below, was used to load the CDC natality data from a local file into R for manipulation.
#####
# FUNCTION: loadBirthData2
#
#
loadBirthData2 <- function(birthFile)
{
# Load the Natality data
birthData <- read.table(birthFile,
header=TRUE,
sep="\t",
fill=TRUE,
stringsAsFactors=FALSE,
colClasses=c('character', # Notes
'character', # Year
'character', # Year.Code
'character', # Month
'character', # Month.Code
'character', # State
'character', # State.Code
'character', # Age of Mother
'character', # Age of Mother Code
'numeric')) # Births
#print(birthData[is.na(birthData$Births),])
birthDataWoNa <- subset(birthData, !is.na(birthData$Births),
select = - c(Notes))
birthDataWoNa <- dplyr::group_by(birthDataWoNa,
Year, Year.Code, Month, Month.Code, State, State.Code, Age.of.Mother, Age.of.Mother.Code)
birthDataWoNa <- dplyr::summarise(birthDataWoNa, sum(Births))
return (birthDataWoNa)
}
The loadCensusData
function, shown below, was used to load the census population estimates data from a local file into R for manipulation.
#####
# FUNCTION: loadCensusData
#
#
loadCensusData <- function(file, doReduce = TRUE)
{
# Load the Natality data
dataFile <- sprintf("%s/%s", dataPath, file)
data <- read.table(dataFile,
header=TRUE,
sep=",",
quote="",
fill=TRUE,
stringsAsFactors=FALSE,
colClasses=c('character'))
data <- mutate(data, County.Code=paste(STATE, COUNTY, sep=""))
if(doReduce)
{
data <- mutate(data, CENSUS2010POPThousands=as.numeric(CENSUS2010POP) / 1000)
}
#print(summary(data))
return (data)
}
The loadUnemploymentData
function shown below loads the local area unemployment data along with reference datasets for states and periods (U.S. Bureau of Labor Statistics, 2015). These data sets are cleaned and joined together to facilitate usage in the visualization later.
#####
# FUNCTION: loadUnemploymentData
#
#
loadUnemploymentData <- function()
{
# Load the Local Area Unemployment data
dataFile <- sprintf("%s/la.data.3.AllStatesS.txt", dataPath)
dataTable <- read.table(dataFile,
header=TRUE,
sep="\t",
fill=TRUE,
stringsAsFactors=FALSE,
#col.names=c('series_id', 'year', 'period', 'value', 'footnote_codes'),
colClasses=c('character', # series_id
'numeric', # year
'character', # period
'numeric' # value
#, 'character' # footnote_codes
))
# State IDs
dataFile <- sprintf("%s/la.state_region_division.txt", dataPath)
states <- read.table(dataFile,
header=TRUE,
sep="\t",
fill=TRUE,
stringsAsFactors=FALSE,
col.names=c('srd_code', 'blank', 'srd_text'),
colClasses=c('character', # srd_code
'character' # srd_text
))
# format for the series id we want (unemployment rate)
states$series_id <- paste("LASST", states$srd_code, "0000000000003", sep="") #,
print(head(states))
#print(head(dataTable$series_id))
# Period IDs (months)
dataFile <- sprintf("%s/la.period.txt", dataPath)
periods <- read.table(dataFile,
header=TRUE,
sep="\t",
fill=TRUE,
stringsAsFactors=FALSE,
# col.names=c('srd_code', 'blank', 'srd_text'),
colClasses=c('character',
'character'
))
periods$month <- stringr::str_extract(periods$period, "\\d{2,2}")
periods <- subset(periods, periods$period != "M13")
print(periods)
# Only between 2003 and 2013
cleanerData <- subset(dataTable, 2003 <= dataTable$year & dataTable$year <= 2013)
# Trim whitespace off the series_id
cleanerData$series_id <- stringr::str_trim(cleanerData$series_id)
# join state names and limit to only the unemployment data we care about.
cleanerData <- plyr::join(cleanerData, states, by="series_id", type="inner")
# join month info and limit to only the months (excluding annual average)
cleanerData <- plyr::join(cleanerData, periods, by="period", type="inner")
return (cleanerData)
}
The following code loads the 2 birth datasets and binds them into a single data frame. Local area unemployment data is also loaded.
birthFile <- sprintf("%s/Natality-2007-2013-YearMonthStateAge.txt", dataPath)
bd0713 <- loadBirthData2(birthFile)
birthFile <- sprintf("%s/Natality-2003-2006-YearMonthStateAge.txt", dataPath)
bd0306 <- loadBirthData2(birthFile)
head(bd0306)
## Source: local data frame [6 x 9]
## Groups: Year, Year.Code, Month, Month.Code, State, State.Code, Age.of.Mother
##
## Year Year.Code Month Month.Code State State.Code Age.of.Mother
## 1 2003 2003 April 4 Alabama 01 15-19 years
## 2 2003 2003 April 4 Alabama 01 20-24 years
## 3 2003 2003 April 4 Alabama 01 25-29 years
## 4 2003 2003 April 4 Alabama 01 30-34 years
## 5 2003 2003 April 4 Alabama 01 35-39 years
## 6 2003 2003 April 4 Alabama 01 40-44 years
## Variables not shown: Age.of.Mother.Code (chr), sum(Births) (dbl)
bd0313 <- rbind(bd0713, bd0306)
ud <- loadUnemploymentData()
## srd_code blank srd_text series_id
## 1 01 Alabama LASST010000000000003
## 2 02 Alaska LASST020000000000003
## 3 04 Arizona LASST040000000000003
## 4 05 Arkansas LASST050000000000003
## 5 06 California LASST060000000000003
## 6 08 Colorado LASST080000000000003
## period period_abbr period_name month
## 1 M01 JAN January 01
## 2 M02 FEB February 02
## 3 M03 MAR March 03
## 4 M04 APR April 04
## 5 M05 MAY May 05
## 6 M06 JUN June 06
## 7 M07 JUL July 07
## 8 M08 AUG August 08
## 9 M09 SEP September 09
## 10 M10 OCT October 10
## 11 M11 NOV November 11
## 12 M12 DEC December 12
head(ud)
## series_id year period value footnote_codes srd_code blank
## 1 LASST010000000000003 2003 M01 5.8 01
## 2 LASST010000000000003 2003 M02 5.8 01
## 3 LASST010000000000003 2003 M03 5.9 01
## 4 LASST010000000000003 2003 M04 6.0 01
## 5 LASST010000000000003 2003 M05 6.1 01
## 6 LASST010000000000003 2003 M06 6.1 01
## srd_text period_abbr period_name month
## 1 Alabama JAN January 01
## 2 Alabama FEB February 02
## 3 Alabama MAR March 03
## 4 Alabama APR April 04
## 5 Alabama MAY May 05
## 6 Alabama JUN June 06
The following code snippet saves the massaged local area unemployment data to a comma separated value (CSV) file for reference and later use.
dataFile <- sprintf("%s/LA-States-2003-2013.csv", dataPath)
bSaveUnempData <- FALSE
if(bSaveUnempData)
{
write.table(ud, dataFile,sep=",", row.names=FALSE)
}
Here we update column names in the data frames inorder to facilitate the plyr::join
operation.
colnames(bd0313) <- c("Year", "Year.Code",
"Month", "Month.Code", "State",
"State Code", "Age.of.Mother", "Age.of.Mother.Code", "Births") # "County", "County Code",
colnames(ud) <- c("series_id","Year","period","UnemploymentRate",
"footnote_codes","srd_code","blank",
"State","period_abbr","Month","monthNum")
ubd <- plyr::join(ud, bd0313, by=c("Year", "Month", "State"), type="inner")
head(ubd)
## series_id Year period UnemploymentRate footnote_codes
## 1 LASST010000000000003 2003 M01 5.8
## 2 LASST010000000000003 2003 M01 5.8
## 3 LASST010000000000003 2003 M01 5.8
## 4 LASST010000000000003 2003 M01 5.8
## 5 LASST010000000000003 2003 M01 5.8
## 6 LASST010000000000003 2003 M01 5.8
## srd_code blank State period_abbr Month monthNum Year.Code Month.Code
## 1 01 Alabama JAN January 01 2003 1
## 2 01 Alabama JAN January 01 2003 1
## 3 01 Alabama JAN January 01 2003 1
## 4 01 Alabama JAN January 01 2003 1
## 5 01 Alabama JAN January 01 2003 1
## 6 01 Alabama JAN January 01 2003 1
## State Code Age.of.Mother Age.of.Mother.Code Births
## 1 01 15-19 years 15-19 688
## 2 01 20-24 years 20-24 1496
## 3 01 25-29 years 25-29 1341
## 4 01 30-34 years 30-34 905
## 5 01 35-39 years 35-39 347
## 6 01 40-44 years 40-44 68
bSave <- FALSE
if(bSave)
{
dataFile <- sprintf("%s/LA-Natality-Combinedv2.csv", path)
write.table(ubd, dataFile,sep=",", row.names=FALSE)
}
# Transform raw year/month columns into a Date column
ubd <- dplyr::mutate(ubd,
Date = lubridate::parse_date_time(sprintf("%s-%s-01",
Year.Code,
Month.Code),
orders="ymd"))
birthsPerYearMth <- aggregate(Births ~ Date, ubd, sum)
birthsPerYearMth <- birthsPerYearMth[order(birthsPerYearMth$Date), ]
doBirthRateCombine <- TRUE
if(doBirthRateCombine)
{
# Load Census Data
censusData <- loadCensusData("CO-EST2014-Alldata.csv")
censusData09 <- loadCensusData("CO-EST2009-ALLDATA.csv", FALSE)
# Melt into long form
censusLong <- reshape2::melt(censusData, id.vars=c("STNAME", "COUNTY"), variable.name="TYPE")
censusLong09 <- reshape2::melt(censusData09, id.vars=c("STNAME", "COUNTY"), variable.name="TYPE")
# Extract only the years/columns we need, convert to thousands and parse out a Year column
#print(head(censusLong))
censusLongYears <- rbind(censusLong09[censusLong09$TYPE == "POPESTIMATE2003" & censusLong09$COUNTY == "000",],
censusLong09[censusLong09$TYPE == "POPESTIMATE2004" & censusLong09$COUNTY == "000",],
censusLong09[censusLong09$TYPE == "POPESTIMATE2005" & censusLong09$COUNTY == "000",],
censusLong09[censusLong09$TYPE == "POPESTIMATE2006" & censusLong09$COUNTY == "000",],
censusLong09[censusLong09$TYPE == "POPESTIMATE2007" & censusLong09$COUNTY == "000",],
censusLong09[censusLong09$TYPE == "POPESTIMATE2008" & censusLong09$COUNTY == "000",],
censusLong09[censusLong09$TYPE == "POPESTIMATE2009" & censusLong09$COUNTY == "000",],
censusLong[censusLong$TYPE == "POPESTIMATE2010" & censusLong$COUNTY == "000",],
censusLong[censusLong$TYPE == "POPESTIMATE2011" & censusLong$COUNTY == "000",],
censusLong[censusLong$TYPE == "POPESTIMATE2012" & censusLong$COUNTY == "000",],
censusLong[censusLong$TYPE == "POPESTIMATE2013" & censusLong$COUNTY == "000",])
censusLongYears <- mutate(censusLongYears, Thousands=as.numeric(censusLongYears$value) / 1000)
censusLongYears <- mutate(censusLongYears, Year=as.numeric(stringr::str_sub(TYPE, 12)))
#print(head(censusLongYears))
#print(censusLongYears[censusLongYears$STNAME == 'Alaska', ])
# Slim down to State, Year, Pop in Thousands.
censusStateYear <- censusLongYears[,c("STNAME", "Year", "Thousands")]
colnames(censusStateYear) <- c("State", "Year", "CENSUSPOPEstThousands")
# Aggregate to annual for use later with Age of Mother.
censusAnnual <- aggregate(CENSUSPOPEstThousands ~ Year, censusStateYear, sum)
print(head(censusAnnual))
# Join the census data to the combined data frame.
udbc <- join(ubd, censusStateYear, by=c("State", "Year"))
# State Oriented
udbcState <- subset(udbc,
select=c("State",
"Year.Code","Month.Code",
"Births",
"UnemploymentRate",
"CENSUSPOPEstThousands"))
udbcState <- aggregate(Births ~ State+Year.Code+Month.Code+UnemploymentRate+CENSUSPOPEstThousands, udbcState, sum)
# Birth Rate for State data
udbcState <- mutate(udbcState, BirthsPer1000Pop=Births / CENSUSPOPEstThousands)
# Show me
print(head(udbcState))
# Age Oriented
udbcAge <- subset(udbc,
select=c(
"Year.Code","Month.Code",
"Births",
"UnemploymentRate",
"Age.of.Mother"
)
)
colnames(udbcAge) <- c("Year", "Month", "Births", "UnemploymentRate", "Age.of.Mother")
udbcAge <- ddply(udbcAge,
.(Year,
Month,
Age.of.Mother),
summarise,
Births=sum(Births),
UnemploymentRate=mean(UnemploymentRate))
# Join census annuals and revert col names for consistency with State data.
udbcAge <- join(udbcAge, censusAnnual, by=c("Year"))
colnames(udbcAge) <- c("Year.Code",
"Month.Code",
"Age.of.Mother",
"Births",
"UnemploymentRate",
"CENSUSPOPEstThousands")
udbcAge <- mutate(udbcAge, BirthsPer1000Pop=Births / CENSUSPOPEstThousands)
print(head(udbcAge))
# Enable this bSave if you want to emit the massaged data as CSV
# This was used to pass the data over to a Python script for conversion to
# Google Data Table formated JSON files.
bSave <- FALSE
if(bSave)
{
dataFile <- sprintf("%s/LA-Natality-Census-Combined.csv", dataPath)
write.table(udbcState, dataFile,sep=",", row.names=FALSE)
dataFile <- sprintf("%s/LA-Natality-Census-Age-Combined.csv", dataPath)
write.table(udbcAge, dataFile,sep=",", row.names=FALSE)
}
}
## Year CENSUSPOPEstThousands
## 1 2003 290326.4
## 2 2004 293045.7
## 3 2005 295753.2
## 4 2006 298593.2
## 5 2007 301579.9
## 6 2008 304374.8
## State Year.Code Month.Code UnemploymentRate CENSUSPOPEstThousands
## 1 Wyoming 2003 10 4.1 499.189
## 2 Wyoming 2003 11 4.1 499.189
## 3 Wyoming 2003 12 4.1 499.189
## 4 Wyoming 2003 8 4.1 499.189
## 5 Wyoming 2003 9 4.1 499.189
## 6 Wyoming 2003 7 4.2 499.189
## Births BirthsPer1000Pop
## 1 542 1.085761
## 2 520 1.041690
## 3 549 1.099784
## 4 583 1.167894
## 5 561 1.123823
## 6 614 1.229995
## Year.Code Month.Code Age.of.Mother Births UnemploymentRate
## 1 2003 1 15-19 years 34928 5.505882
## 2 2003 1 20-24 years 84714 5.505882
## 3 2003 1 25-29 years 86521 5.505882
## 4 2003 1 30-34 years 76967 5.505882
## 5 2003 1 35-39 years 37346 5.505882
## 6 2003 1 40-44 years 8234 5.528000
## CENSUSPOPEstThousands BirthsPer1000Pop
## 1 290326.4 0.12030597
## 2 290326.4 0.29178881
## 3 290326.4 0.29801284
## 4 290326.4 0.26510505
## 5 290326.4 0.12863452
## 6 290326.4 0.02836118
The PanGviz Python package is used to convert the LA-Natality-xyz.csv files listed above into Google Data Table formated JSON.