Organize data and gapfill missing countries that have incomplete data. This is the first gapfilling step. The second step will involve gapfilling countries that have no data. This first level of gapfilling probably has very little error.
cats <- read.csv(file.path(dir_M, 'git-annex/globalprep/_raw_data/SocialProgressIndex/d2016/spi_categories.csv')) %>%
mutate(category = gsub(" ", "", category))
spi <- read.csv(file.path(dir_M, 'git-annex/globalprep/_raw_data/SocialProgressIndex/d2016/2016-Results_spi.csv'), check.names=FALSE, stringsAsFactors=FALSE)
names(spi) <- gsub(" ", "", names(spi))
spi <- spi %>%
dplyr::select(-CountryCode) %>%
gather('category', 'score', -Country) %>%
left_join(cats, by = "category")
Gapfill bhn component
[NOTE: There seems like there must be a better way to do this.]
bhn_tmp <- spi %>%
filter(subcategory == "bhn") %>%
spread(category, score)
## gapfill Watershed and Sanitation data
mod <- lm(WaterandSanitation ~ Shelter, data=bhn_tmp, na.action=na.exclude)
summary(mod)
##
## Call:
## lm(formula = WaterandSanitation ~ Shelter, data = bhn_tmp, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -46.039 -4.556 0.191 5.181 25.924
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.37068 2.75362 -3.04 0.00278 **
## Shelter 1.21274 0.03914 30.98 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.712 on 154 degrees of freedom
## (4 observations deleted due to missingness)
## Multiple R-squared: 0.8617, Adjusted R-squared: 0.8608
## F-statistic: 959.9 on 1 and 154 DF, p-value: < 2.2e-16
bhn_tmp$WaterandSanitation_pred <- predict(mod, newdata = select(bhn_tmp, Shelter))
bhn_tmp <- bhn_tmp %>%
mutate(gapfill = ifelse(is.na(WaterandSanitation) & !is.na(WaterandSanitation_pred), "WaterandSanitation_gf", NA)) %>%
mutate(WaterandSanitation = ifelse(is.na(WaterandSanitation), WaterandSanitation_pred, WaterandSanitation))
## gapfill Shelter data
mod <- lm(Shelter ~ WaterandSanitation, data=bhn_tmp, na.action=na.exclude)
summary(mod)
##
## Call:
## lm(formula = Shelter ~ WaterandSanitation, data = bhn_tmp, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.075 -3.879 0.208 3.869 34.703
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.27905 1.78092 8.579 9.29e-15 ***
## WaterandSanitation 0.71058 0.02286 31.083 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.41 on 155 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.8618, Adjusted R-squared: 0.8609
## F-statistic: 966.2 on 1 and 155 DF, p-value: < 2.2e-16
bhn_tmp$Shelter_pred <- predict(mod, newdata = select(bhn_tmp, WaterandSanitation))
bhn_tmp <- bhn_tmp %>%
mutate(gapfill = ifelse(is.na(Shelter) & !is.na(Shelter_pred), paste(bhn_tmp$gapfill, "Shelter_gf"), gapfill)) %>%
mutate(Shelter = ifelse(is.na(Shelter), Shelter_pred, Shelter))
## gapfill Nutrition and Basic Medical Care
mod <- lm(NutritionandBasicMedicalCare ~ WaterandSanitation + Shelter, data=bhn_tmp, na.action=na.exclude)
summary(mod)
##
## Call:
## lm(formula = NutritionandBasicMedicalCare ~ WaterandSanitation +
## Shelter, data = bhn_tmp, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.2332 -2.1777 0.0963 2.6544 14.0977
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 45.56510 1.71758 26.529 < 2e-16 ***
## WaterandSanitation 0.25936 0.04744 5.467 1.92e-07 ***
## Shelter 0.33963 0.06315 5.379 2.89e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.547 on 147 degrees of freedom
## (10 observations deleted due to missingness)
## Multiple R-squared: 0.8488, Adjusted R-squared: 0.8467
## F-statistic: 412.5 on 2 and 147 DF, p-value: < 2.2e-16
bhn_tmp$NutritionandBasicMedicalCare_pred <- predict(mod, newdata = select(bhn_tmp, WaterandSanitation, Shelter))
bhn_tmp <- bhn_tmp %>%
mutate(gapfill = ifelse(is.na(NutritionandBasicMedicalCare) & !is.na(NutritionandBasicMedicalCare_pred), paste(bhn_tmp$gapfill, "NutritionandBasicMedicalCare_gf"), gapfill)) %>%
mutate(NutritionandBasicMedicalCare = ifelse(is.na(NutritionandBasicMedicalCare), NutritionandBasicMedicalCare_pred, NutritionandBasicMedicalCare))
## gapfill personal safety
mod <- lm(PersonalSafety ~ WaterandSanitation + Shelter + NutritionandBasicMedicalCare, data=bhn_tmp, na.action=na.exclude)
summary(mod)
##
## Call:
## lm(formula = PersonalSafety ~ WaterandSanitation + Shelter +
## NutritionandBasicMedicalCare, data = bhn_tmp, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -37.849 -4.886 1.154 8.305 18.816
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 38.56168 8.75640 4.404 2.01e-05 ***
## WaterandSanitation -0.01827 0.10923 -0.167 0.86742
## Shelter 0.45499 0.14430 3.153 0.00195 **
## NutritionandBasicMedicalCare 0.01631 0.17749 0.092 0.92690
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.86 on 150 degrees of freedom
## (6 observations deleted due to missingness)
## Multiple R-squared: 0.3695, Adjusted R-squared: 0.3569
## F-statistic: 29.3 on 3 and 150 DF, p-value: 5.705e-15
bhn_tmp$PersonalSafety_pred <- predict(mod, newdata = select(bhn_tmp, WaterandSanitation, Shelter, NutritionandBasicMedicalCare))
bhn_tmp <- bhn_tmp %>%
mutate(gapfill = ifelse(is.na(PersonalSafety) & !is.na(PersonalSafety_pred), paste(bhn_tmp$gapfill, "PersonalSafety_gf"), gapfill)) %>%
mutate(PersonalSafety = ifelse(is.na(PersonalSafety), PersonalSafety_pred, PersonalSafety))
###
bhn_gf <- bhn_tmp %>%
mutate(gapfill = gsub("NA ", "", gapfill)) %>%
select(-ends_with("pred")) %>%
gather('category', 'score', -Country, -subcategory, -gapfill)
Gapfill fw component
fw_tmp <- spi %>%
filter(subcategory == "fw") %>%
spread(category, score)
## gapfill EnvironmentalQuality
mod <- lm(EnvironmentalQuality ~ AccesstoInformationandCommunications + HealthandWellness, data=fw_tmp, na.action=na.exclude)
summary(mod)
##
## Call:
## lm(formula = EnvironmentalQuality ~ AccesstoInformationandCommunications +
## HealthandWellness, data = fw_tmp, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30.500 -4.999 0.470 6.951 31.978
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.84789 5.54672 -3.218 0.00157
## AccesstoInformationandCommunications 0.59338 0.05085 11.670 < 2e-16
## HealthandWellness 0.64771 0.08926 7.257 1.75e-11
##
## (Intercept) **
## AccesstoInformationandCommunications ***
## HealthandWellness ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.667 on 156 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.6338, Adjusted R-squared: 0.6291
## F-statistic: 135 on 2 and 156 DF, p-value: < 2.2e-16
fw_tmp$EnvironmentalQuality_pred <- predict(mod, newdata = select(fw_tmp, AccesstoInformationandCommunications, HealthandWellness))
fw_tmp <- fw_tmp %>%
mutate(gapfill = ifelse(is.na(EnvironmentalQuality) & !is.na(EnvironmentalQuality_pred), "EnvironmentalQuality_gf", NA)) %>%
mutate(EnvironmentalQuality = ifelse(is.na(EnvironmentalQuality), EnvironmentalQuality_pred, EnvironmentalQuality))
## gapfill Access to Basic Knowledge
mod <- lm(AccesstoBasicKnowledge ~ AccesstoInformationandCommunications + HealthandWellness + EnvironmentalQuality,
data=fw_tmp, na.action=na.exclude)
summary(mod)
##
## Call:
## lm(formula = AccesstoBasicKnowledge ~ AccesstoInformationandCommunications +
## HealthandWellness + EnvironmentalQuality, data = fw_tmp,
## na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33.710 -4.850 -0.109 6.597 37.626
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.07838 7.17608 5.446 2.08e-07
## AccesstoInformationandCommunications 0.61979 0.08644 7.170 3.25e-11
## HealthandWellness -0.26308 0.13059 -2.015 0.04574
## EnvironmentalQuality 0.27256 0.09922 2.747 0.00676
##
## (Intercept) ***
## AccesstoInformationandCommunications ***
## HealthandWellness *
## EnvironmentalQuality **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.87 on 149 degrees of freedom
## (7 observations deleted due to missingness)
## Multiple R-squared: 0.5333, Adjusted R-squared: 0.5239
## F-statistic: 56.75 on 3 and 149 DF, p-value: < 2.2e-16
fw_tmp$AccesstoBasicKnowledge_pred <- predict(mod, newdata = select(fw_tmp, AccesstoInformationandCommunications, HealthandWellness, EnvironmentalQuality))
fw_tmp <- fw_tmp %>%
mutate(gapfill = ifelse(is.na(AccesstoBasicKnowledge) & !is.na(AccesstoBasicKnowledge_pred), paste(gapfill, "AccesstoBasicKnowledge_gf", sep=", "), gapfill)) %>%
mutate(AccesstoBasicKnowledge = ifelse(is.na(AccesstoBasicKnowledge), AccesstoBasicKnowledge_pred, AccesstoBasicKnowledge))
###
fw_gf <- fw_tmp %>%
mutate(gapfill = gsub("NA, ", "", gapfill)) %>%
select(-ends_with("pred")) %>%
gather('category', 'score', -Country, -subcategory, -gapfill)
Gapfill op component
op_tmp <- spi %>%
filter(subcategory == "op") %>%
spread(category, score)
## gapfill AccesstoAdvancedEducation
mod <- lm(AccesstoAdvancedEducation ~ PersonalRights + PersonalFreedomandChoice, data=op_tmp, na.action=na.exclude)
summary(mod)
##
## Call:
## lm(formula = AccesstoAdvancedEducation ~ PersonalRights + PersonalFreedomandChoice,
## data = op_tmp, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -45.359 -9.485 -0.018 9.400 46.780
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -29.47251 5.06333 -5.821 3.36e-08 ***
## PersonalRights 0.09203 0.05550 1.658 0.0993 .
## PersonalFreedomandChoice 1.05223 0.09809 10.727 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15 on 152 degrees of freedom
## (5 observations deleted due to missingness)
## Multiple R-squared: 0.5837, Adjusted R-squared: 0.5782
## F-statistic: 106.6 on 2 and 152 DF, p-value: < 2.2e-16
op_tmp$AccesstoAdvancedEducation_pred <- predict(mod, newdata = select(op_tmp, PersonalRights, PersonalFreedomandChoice))
op_tmp <- op_tmp %>%
mutate(gapfill = ifelse(is.na(AccesstoAdvancedEducation) & !is.na(AccesstoAdvancedEducation_pred), "AccesstoAdvancedEducation_gf", NA)) %>%
mutate(AccesstoAdvancedEducation = ifelse(is.na(AccesstoAdvancedEducation), AccesstoAdvancedEducation_pred, AccesstoAdvancedEducation))
## gapfill PersonalFreedomandChoice
mod <- lm(PersonalFreedomandChoice ~ AccesstoAdvancedEducation + PersonalRights, data=op_tmp, na.action=na.exclude)
summary(mod)
##
## Call:
## lm(formula = PersonalFreedomandChoice ~ AccesstoAdvancedEducation +
## PersonalRights, data = op_tmp, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.518 -6.478 0.992 6.448 26.550
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.20031 1.74812 21.280 < 2e-16 ***
## AccesstoAdvancedEducation 0.41037 0.03793 10.818 < 2e-16 ***
## PersonalRights 0.15036 0.03221 4.668 6.56e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.306 on 154 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.6294, Adjusted R-squared: 0.6246
## F-statistic: 130.8 on 2 and 154 DF, p-value: < 2.2e-16
op_tmp$PersonalFreedomandChoice_pred <- predict(mod, newdata = select(op_tmp, AccesstoAdvancedEducation, PersonalRights))
op_tmp <- op_tmp %>%
mutate(gapfill = ifelse(is.na(PersonalFreedomandChoice) & !is.na(PersonalFreedomandChoice_pred), paste(gapfill, "PersonalFreedomandChoice_gf", sep=", "), gapfill)) %>%
mutate(PersonalFreedomandChoice = ifelse(is.na(PersonalFreedomandChoice), PersonalFreedomandChoice_pred, PersonalFreedomandChoice))
## gapfill ToleranceandInclusion
mod <- lm(ToleranceandInclusion ~ AccesstoAdvancedEducation + PersonalFreedomandChoice + PersonalRights, data=op_tmp, na.action=na.exclude)
summary(mod)
##
## Call:
## lm(formula = ToleranceandInclusion ~ AccesstoAdvancedEducation +
## PersonalFreedomandChoice + PersonalRights, data = op_tmp,
## na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.0043 -6.4644 0.8742 6.8250 21.1301
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.58874 3.61295 2.377 0.0187 *
## AccesstoAdvancedEducation 0.01790 0.05219 0.343 0.7321
## PersonalFreedomandChoice 0.47803 0.08364 5.715 5.84e-08 ***
## PersonalRights 0.22884 0.03643 6.282 3.53e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.587 on 148 degrees of freedom
## (8 observations deleted due to missingness)
## Multiple R-squared: 0.6308, Adjusted R-squared: 0.6233
## F-statistic: 84.28 on 3 and 148 DF, p-value: < 2.2e-16
op_tmp$ToleranceandInclusion_pred <- predict(mod, newdata = select(op_tmp, AccesstoAdvancedEducation, PersonalFreedomandChoice, PersonalRights))
op_tmp <- op_tmp %>%
mutate(gapfill = ifelse(is.na(ToleranceandInclusion) & !is.na(ToleranceandInclusion_pred), paste(gapfill, "ToleranceandInclusion_gf", sep=", "), gapfill)) %>%
mutate(ToleranceandInclusion = ifelse(is.na(ToleranceandInclusion), ToleranceandInclusion_pred, ToleranceandInclusion))
###
op_gf <- op_tmp %>%
mutate(gapfill = gsub("NA, ", "", gapfill)) %>%
select(-ends_with("pred")) %>%
gather('category', 'score', -Country, -subcategory, -gapfill)
Combine data
data_subcategory <- rbind(bhn_gf, op_gf, fw_gf) %>%
group_by(Country, subcategory) %>%
summarize(score = mean(score),
gapfill = paste(unique(gapfill), collapse=", ")) %>%
mutate(gapfill = gsub("NA, ", "", gapfill)) %>%
mutate(gapfill = gsub("NA", NA, gapfill)) %>%
mutate(score = ifelse(score > 100, 100, score),
score = ifelse(score < 0, 0, score)) %>%
rename(score_gf = score) %>%
ungroup()
data_subcategory$subcategory <- mapvalues(data_subcategory$subcategory,
from=c("bhn", "fw", "op"),
to=c("bhn_score", "fw_score", "op_score"))
spi_subcategory <- spi %>%
filter(subcategory %in% c("bhn_score", "fw_score", "op_score")) %>%
left_join(data_subcategory, by=c("Country", "subcategory")) %>%
mutate(score = ifelse(is.na(score), score_gf, score)) %>%
select(Country, score, subcategory, gapfill)
## Warning in left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## factors with different levels, coercing to character vector
## Average subcategories to get spi score
spi_gf <- spi_subcategory %>%
group_by(Country) %>%
summarize(score_gf = mean(score),
gapfill = paste(unique(gapfill), collapse = ", ")) %>%
mutate(gapfill = gsub(", NA", "", gapfill)) %>%
mutate(gapfill = gsub("NA, ", "", gapfill)) %>%
mutate(gapfill = gsub("NA", NA, gapfill)) %>%
mutate(subcategory = "spi")
spi_only <- spi %>%
filter(subcategory == "spi") %>%
left_join(spi_gf, by = c("Country", "subcategory")) %>%
mutate(score = ifelse(is.na(score), score_gf, score)) %>%
select(Country, score, gapfill)
## Warning in left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## character vector and factor, coercing into character vector
write.csv(spi_only, "int/Country_spi.csv", row.names=FALSE)
5 Social Progress Index data
Organize data and gapfill missing countries that have incomplete data. This is the first gapfilling step. The second step will involve gapfilling countries that have no data. This first level of gapfilling probably has very little error.
5.1 Gapfill bhn component
[NOTE: There seems like there must be a better way to do this.]
5.2 Gapfill fw component
5.3 Gapfill op component
5.4 Combine data