[REFERENCE RMD FILE: https://cdn.rawgit.com/OHI-Science/ohiprep/master/globalprep/tr/v2016/tr_dataprep.html]
This document describes the steps needed to obtain the data used to calculate the tourism and recreation goal. This information is specific to the 2016 global assessment.
The general calculation is: tr = Ep * Sr * Tw and Xtr = tr/90th quantile across regions
Tw = A penalty applied to regions with travel warnings from the US State Department
Per capital GDP: (World Bank with gaps filled using CIA data), used to gapfill missing values in Tourism sustainability
Prior to the 2015 global assessment, we used World Bank data to estimate the total labor force and unemployment to calculate the proportion of jobs directly from tourism. In the 2015 assessment, we used the percentages directly from WEF. Consequently, we no longer need to collect these World Bank data. We attempted to see if missing WEF data could be estimated using these values, but this did not improve sample sizes.
We were able to update the following data for the 2016 assessment: * Proportion of jobs in tourism (data now goes to 2014) * Travel warnings for 2016 (downloaded: 7/16/2016)
There have been no updates to the tourism sustainability data. Consequently, we will use the data from last year.
Changes to travel warnings from the 2015 assessment: * Philippines warning changed to regional, which seemed more appropriate given the description * Data prior to 2014 (2012-2013) is just a duplicate of the 2014 data, which seemed more accurate than what we had (N. Korea not included, etc.) * Changed Israel to not be regional
#setwd('globalprep/tr/v2016') #comment out when knitting
# library(devtools)
# devtools::install_github("ohi-science/ohicore@dev")
library(ohicore)
## Warning: replacing previous import 'ggplot2::last_plot' by
## 'plotly::last_plot' when loading 'ohicore'
## Warning: replacing previous import 'git2r::config' by 'plotly::config' when
## loading 'ohicore'
## Warning: replacing previous import 'dplyr::failwith' by 'plyr::failwith'
## when loading 'ohicore'
## Warning: replacing previous import 'dplyr::id' by 'plyr::id' when loading
## 'ohicore'
## Warning: replacing previous import 'dplyr::summarize' by 'plyr::summarize'
## when loading 'ohicore'
## Warning: replacing previous import 'dplyr::count' by 'plyr::count' when
## loading 'ohicore'
## Warning: replacing previous import 'dplyr::desc' by 'plyr::desc' when
## loading 'ohicore'
## Warning: replacing previous import 'dplyr::mutate' by 'plyr::mutate' when
## loading 'ohicore'
## Warning: replacing previous import 'dplyr::arrange' by 'plyr::arrange' when
## loading 'ohicore'
## Warning: replacing previous import 'dplyr::rename' by 'plyr::rename' when
## loading 'ohicore'
## Warning: replacing previous import 'dplyr::summarise' by 'plyr::summarise'
## when loading 'ohicore'
## Warning: replacing previous import 'plyr::arrange' by 'dplyr::arrange' when
## loading 'ohicore'
## Warning: replacing previous import 'plyr::desc' by 'dplyr::desc' when
## loading 'ohicore'
## Warning: replacing previous import 'plyr::failwith' by 'dplyr::failwith'
## when loading 'ohicore'
## Warning: replacing previous import 'plyr::id' by 'dplyr::id' when loading
## 'ohicore'
## Warning: replacing previous import 'plyr::mutate' by 'dplyr::mutate' when
## loading 'ohicore'
## Warning: replacing previous import 'plyr::summarise' by 'dplyr::summarise'
## when loading 'ohicore'
## Warning: replacing previous import 'plyr::summarize' by 'dplyr::summarize'
## when loading 'ohicore'
source('../../../src/R/common.R')
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
## maximum year of wttc data:
year_max <- 2014
source('R/tr_fxns.R')
These data are from the World Travel & Tourism Council (http://www.wttc.org/). We use “direct” employment data (eee mazu: globalprep/_raw_data/WTTC/README.md for instructions on obtaining data).
These data are cleaned and formatted using the R/process_WTTC.R script. Missing values are gapfilled using the UN georegion information.
## describe where the raw data are located:
dir_wttc <- file.path(dir_M, 'git-annex/globalprep/_raw_data/WTTC/d2016/raw')
## processing script that formats the WTTC for OHI, saves the following: intermediate/wttc_empd_rgn
source('R/process_WTTC.R', local = TRUE)
## Warning in left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## factor and character vector, coercing into character vector
##
## These data were removed for not having any match in the lookup tables:
##
## Other Oceania
## 1
##
## These data were removed for not being of the proper rgn_type (eez,ohi_region) or mismatching region names in the lookup tables:
## tmp_type
## tmp_name landlocked
## Armenia 39
## Austria 39
## Belarus 39
## Bolivia 39
## Botswana 39
## Burkina Faso 39
## Burundi 39
## Central African Republic 39
## Chad 39
## Czech Republic 39
## Ethiopia 39
## Hungary 39
## Kazakhstan 39
## Kyrgyzstan 39
## Laos 39
## Lesotho 39
## Luxembourg 39
## Macedonia 39
## Malawi 39
## Mali 39
## Moldova 39
## Mongolia 39
## Nepal 39
## Niger 39
## Paraguay 39
## Rwanda 39
## Serbia 39
## Slovakia 39
## Swaziland 39
## Switzerland 39
## Uganda 39
## Uzbekistan 39
## Zambia 39
## Zimbabwe 39
##
## DUPLICATES found. Consider using collapse2rgn to collapse duplicates (function in progress).
## [1] "Guadeloupe and Martinique"
## [2] "Puerto Rico and Virgin Islands of the United States"
## [3] "China"
## read in the dataset created by above function:
tr_jobs_pct_tour <- read.csv('intermediate/wttc_empd_rgn.csv', stringsAsFactors = FALSE) %>%
select(rgn_id, year, jobs_pct)
## format data to have complete years/regions and convert percentage of jobs to proportion of jobs
rgn_names <- read.csv('../../../../ohi-global/eez2013/layers/rgn_global.csv', stringsAsFactors = FALSE) %>%
rename(rgn_name = label)
rgn_names <- rgn_names %>%
left_join(data.frame(rgn_id = rep(1:max(rgn_names$rgn_id), each = 25),
year = rep(c((year_max-24):year_max), max(rgn_names$rgn_id))),
by = 'rgn_id')
tr_data_raw <- rgn_names %>%
full_join(tr_jobs_pct_tour %>%
rename(Ep = jobs_pct) %>%
mutate(Ep = Ep/100,
Ep = ifelse(Ep > 1, NA, Ep)),
by = c('rgn_id', 'year'))
## gapfill missing data using UN georegion data:
georegions <- georegions
georegion_labels <- georegion_labels
tr_data_raw <- tr_data_raw %>%
left_join(georegions, by = 'rgn_id') %>%
left_join(georegion_labels, by = 'rgn_id') %>%
select(-r0) %>%
filter(rgn_id != c(255, 213)) # ditch disputed regions and Antarctica
tr_data_raw_gf <- tr_data_raw %>%
group_by(year, r2) %>%
mutate(Ep_pred_r2 = mean(Ep, na.rm=TRUE)) %>%
ungroup() %>%
group_by(year, r1) %>%
mutate(Ep_pred_r1 = mean(Ep, na.rm=TRUE)) %>%
ungroup()
tr_data_raw_gf <- tr_data_raw_gf %>%
mutate(Ep_all = ifelse(is.na(Ep), Ep_pred_r2, Ep)) %>%
mutate(Ep_all = ifelse(is.na(Ep_all), Ep_pred_r1, Ep_all)) %>%
mutate(gapfilled = ifelse(is.na(Ep) & !is.na(Ep_all), "gapfilled", NA)) %>%
mutate(method = ifelse(is.na(Ep) & !is.na(Ep_pred_r2), "UN georegion (r2)", NA)) %>%
mutate(method = ifelse(is.na(Ep) & is.na(Ep_pred_r2) & !is.na(Ep_pred_r1), "UN georegion (r1)", method))
tr_data_gf <- tr_data_raw_gf %>%
select(rgn_id, year, gapfilled, method)
write.csv(tr_data_gf, "output/tr_jobs_pct_tourism_gf.csv", row.names=FALSE)
tr_data <- tr_data_raw_gf %>%
select(rgn_id, year, Ep=Ep_all)
write.csv(tr_data, "output/tr_jobs_pct_tourism.csv", row.names=FALSE)
## A quick check to make sure last year's values aren't too crazy different
## (NOTE: the source data has been updated, so there are some changes, but they shouldn't be super different)
old <- read.csv('../v2015/data/tr_jobs_pct_tourism.csv')%>%
select(rgn_id, year, ep_old=Ep)
new <- read.csv('output/tr_jobs_pct_tourism.csv') %>%
left_join(old) %>%
filter(year==2013) %>%
arrange(ep_old)
## Joining, by = c("rgn_id", "year")
new
## rgn_id year Ep ep_old
## 1 17 2013 0.00469071 0.00608689
## 2 198 2013 0.00773699 0.00805770
## 3 199 2013 0.00538946 0.00892312
## 4 168 2013 0.00910298 0.00925343
## 5 73 2013 0.01277540 0.01367920
## 6 196 2013 0.01606630 0.01453440
## 7 194 2013 0.01472130 0.01556280
## 8 205 2013 0.01555510 0.01565610
## 9 204 2013 0.01632100 0.01632460
## 10 137 2013 0.01765480 0.01650890
## 11 210 2013 0.01745670 0.01758510
## 12 191 2013 0.01760970 0.01775160
## 13 189 2013 0.01626900 0.01796500
## 14 200 2013 0.01983080 0.01817790
## 15 51 2013 0.01923140 0.01831000
## 16 178 2013 0.01795780 0.01852260
## 17 75 2013 0.01837160 0.01912140
## 18 99 2013 0.02124140 0.01965570
## 19 195 2013 0.02075650 0.01969560
## 20 192 2013 0.01936960 0.01987200
## 21 96 2013 0.02241600 0.02037550
## 22 181 2013 0.02090190 0.02146190
## 23 76 2013 0.02228110 0.02159070
## 24 41 2013 0.02383790 0.02165850
## 25 174 2013 0.02127680 0.02167870
## 26 106 2013 0.02390250 0.02185680
## 27 67 2013 0.02549210 0.02238620
## 28 28 2013 0.03665450 0.02288570
## 29 72 2013 0.02015150 0.02293480
## 30 132 2013 0.02298480 0.02344790
## 31 112 2013 0.02305120 0.02354920
## 32 138 2013 0.02286300 0.02363470
## 33 20 2013 0.02219760 0.02364220
## 34 197 2013 0.02733740 0.02368190
## 35 98 2013 0.03339890 0.02403550
## 36 59 2013 0.02461100 0.02404320
## 37 14 2013 0.02232680 0.02411000
## 38 53 2013 0.02519030 0.02416740
## 39 79 2013 0.02462880 0.02457840
## 40 247 2013 0.02352170 0.02482260
## 41 100 2013 0.01661910 0.02499240
## 42 104 2013 0.02404706 0.02499240
## 43 21 2013 0.02399409 0.02504040
## 44 218 2013 0.03856950 0.02522000
## 45 169 2013 0.02545990 0.02629350
## 46 48 2013 0.02719250 0.02641080
## 47 114 2013 0.02508200 0.02754480
## 48 216 2013 0.02657980 0.02758830
## 49 139 2013 0.02767650 0.02800160
## 50 175 2013 0.02859020 0.02802570
## 51 173 2013 0.02667500 0.02894440
## 52 136 2013 0.02969760 0.02918330
## 53 69 2013 0.03608580 0.02942750
## 54 84 2013 0.02978090 0.02949170
## 55 171 2013 0.02627020 0.03012710
## 56 47 2013 0.03056320 0.03047320
## 57 224 2013 0.03083110 0.03096900
## 58 77 2013 0.03136560 0.03097200
## 59 219 2013 0.03834020 0.03102940
## 60 32 2013 0.04542070 0.03161170
## 61 140 2013 0.03090740 0.03225730
## 62 15 2013 0.03261790 0.03297880
## 63 134 2013 0.03511850 0.03361370
## 64 71 2013 0.03387690 0.03408280
## 65 209 2013 0.03399525 0.03482440
## 66 70 2013 0.03775600 0.03486730
## 67 85 2013 0.03545494 0.03487700
## 68 86 2013 0.03545494 0.03487700
## 69 88 2013 0.03545494 0.03487700
## 70 95 2013 0.03545494 0.03487700
## 71 141 2013 0.03545494 0.03487700
## 72 144 2013 0.03545494 0.03487700
## 73 145 2013 0.03545494 0.03487700
## 74 227 2013 0.03545494 0.03487700
## 75 228 2013 0.03545494 0.03487700
## 76 222 2013 0.03491400 0.03500150
## 77 131 2013 0.03626120 0.03519780
## 78 167 2013 0.03455130 0.03524560
## 79 116 2013 0.03812518 0.03566390
## 80 207 2013 0.04386160 0.03607260
## 81 43 2013 0.03728190 0.03634160
## 82 172 2013 0.03598930 0.03680230
## 83 163 2013 0.03811090 0.03683870
## 84 40 2013 0.03436820 0.03691670
## 85 190 2013 0.04294920 0.03753070
## 86 7 2013 0.02914820 0.03782220
## 87 188 2013 0.03907250 0.03951660
## 88 64 2013 0.04268039 0.03987920
## 89 97 2013 0.04268039 0.03987920
## 90 193 2013 0.04268039 0.03987920
## 91 52 2013 0.04280800 0.04042260
## 92 179 2013 0.04317530 0.04165030
## 93 115 2013 0.04370990 0.04187470
## 94 66 2013 0.04440960 0.04204530
## 95 208 2013 0.04280870 0.04280870
## 96 126 2013 0.04620690 0.04305710
## 97 16 2013 0.04377100 0.04328000
## 98 1 2013 0.05999590 0.04330900
## 99 2 2013 0.05999590 0.04330900
## 100 3 2013 0.05999590 0.04330900
## 101 4 2013 0.05999590 0.04330900
## 102 162 2013 0.07622080 0.04333800
## 103 102 2013 0.04400600 0.04358210
## 104 202 2013 0.03103390 0.04428530
## 105 42 2013 0.03263700 0.04467930
## 106 101 2013 0.03686920 0.04477090
## 107 130 2013 0.04287750 0.04574880
## 108 26 2013 0.04551476 0.04606080
## 109 231 2013 0.04551476 0.04606080
## 110 237 2013 0.04551476 0.04606080
## 111 184 2013 0.04753020 0.04733340
## 112 49 2013 0.02001500 0.04820450
## 113 63 2013 0.04258380 0.04820450
## 114 185 2013 0.04886352 0.04829050
## 115 74 2013 0.04784300 0.04859590
## 116 215 2013 0.04639170 0.04873200
## 117 182 2013 0.04991360 0.04915910
## 118 223 2013 0.05254770 0.05015340
## 119 127 2013 0.05724980 0.05037140
## 120 82 2013 0.05246310 0.05106670
## 121 214 2013 0.04365290 0.05107720
## 122 133 2013 0.04742700 0.05115190
## 123 54 2013 0.05466350 0.05245590
## 124 143 2013 0.05305210 0.05331220
## 125 206 2013 0.04777690 0.05367200
## 126 203 2013 0.05443110 0.05443110
## 127 180 2013 0.05315590 0.05687710
## 128 107 2013 0.05818650 0.05842420
## 129 19 2013 0.05986320 0.05932580
## 130 146 2013 0.05986320 0.05932580
## 131 147 2013 0.05986320 0.05932580
## 132 151 2013 0.05986320 0.05932580
## 133 152 2013 0.05986320 0.05932580
## 134 153 2013 0.05986320 0.05932580
## 135 154 2013 0.05986320 0.05932580
## 136 155 2013 0.05986320 0.05932580
## 137 156 2013 0.05986320 0.05932580
## 138 161 2013 0.05986320 0.05932580
## 139 125 2013 0.05974610 0.05962080
## 140 50 2013 0.05177740 0.06004000
## 141 177 2013 0.05882810 0.06064930
## 142 119 2013 0.06346380 0.06307240
## 143 61 2013 0.06515590 0.06556270
## 144 25 2013 0.06423310 0.06584020
## 145 103 2013 0.06736860 0.06612130
## 146 176 2013 0.06883970 0.06681910
## 147 29 2013 0.07513249 0.06817660
## 148 30 2013 0.07513249 0.06817660
## 149 33 2013 0.07513249 0.06817660
## 150 34 2013 0.07513249 0.06817660
## 151 35 2013 0.07513249 0.06817660
## 152 36 2013 0.07513249 0.06817660
## 153 44 2013 0.07513249 0.06817660
## 154 45 2013 0.07513249 0.06817660
## 155 46 2013 0.07513249 0.06817660
## 156 78 2013 0.07460140 0.06928930
## 157 183 2013 0.07167030 0.06972240
## 158 65 2013 0.07432040 0.07135150
## 159 166 2013 0.07011520 0.07157140
## 160 135 2013 0.07211530 0.07177760
## 161 62 2013 0.07140600 0.07250490
## 162 123 2013 0.08772410 0.07297900
## 163 129 2013 0.07237490 0.07327030
## 164 38 2013 0.07112072 0.07844790
## 165 81 2013 0.07251660 0.07881280
## 166 55 2013 0.07339661 0.07927020
## 167 57 2013 0.07339661 0.07927020
## 168 58 2013 0.07339661 0.07927020
## 169 60 2013 0.07339661 0.07927020
## 170 232 2013 0.02952860 0.07927020
## 171 8 2013 0.06734600 0.08080710
## 172 9 2013 0.06734600 0.08080710
## 173 10 2013 0.06734600 0.08080710
## 174 11 2013 0.06734600 0.08080710
## 175 12 2013 0.06734600 0.08080710
## 176 13 2013 0.06734600 0.08080710
## 177 148 2013 0.06734600 0.08080710
## 178 149 2013 0.06734600 0.08080710
## 179 150 2013 0.06734600 0.08080710
## 180 157 2013 0.06734600 0.08080710
## 181 158 2013 0.06734600 0.08080710
## 182 159 2013 0.06734600 0.08080710
## 183 212 2013 0.06734600 0.08080710
## 184 5 2013 0.08312548 0.08084130
## 185 186 2013 0.09273290 0.08461030
## 186 113 2013 0.08827680 0.08942090
## 187 80 2013 0.10516700 0.08991690
## 188 108 2013 0.08948380 0.10713000
## 189 37 2013 0.11239000 0.10736900
## 190 124 2013 0.10618400 0.11131200
## 191 24 2013 0.11267800 0.11510800
## 192 111 2013 0.12227357 0.11612300
## 193 121 2013 0.12227357 0.11612300
## 194 220 2013 0.14460100 0.11612300
## 195 221 2013 0.12227357 0.11612300
## 196 244 2013 0.14460100 0.11612300
## 197 245 2013 0.14460100 0.11612300
## 198 248 2013 0.14460100 0.11612300
## 199 249 2013 0.14460100 0.11612300
## 200 18 2013 0.13220100 0.12419300
## 201 164 2013 0.12962000 0.12745000
## 202 187 2013 0.09410090 0.13131200
## 203 56 2013 0.15557100 0.14967900
## 204 68 2013 0.15178700 0.15079400
## 205 6 2013 0.16646200 0.15526300
## 206 120 2013 0.16729900 0.16312700
## 207 122 2013 0.19488200 0.19134100
## 208 118 2013 0.21427600 0.21595300
## 209 31 2013 0.28180400 0.23658200
## 210 110 2013 0.27341600 0.26911000
## 211 250 2013 0.30937400 0.31300700
## 212 39 2013 0.27880400 0.32109600
## 213 117 2013 0.34526100 0.34050600
## 214 89 2013 NA NaN
## 215 90 2013 NA NaN
## 216 91 2013 NA NaN
## 217 92 2013 NA NaN
## 218 93 2013 NA NaN
## 219 94 2013 NA NaN
## 220 105 2013 NA NaN
## 221 213 2013 NA NaN
## NOTE: This looks reasonable to me.
Information is from the U.S. State Department (https://travel.state.gov/content/passports/en/alertswarnings.html)
Add the data to tr_travelwarnings_xxx.xls Most of the data can be cut and paste (after it is checked) from the previous year (update the data and year information).
If different regions have different warnings, these are put on two lines and combined in the R script:
assess_year date rgn_name rgn_name_full 2016 4-Feb-15 Cameroon Cameroon risk
2016 4-Feb-15 Cameroon Cameroon (North and Far North region) avoid_all regional
inform: information travelor should be aware of (election violence, be aware due to crime, etc) risk: risks that trevelors should be aware of (“consider carefully risks of travel”, “warns of risks”) avoid_nonessential travel: “defer non-essential travel” avoid_all: “avoid all travel” gtfo: get out!!!
regional: added if the warning only applies to specific regions
The following code is used to clean these data and transform the warnings into a multiplier that is used to calculate tourism and recreation scores:
scores = data.frame(category = c("inform", "risk", "avoid_nonessential", "avoid_all", "gtfo"),
score = c(0, 0.25, 0.75, 1, 1))
warn <- read.csv('raw/tr_travelwarnings_2016.csv') %>%
select(year = assess_year, rgn_name, inform, risk, avoid_nonessential, avoid_all, gtfo, regional) %>%
gather("category", "n", 3:7) %>%
filter(!is.na(n)) %>%
select(-n) %>%
left_join(scores, by="category") %>%
group_by(year, rgn_name) %>%
mutate(regions = n())
## Warning in left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## factor and character vector, coercing into character vector
warn2 <- warn %>%
mutate(score = ifelse(regions %in% 1 & regional %in% 1, score*0.5, score)) %>%
summarize(score = mean(score)) %>%
mutate(multiplier = 1-score) %>%
select(year, rgn_name, multiplier) %>%
data.frame()
data.frame(filter(warn2, year==2015)) # checked to make sure I got conversions correct, looks good!
## year rgn_name multiplier
## 1 2015 Algeria 0.500
## 2 2015 Cameroon 0.375
## 3 2015 Colombia 1.000
## 4 2015 Democratic Republic of the Congo 0.750
## 5 2015 Djibouti 0.750
## 6 2015 El Salvador 0.750
## 7 2015 Eritrea 0.750
## 8 2015 Haiti 1.000
## 9 2015 Honduras 0.750
## 10 2015 Iran 0.750
## 11 2015 Iraq 0.250
## 12 2015 Israel 0.750
## 13 2015 Kenya 0.750
## 14 2015 Lebanon 0.000
## 15 2015 Liberia 0.250
## 16 2015 Libya 0.000
## 17 2015 Mauritania 0.750
## 18 2015 Mexico 0.875
## 19 2015 Nepal 0.750
## 20 2015 Nigeria 0.375
## 21 2015 North Korea 0.000
## 22 2015 Pakistan 0.250
## 23 2015 Philippines 0.625
## 24 2015 Saudi Arabia 0.750
## 25 2015 Sierra Leone 0.250
## 26 2015 Somalia 0.000
## 27 2015 Sudan 0.375
## 28 2015 Syria 0.000
## 29 2015 Ukraine 0.750
## 30 2015 Venezuela 1.000
## 31 2015 Yemen 0.000
warn_rgn <- name_2_rgn(df_in = warn2,
fld_name='rgn_name',
flds_unique=c('rgn_name','year'))
##
## These data were removed for not being of the proper rgn_type (eez,ohi_region) or mismatching region names in the lookup tables:
## tmp_type
## tmp_name landlocked
## Nepal 4
warn_rgn <- warn_rgn %>%
select(rgn_id, year, multiplier)
write.csv(warn_rgn, 'output/tr_travelwarnings.csv', row.names=FALSE)
These data are from the World Economic Forum’s “Travel and Tourism Competitiveness Report” (http://reports.weforum.org/travel-and-tourism-competitiveness-report-2015/downloads/) See mazu: _raw_data/WEF-Economics/ for more details and the raw data.
As of 7/15/2016: no new data for 2016
Only one year of data are available so the same data is used for all years of analysis.
These data are gapfilled using gdppcppp and UN georegion information (see next section for obtaining and preparing these data).
The WEF data are processed with R/process_WEF.R, which creates the file: intermediate/wef_tcci_2015.csv. NOTE: This script has not been updated because no new data were available. The script will need to be modified in the future.
## describe location of raw data:
dir_wef <- file.path(dir_M, 'git-annex/globalprep/_raw_data/WEF-Economics/d2015/raw/WEF_TTCR_Dataset_2015.csv')
## processing the script (NOTE: needs to be updated to run correctly)
source(file.path(dir_git, 'R/process_WEF.R'), local = TRUE)
tr_sust <- read.csv('intermediate/wef_tcci_2015', stringsAsFactors = FALSE) %>%
select(rgn_id, score)
write_csv(tr_sust, 'intermediate/tr_pregap_sustainability.csv', row.names=FALSE)
These data should be compared to the 2015 data to make sure they are similar. #### Preparing the gdppcppp data: These data are used to gapfill missing values in tourism sustainability. Most of the data are from the World Bank, but CIA data fill some gaps (CIA data is available for only the most recent year).
Download the World Bank ppppcgdp data (see mazu: globalprep/_raw_data/WorldBank/d2016 README.md for information).
The data are prepared using the following script (NOTE: there is extra code for additional WorldBank files, but these are no longer needed, but the code may need to be modified to accomomodate this).
Data Downloaded: 7/20/2016
All of the following scripts should be carefully checked. Because these data aren’t used this year, this is only for guidance in future efforts.
## describe location of raw data:
dir_wb <- file.path(dir_M, 'git-annex/globalprep/_raw_data/WorldBank/d2016/raw')
## get list of files
wb_file_list <- list.files(path = dir_wb, pattern=glob2rx('*csv'), full.names = TRUE)
# saves the following file: wb_rgn_GDPPCPPP.csv
source(file.path(dir_git, 'R/process_WorldBank.R'), local = TRUE)
The CIA data are used to fill in missing gaps (https://www.cia.gov/library/publications/the-world-factbook/rankorder/2004rank.html)
The following code is used to prepare these data for OHI:
cia_gdp <- read.csv('raw/cia_gdp_pc_ppp.csv', stringsAsFactors = FALSE)
splits <- data.frame(Country = "Saint Helena, Ascension, and Tristan da Cunha", Country2 = c("Saint Helena",
"Ascension",
"Tristan da Cunha")) %>%
mutate(Country = as.character(Country),
Country2 = as.character(Country2))
cia_gdp <- cia_gdp %>%
left_join(splits, by='Country') %>%
mutate(Country2 = ifelse(is.na(Country2), Country, Country2)) %>%
mutate(year = year_max) %>%
select(Country=Country2, year, pcgdp_cia = gdppcppp)
cia_gdp_rgn <- name_2_rgn(df_in = cia_gdp,
fld_name='Country')
## population weighted average of duplicate regions:
pop <- read.csv(file.path(dir_int, 'wb_country_total_pop.csv')) %>%
filter(year==2014) %>%
select(Country=country, year, w_popn)
cia_gdp_rgn <- cia_gdp_rgn %>%
left_join(pop, by=c("Country", "year")) %>%
group_by(rgn_id, year) %>%
summarize(pcgdp_cia = weighted.mean(pcgdp_cia, w_popn, na.rm=TRUE))
cia_gdp_rgn <- cia_gdp_rgn %>%
select(rgn_id, pcgdp_cia)
write.csv(cia_gdp_rgn, "intermediate/wb_rgn_cia_GDPPCPPP.csv", row.names=FALSE)
The following code combines the two gdp datasets and gapfills missing regions using UN georegions.
### world bank gdp data
gdppcppp <- read.csv('intermediate/wb_rgn_GDPPCPPP.csv') %>%
select(rgn_id, year, pcgdp = intl_dollar) %>%
filter(year == year_max) %>% ## only use one year of data
select(rgn_id, pcgdp)
### cia gdp data
gdppcppp2 <- read.csv('intermediate/wb_rgn_cia_GDPPCPPP.csv')
### Use WB data, but if missing, use pcgdp_cia.
### combine with UN georegion data
gdp_raw <- georegions %>%
left_join(georegion_labels, by = 'rgn_id') %>%
left_join(gdppcppp, by = c('rgn_id')) %>%
left_join(gdppcppp2, by = "rgn_id") %>%
mutate(pcgdp2 = ifelse(is.na(pcgdp), pcgdp_cia, pcgdp))
gdp_raw <- gdp_raw %>%
group_by(r2) %>%
mutate(gdp_pred_r2 = mean(pcgdp2, na.rm=TRUE)) %>%
ungroup() %>%
group_by(r1) %>%
mutate(gdp_pred_r1 = mean(pcgdp2, na.rm=TRUE)) %>%
ungroup()
gdp_raw_gf <- gdp_raw %>%
mutate(gdp_all = ifelse(is.na(pcgdp2), gdp_pred_r2, pcgdp2)) %>%
mutate(gdp_all = ifelse(is.na(gdp_all), gdp_pred_r1, gdp_all)) %>%
mutate(gapfilled = ifelse(is.na(pcgdp2) & !is.na(gdp_all), "gapfilled", NA)) %>%
mutate(method = ifelse(is.na(pcgdp2) & !is.na(gdp_pred_r2), "UN georegion (r2)", NA)) %>%
mutate(method = ifelse(is.na(pcgdp2) & is.na(gdp_pred_r2) & !is.na(gdp_pred_r1), "UN georegion (r1)", method))
tr_data_gf <- tr_data_raw_gf %>%
select(rgn_id, gapfilled, method)
write.csv(tr_data_gf, "intermediate/gdp_gf.csv", row.names=FALSE)
tr_data <- gdp_raw_gf %>%
select(rgn_id, pcgdp = gdp_all)
write.csv(tr_data, "intermediate/gdp.csv", row.names=FALSE)
The final step is gapfilling the Sustainability data (code is rough and needs to be modified, there are also some functions in tr_fxns.R that are helpful):
tr_sust <- read.csv('globalprep/tr/v2015/intermediate/tr_pregap_sustainability.csv', stringsAsFactors = FALSE)
rgn_names <- read.csv('../ohi-global/eez2013/layers/rgn_global.csv', stringsAsFactors = FALSE) %>%
rename(rgn_name = label)
tr_sust <- rgn_names %>%
left_join(tr_sust) %>%
rename(S_score = score)
### don't need to gapfill data without tourism data:
ep_gf <- ep %>%
filter(year==2014) %>%
select(rgn_id, Ep) %>%
filter(!is.na(Ep))
tr_sust <- tr_sust %>%
left_join(ep_gf, by="rgn_id")
### Add gapfill flag variable
tr_sust_gf <- tr_sust %>%
mutate(gapfilled = ifelse(is.na(S_score) & !is.na(Ep), "gapfilled", NA)) %>%
mutate(method = ifelse(is.na(S_score) & !is.na(Ep) & is.na(pcgdp2), "lm georegion + gdppcppp, with est. gdppcppp", NA)) %>%
mutate(method = ifelse(is.na(S_score) & !is.na(Ep) & !is.na(pcgdp2), "lm georegion + gdppcppp", method)) %>%
select(rgn_id, gapfilled, method)
write.csv(tr_sust_gf, "globalprep/tr/v2016/output/tr_sustainability_gf.csv", row.names=FALSE)
##############################################################################=
### Gapfilling ----
##############################################################################=
### Gapfill S using r1 and/or r2 regional data and PPP-adjusted per-capita GDP
mod3 <- lm(S_score ~ r2 + gdppcppp, data=tr_sust)
summary(mod3)
anova(mod3)
for (i in 1:dim(tr_sust)[1]){ #i=85
tt <- tryCatch(predict(mod3, newdata=tr_sust[i,]),error=function(e) e, warning=function(w) w)
if(is(tt, "error")){
tr_sust$S_score_pred1[i] <-NA
} else {
tr_sust$S_score_pred1[i] <- predict(mod3, newdata=tr_sust[i,])
}
}
mod4 <- lm(S_score ~ r1 + gdppcppp, data=tr_sust)
summary(mod4)
anova(mod4)
for (i in 1:dim(tr_sust)[1]){ #i=85
tt <- tryCatch(predict(mod4, newdata=tr_sust[i,]),error=function(e) e, warning=function(w) w)
if(is(tt, "error")){
tr_sust$S_score_pred2[i] <-NA
} else {
tr_sust$S_score_pred2[i] <- predict(mod4, newdata=tr_sust[i,])
}
}
tr_sust <- tr_sust %>%
mutate(S_score_2 = ifelse(is.na(S_score), S_score_pred1, S_score)) %>%
mutate(S_score_2 = ifelse(is.na(S_score_2), S_score_pred2, S_score_2)) %>%
select(rgn_id, S_score=S_score2)
### slight difference in values from Casey's due to gapfilling - but basically the same...just go with last year's data for now
### Could be rounding....also, Casey's method accidentally skips the factor that serves as the intercept