1 Data Processing

Here we read in the data and do some name and format manipulation.

Note, the abbreviation DSM denotes “Data Sharing Mark” and OAM denotes “Open Access Mark.”

# Read the journal-level data from the "Data" sheet of the workbook.
# Cells containing the text "N/A" are read as missing values.
exceldat <- read_excel(
  "../Data-Sharing-Policies_2017-01-25.xlsx",
  sheet = "Data",
  na = "N/A"
)
jdata <- exceldat[-1, ] # remove line of info
# Drop rows that have no journal name. A logical mask is used instead of
# `-which(is.na(...))`: when no Journal is missing, `which()` returns
# integer(0) and negative indexing with it would silently drop EVERY row.
jdata <- jdata[!is.na(jdata$Journal), ]

# Descriptions for the Data Sharing Mark (DSM); used below as factor labels
# for the numeric DSM values 1 to 6, in this order.
dsm_labels <- c(
  "Required as condition of publication, barring exceptions",
  "Required but no explicit statement regarding effect on publication/editorial decisions",
  "Explicitly encouraged/addressed, but not required.",
  "Mentioned indirectly",
  "Only protein, proteomic, and/or genomic data sharing are addressed.",
  "No mention"
)

# Descriptions for the Recommended Sharing Method; used below as factor
# labels for the codes "A" to "E", in this order.
rsm_labels <- c(
  "Public Online Repository",
  "Journal Hosted",
  "By Reader Request to Authors",
  "Multiple methods equally recommended",
  "Unspecified"
)

# Descriptions for the journal-hosting size guideline; used below as factor
# labels for the codes "a" to "c", in this order.
journal_host_labels <- c(
  "Journal will host regardless of size",
  "Journal has data hosting file/s size limit",
  "Unspecified"
)

# Fix typos in the single-letter mark columns.
# Some cells carry a trailing newline after the letter:
jdata[["Copyright/Licensing Mark"]][
  jdata[["Copyright/Licensing Mark"]] == "b\n"
] <- "b"
jdata[["Reproducibility Noted Mark"]][
  jdata[["Reproducibility Noted Mark"]] == "a\n"
] <- "a"
# ... and one has a trailing space after the letter:
jdata[["Archival/Retention Mark"]][
  jdata[["Archival/Retention Mark"]] == "a "
] <- "a"
# The omics column has both kinds of stray character, for both letters:
jdata[["Protein, Proteomic, Genomic, or Microaray (Sequence or Structural) Data Sharing Addressed/Required with Deposit to Specific Data Banks"]][
  jdata[["Protein, Proteomic, Genomic, or Microaray (Sequence or Structural) Data Sharing Addressed/Required with Deposit to Specific Data Banks"]] %in% c("a\n", "a ")
] <- "a"
jdata[["Protein, Proteomic, Genomic, or Microaray (Sequence or Structural) Data Sharing Addressed/Required with Deposit to Specific Data Banks"]][
  jdata[["Protein, Proteomic, Genomic, or Microaray (Sequence or Structural) Data Sharing Addressed/Required with Deposit to Specific Data Banks"]] %in% c("b\n", "b ")
] <- "b"

# Add short, analysis-friendly columns derived from the spreadsheet headers.
# The order in which columns are created matters: later code selects
# positional ranges such as `dsm:if_2015` and `if_2013:if_2015`.
jdata <- jdata %>%
  mutate(
    # NOTE(review): the 2013 header is written with braces, unlike the
    # 2014/2015 headers -- confirm it matches the spreadsheet.
    citable_2013 = `{2013} Citable Articles`,
    citable_2014 = `2014 Citable Items`,
    citable_2015 = `2015 Citable Articles`,
    # Data Sharing Mark: numeric value, labelled factor, and a collapsed
    # indicator (1 when DSM is 1 or 2, 0 otherwise)
    dsm = as.numeric(`Data Sharing Mark`),
    dsm_fac = factor(dsm, levels = 1:6, labels = dsm_labels),
    dsm2 = 1 * (dsm < 3),
    dsm2_fac = factor(
      dsm2,
      levels = 0:1,
      labels = c("Not Required", "Required")
    ),
    # reverse levels for plotting only
    dsm2_fac_flip = factor(
      dsm2,
      levels = c(1, 0),
      labels = c("Required", "Not Required")
    ),
    # Open Access Mark
    oam = as.numeric(`Open Access Mark`),
    oam_fac = factor(
      oam,
      levels = 0:1,
      labels = c("Subscription", "Open Access")
    ),
    if_2013 = `2013 Impact Factor`,
    if_2014 = `2014 Impact Factor`,
    if_2015 = `2015 Impact Factor`,
    total_cites_2013 = `2013 Total Cites`,
    total_cites_2014 = `2014 Total Cites`,
    total_cites_2015 = `2015 Total Cites`,
    # Labelled factors for the lettered policy marks
    omic_data_sharing_fac = factor(
      `Protein, Proteomic, Genomic, or Microaray (Sequence or Structural) Data Sharing Addressed/Required with Deposit to Specific Data Banks`,
      levels = c("a", "b"),
      labels = c("Yes", "No")
    ),
    rec_pref_sharing_fac = factor(
      `Recommended/Preferred Sharing Mark`,
      levels = c("A", "B", "C", "D", "E"),
      labels = rsm_labels
    ),
    journal_hosts_fac = factor(
      `Size Guidelines if Journal Hosted Provided`,
      levels = c("a", "b", "c"),
      labels = journal_host_labels
    ),
    copyright_licensing_fac = factor(
      `Copyright/Licensing Mark`,
      levels = c("a", "b"),
      labels = c("Explicitly stated or mentioned", "No Mention")
    ),
    archival_retention_fac = factor(
      `Archival/Retention Mark`,
      levels = c("a", "b"),
      labels = c("Explicitly stated", "No Mention")
    ),
    reproducibility_noted_fac = factor(
      `Reproducibility Noted Mark`,
      levels = c("a", "b"),
      labels = c("Explicitly stated", "No Mention")
    )
  )

# Create long (one row per journal and year) data for plotting and dplyr.
# Each measure is reshaped separately: its three yearly columns are renamed
# to the bare year and stacked into a `year` key column plus a value column.

# Impact factor; also carries along the DSM and open-access columns that sit
# between `dsm` and `if_2015` in `jdata`.
jdata_if_long <- jdata %>%
  select(Journal, dsm:if_2015) %>%
  rename(`2013` = if_2013, `2014` = if_2014, `2015` = if_2015) %>%
  gather(year, impact_factor, `2013`:`2015`)

# Total cites
jdata_tc_long <- jdata %>%
  select(Journal, contains("total_cites")) %>%
  rename(
    `2013` = total_cites_2013,
    `2014` = total_cites_2014,
    `2015` = total_cites_2015
  ) %>%
  gather(year, total_cites, `2013`:`2015`)

# Citable items
jdata_cit_long <- jdata %>%
  select(Journal, contains("citable_")) %>%
  rename(
    `2013` = citable_2013,
    `2014` = citable_2014,
    `2015` = citable_2015
  ) %>%
  gather(year, citable_items, `2013`:`2015`)

# Combine the three measures. The keys are spelled out (they are the only
# columns the tables share) so that the join cannot silently change if a
# shared column is added later, and no "Joining, by = ..." message is emitted.
jdata_long <- left_join(jdata_if_long, jdata_tc_long, by = c("Journal", "year"))
jdata_long <- left_join(jdata_long, jdata_cit_long, by = c("Journal", "year"))

2 Data Summaries and Distributions

We have data from 318 journals, including impact factor (IF) and total number of citations in years 2013, 2014, and 2015.

Data Sharing Mark (DSM) has 6 categories, and we create a collapsed DSM variable that has two categories for “required” (DSM = 1 or 2) and “not required” (DSM = 3, 4, 5 or 6).

The labels for Data Sharing Mark (DSM) are as follows:

# Key pairing each numeric DSM value with its description
tibble(
  `DSM Numeric Value` = 1:6,
  `DSM Description Label` = dsm_labels
) %>%
  kable()
DSM Numeric Value DSM Description Label
1 Required as condition of publication, barring exceptions
2 Required but no explicit statement regarding effect on publication/editorial decisions
3 Explicitly encouraged/addressed, but not required.
4 Mentioned indirectly
5 Only protein, proteomic, and/or genomic data sharing are addressed.
6 No mention

2.1 Summary of Continuous Data

# Column-wise summary() of impact factor, citable items and total cites
# for each of the three years
tmps <- jdata %>%
  select(
    if_2013:if_2015,
    citable_2013:citable_2015,
    total_cites_2013:total_cites_2015
  ) %>%
  summary()
kable(tmps)
if_2013 if_2014 if_2015 citable_2013 citable_2014 citable_2015 total_cites_2013 total_cites_2014 total_cites_2015
Min. : 0.073 Min. : 0.220 Min. : 0.218 Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 115 Min. : 384 Min. : 410
1st Qu.: 2.987 1st Qu.: 2.908 1st Qu.: 2.874 1st Qu.: 154.0 1st Qu.: 148.5 1st Qu.: 152.0 1st Qu.: 3940 1st Qu.: 4517 1st Qu.: 3838
Median : 4.242 Median : 4.157 Median : 4.077 Median : 243.0 Median : 237.5 Median : 240.5 Median : 9164 Median : 10126 Median : 7748
Mean : 5.495 Mean : 5.417 Mean : 5.285 Mean : 409.8 Mean : 414.9 Mean : 414.9 Mean : 24267 Mean : 25747 Mean : 17688
3rd Qu.: 5.785 3rd Qu.: 5.772 3rd Qu.: 5.567 3rd Qu.: 341.0 3rd Qu.: 356.2 3rd Qu.: 341.8 3rd Qu.: 21307 3rd Qu.: 22129 3rd Qu.: 13320
Max. :42.351 Max. :41.456 Max. :38.138 Max. :31496.0 Max. :30040.0 Max. :28114.0 Max. :590324 Max. :617363 Max. :627846
NA NA’s :2 NA’s :4 NA NA’s :2 NA’s :4 NA NA’s :2 NA’s :4

2.2 TABLE METHODS X: Distribution of Impact Factors for Journal Titles

We include 2013 values in the manuscript and include 2014 values in the appendix.

This is a table categorizing journal impact factor (JIF) into ranges. The number of journals within each range and the percentage of the total number of journals is presented.

2.2.1 2013

# Bin each year's impact factor into the same seven ranges. Bins are closed
# on the left (`right = FALSE`); `include.lowest = TRUE` also closes the last
# bin on the right so a value equal to the top break is kept.
jdata$if_2013_cat <- cut(
  jdata$if_2013,
  breaks = c(0, 2, 4, 6, 8, 10, 30, 43),
  labels = c("<2", "2-3.99", "4-5.99", "6-7.99", "8-9.99", "10-29.99", "30-43"),
  include.lowest = TRUE,
  right = FALSE
)
jdata$if_2014_cat <- cut(
  jdata$if_2014,
  breaks = c(0, 2, 4, 6, 8, 10, 30, 43),
  labels = c("<2", "2-3.99", "4-5.99", "6-7.99", "8-9.99", "10-29.99", "30-43"),
  include.lowest = TRUE,
  right = FALSE
)
jdata$if_2015_cat <- cut(
  jdata$if_2015,
  breaks = c(0, 2, 4, 6, 8, 10, 30, 43),
  labels = c("<2", "2-3.99", "4-5.99", "6-7.99", "8-9.99", "10-29.99", "30-43"),
  include.lowest = TRUE,
  right = FALSE
)


# Number of journals, and share of all journals, in each 2013 IF range
jdata %>%
  group_by(if_2013_cat) %>%
  summarize(Number = n()) %>%
  mutate(
    `# (%)` = paste0(
      Number, "\t(", round(100 * Number / nrow(jdata), 1), "%)"
    )
  ) %>%
  select(-Number) %>%
  rename(`IF 2013 Category` = if_2013_cat) %>%
  kable()
IF 2013 Category # (%)
<2 19 (6%)
2-3.99 125 (39.3%)
4-5.99 102 (32.1%)
6-7.99 25 (7.9%)
8-9.99 15 (4.7%)
10-29.99 29 (9.1%)
30-43 3 (0.9%)

2.2.2 2014, 2015

# Number of journals (and share of all journals) per IF range, 2014
tmp <- jdata %>%
  group_by(if_2014_cat) %>%
  summarize(Number = n()) %>%
  mutate(
    `2014 # (%)` = paste0(
      Number, "\t(", round(100 * Number / nrow(jdata), 1), "%)"
    )
  ) %>%
  select(-Number) %>%
  rename(`IF Category` = if_2014_cat)

# Same for 2015
tmp2 <- jdata %>%
  group_by(if_2015_cat) %>%
  summarize(Number = n()) %>%
  mutate(
    `2015 # (%)` = paste0(
      Number, "\t(", round(100 * Number / nrow(jdata), 1), "%)"
    )
  ) %>%
  select(-Number) %>%
  rename(`IF Category` = if_2015_cat)

# Put the two years side by side; the missing (NA) category is shown as
# "Not Reported"
left_join(tmp, tmp2) %>%
  mutate(
    `IF Category` = ifelse(
      is.na(`IF Category`),
      "Not Reported",
      as.character(`IF Category`)
    )
  ) %>%
  kable()
IF Category 2014 # (%) 2015 # (%)
<2 20 (6.3%) 21 (6.6%)
2-3.99 127 (39.9%) 133 (41.8%)
4-5.99 96 (30.2%) 97 (30.5%)
6-7.99 26 (8.2%) 18 (5.7%)
8-9.99 17 (5.3%) 16 (5%)
10-29.99 27 (8.5%) 26 (8.2%)
30-43 3 (0.9%) 3 (0.9%)
Not Reported 2 (0.6%) 4 (1.3%)

2.3 TABLE METHODS X: Distribution of Citable Items for Journal Titles by Category

The total number of citable items per journal is categorized into ranges and the number of journals and percentage of total number of journals in each category is presented.

2.3.1 2013

#' Bin counts of citable items into labelled ranges
#'
#' Intervals are closed on the left and open on the right (`right = FALSE`),
#' so with the defaults a count of 100 falls in "100-500" and a count of 500
#' falls in "500-1000". `NA`, and values outside
#' `[min(breaks), max(breaks))`, give `NA`.
#'
#' @param x Numeric vector of counts.
#' @param breaks Increasing cut points. Defaults to the ranges used in the
#'   report.
#' @param labels Labels for the resulting intervals; must have
#'   `length(breaks) - 1` elements.
#' @return A factor the same length as `x` whose levels are `labels`.
my_cut_fun <- function(x,
                       breaks = c(0, 100, 500, 1000, 32000),
                       labels = c("<100", "100-500", "500-1000", "1000-32000")) {
  stopifnot(length(labels) == length(breaks) - 1L)
  cut(x, breaks = breaks, labels = labels, right = FALSE)
}

# Bin each year's citable-item count with my_cut_fun(). Columns are referenced
# through mutate()'s data mask rather than as `jdata$...`, so the expressions
# always refer to the data frame being piped in.
jdata <- jdata %>%
  mutate(
    citable_2013_cat = my_cut_fun(citable_2013),
    citable_2014_cat = my_cut_fun(citable_2014),
    citable_2015_cat = my_cut_fun(citable_2015)
  )


# Number of journals, and share of all journals, in each 2013 citable-items
# range
jdata %>%
  group_by(citable_2013_cat) %>%
  summarize(Number = n()) %>%
  mutate(
    `# (%)` = paste0(
      Number, "\t(", round(100 * Number / nrow(jdata), 1), "%)"
    )
  ) %>%
  select(-Number) %>%
  rename(`Total Citable Items 2013 Category` = citable_2013_cat) %>%
  kable()
Total Citable Items 2013 Category # (%)
<100 42 (13.2%)
100-500 239 (75.2%)
500-1000 28 (8.8%)
1000-32000 9 (2.8%)

2.3.2 2014, 2015

# Number of journals (and share of all journals) per citable-items range, 2014
tmp1 <- jdata %>%
  group_by(citable_2014_cat) %>%
  summarize(Number = n()) %>%
  rename(`Total Citable Items Category` = citable_2014_cat) %>%
  mutate(
    `2014 # (%)` = paste0(
      Number, "\t(", round(100 * Number / nrow(jdata), 1), "%)"
    )
  ) %>%
  select(-Number)

# Same for 2015
tmp2 <- jdata %>%
  group_by(citable_2015_cat) %>%
  summarize(Number = n()) %>%
  rename(`Total Citable Items Category` = citable_2015_cat) %>%
  mutate(
    `2015 # (%)` = paste0(
      Number, "\t(", round(100 * Number / nrow(jdata), 1), "%)"
    )
  ) %>%
  select(-Number)

# Put the two years side by side; the missing (NA) category is shown as
# "Not Reported"
left_join(tmp1, tmp2) %>%
  mutate(
    `Total Citable Items Category` = ifelse(
      is.na(`Total Citable Items Category`),
      "Not Reported",
      as.character(`Total Citable Items Category`)
    )
  ) %>%
  kable()
Total Citable Items Category 2014 # (%) 2015 # (%)
<100 42 (13.2%) 38 (11.9%)
100-500 235 (73.9%) 239 (75.2%)
500-1000 27 (8.5%) 23 (7.2%)
1000-32000 12 (3.8%) 14 (4.4%)
Not Reported 2 (0.6%) 4 (1.3%)

2.4 TABLE METHODS X: Distribution of Citable Items for Journal Titles

Min, 25th percentile, mean, median, 75th percentile, and max.

# Distribution of citable items per journal for each year: total, min,
# 25th percentile, mean, median, 75th percentile and max, ignoring missing
# values.
#
# dat            long data with `year` and `citable_items` columns
# journals_label text stored in the `Journals` column to identify the row set
#
# Returns one row per year, with `Journals` placed right after `year`.
# Plain summarize() replaces the deprecated summarize_at(..., funs(...)).
citable_dist_by_year <- function(dat, journals_label) {
  dat %>%
    group_by(year) %>%
    summarize(
      Total = sum(citable_items, na.rm = TRUE),
      Min = min(citable_items, na.rm = TRUE),
      Q25 = quantile(citable_items, probs = 0.25, na.rm = TRUE),
      Mean = mean(citable_items, na.rm = TRUE),
      Median = median(citable_items, na.rm = TRUE),
      Q75 = quantile(citable_items, probs = 0.75, na.rm = TRUE),
      Max = max(citable_items, na.rm = TRUE)
    ) %>%
    add_column(Journals = journals_label, .after = "year")
}

# All journals
tmp1 <- citable_dist_by_year(jdata_long, "With PLoS One")

# Excluding PLoS One
tmp2 <- jdata_long %>%
  filter(Journal != "PLoS One") %>%
  citable_dist_by_year("Remove PLoS One")

# "With PLoS One" rows first (descending label order), then by year
tmp <- bind_rows(tmp1, tmp2) %>%
  arrange(desc(Journals), year)
kable(tmp, caption = "Citable Items by Year", digits = 1)
Citable Items by Year
year Journals Total Min Q25 Mean Median Q75 Max
2013 With PLoS One 130330 0 154.0 409.8 243.0 341.0 31496
2014 With PLoS One 131107 0 148.5 414.9 237.5 356.2 30040
2015 With PLoS One 130277 0 152.0 414.9 240.5 341.8 28114
2013 Remove PLoS One 98834 0 153.0 311.8 242.0 341.0 3901
2014 Remove PLoS One 101067 0 148.0 320.8 236.0 355.0 3579
2015 Remove PLoS One 102163 0 152.0 326.4 240.0 341.0 3281

2.5 TABLE: Number of journals per data sharing mark (DSM)

# Number and percentage of journals for each Data Sharing Mark
tmp <- jdata %>%
  group_by(dsm, dsm_fac) %>%
  summarize(
    N = n(),
    Percent = 100 * n() / nrow(jdata)
  )

# Collapse count and percentage into one display column
tmp1 <- tmp %>%
  mutate(`# Journals (%)` = paste0(N, "\t(", round(Percent, 1), "%)")) %>%
  select(-N, -Percent) %>%
  rename(DSM = dsm, `DSM Description` = dsm_fac)
kable(tmp1)
DSM DSM Description # Journals (%)
1 Required as condition of publication, barring exceptions 38 (11.9%)
2 Required but no explicit statement regarding effect on publication/editorial decisions 29 (9.1%)
3 Explicitly encouraged/addressed, but not required. 74 (23.3%)
4 Mentioned indirectly 29 (9.1%)
5 Only protein, proteomic, and/or genomic data sharing are addressed. 47 (14.8%)
6 No mention 101 (31.8%)

2.6 TABLE: Publishing Volume by Data Sharing Mark

# Publishing volume by Data Sharing Mark (DSM).
# For each year the table shows, per DSM: the number of journals, the median
# number of citable items per journal, and the total citable items with their
# share of the overall total -- computed once over all journals and once with
# PLoS One removed. A "Total" row is appended at the end.
#
# Missing citable-item counts are ignored (na.rm = TRUE) for every year, so
# the three years are handled the same way.

# ---- 2013, all journals ----
tmp1 <- jdata %>%
  group_by(dsm, dsm_fac) %>%
  summarize(
    N = n(),
    Percent = 100 * n() / nrow(jdata),
    Nc = sum(citable_2013, na.rm = TRUE),
    MedianC = median(citable_2013, na.rm = TRUE),
    PercentC = 100 * sum(citable_2013, na.rm = TRUE) /
      sum(jdata$citable_2013, na.rm = TRUE)
  ) %>%
  mutate(
    `# Journals (%)` = paste0(N, "\t(", round(Percent, 1), "%)"),
    # need a space or kable malfunctions
    `Median # Citable Items per Journal 2013` = paste(MedianC, " "),
    `# Citable Items 2013 (%)` = paste0(Nc, "\t(", round(PercentC, 1), "%)")
  ) %>%
  select(-N, -Nc, -Percent, -PercentC, -MedianC) %>%
  rename(DSM = dsm, `DSM Description` = dsm_fac)

# ---- 2013, PLoS One removed (shares are relative to the reduced total) ----
tmp2 <- jdata %>%
  filter(Journal != "PLoS One") %>%
  group_by(dsm, dsm_fac) %>%
  summarize(
    Nc = sum(citable_2013, na.rm = TRUE),
    PercentC = 100 * sum(citable_2013, na.rm = TRUE) /
      sum(jdata$citable_2013[jdata$Journal != "PLoS One"], na.rm = TRUE)
  ) %>%
  mutate(
    `# Citable Items 2013, Remove PLoS One (%)` =
      paste0(Nc, "\t(", round(PercentC, 1), "%)")
  ) %>%
  select(-Nc, -PercentC) %>%
  rename(DSM = dsm, `DSM Description` = dsm_fac)

tmp <- left_join(tmp1, tmp2, by = c("DSM", "DSM Description"))

# ---- 2014, all journals ----
tmp1 <- jdata %>%
  group_by(dsm, dsm_fac) %>%
  summarize(
    N = n(),
    Percent = 100 * n() / nrow(jdata),
    Nc = sum(citable_2014, na.rm = TRUE),
    MedianC = median(citable_2014, na.rm = TRUE),
    PercentC = 100 * sum(citable_2014, na.rm = TRUE) /
      sum(jdata$citable_2014, na.rm = TRUE)
  ) %>%
  mutate(
    `# Journals (%)` = paste0(N, "\t(", round(Percent, 1), "%)"),
    # need a space or kable malfunctions
    `Median # Citable Items per Journal 2014` = paste(MedianC, " "),
    `# Citable Items 2014 (%)` = paste0(Nc, "\t(", round(PercentC, 1), "%)")
  ) %>%
  select(-N, -Nc, -Percent, -PercentC, -MedianC) %>%
  rename(DSM = dsm, `DSM Description` = dsm_fac)

# ---- 2014, PLoS One removed ----
tmp2 <- jdata %>%
  filter(Journal != "PLoS One") %>%
  group_by(dsm, dsm_fac) %>%
  summarize(
    Nc = sum(citable_2014, na.rm = TRUE),
    PercentC = 100 * sum(citable_2014, na.rm = TRUE) /
      sum(jdata$citable_2014[jdata$Journal != "PLoS One"], na.rm = TRUE)
  ) %>%
  mutate(
    `# Citable Items 2014, Remove PLoS One (%)` =
      paste0(Nc, "\t(", round(PercentC, 1), "%)")
  ) %>%
  select(-Nc, -PercentC) %>%
  rename(DSM = dsm, `DSM Description` = dsm_fac)

tmp3 <- left_join(tmp1, tmp2, by = c("DSM", "DSM Description"))
# `# Journals (%)` is present (with identical values) in every year's table,
# so it is part of the key when the years are combined.
tmp <- left_join(
  tmp, tmp3,
  by = c("DSM", "DSM Description", "# Journals (%)")
)

# ---- 2015, all journals ----
tmp1 <- jdata %>%
  group_by(dsm, dsm_fac) %>%
  summarize(
    N = n(),
    Percent = 100 * n() / nrow(jdata),
    Nc = sum(citable_2015, na.rm = TRUE),
    MedianC = median(citable_2015, na.rm = TRUE),
    PercentC = 100 * sum(citable_2015, na.rm = TRUE) /
      sum(jdata$citable_2015, na.rm = TRUE)
  ) %>%
  mutate(
    `# Journals (%)` = paste0(N, "\t(", round(Percent, 1), "%)"),
    # need a space or kable malfunctions
    `Median # Citable Items per Journal 2015` = paste(MedianC, " "),
    `# Citable Items 2015 (%)` = paste0(Nc, "\t(", round(PercentC, 1), "%)")
  ) %>%
  select(-N, -Nc, -Percent, -PercentC, -MedianC) %>%
  rename(DSM = dsm, `DSM Description` = dsm_fac)

# ---- 2015, PLoS One removed ----
tmp2 <- jdata %>%
  filter(Journal != "PLoS One") %>%
  group_by(dsm, dsm_fac) %>%
  summarize(
    Nc = sum(citable_2015, na.rm = TRUE),
    PercentC = 100 * sum(citable_2015, na.rm = TRUE) /
      sum(jdata$citable_2015[jdata$Journal != "PLoS One"], na.rm = TRUE)
  ) %>%
  mutate(
    `# Citable Items 2015, Remove PLoS One (%)` =
      paste0(Nc, "\t(", round(PercentC, 1), "%)")
  ) %>%
  select(-Nc, -PercentC) %>%
  rename(DSM = dsm, `DSM Description` = dsm_fac)

tmp3 <- left_join(tmp1, tmp2, by = c("DSM", "DSM Description"))
tmp <- left_join(
  tmp, tmp3,
  by = c("DSM", "DSM Description", "# Journals (%)")
)

# ---- Total row ----
# No `DSM` value is supplied, so that cell is NA in the combined table.
tmpt <- tibble(
  `DSM Description` = "Total",
  `# Journals (%)` = paste0(nrow(jdata), " (100%)"),
  `Median # Citable Items per Journal 2013` =
    as.character(median(jdata$citable_2013, na.rm = TRUE)),
  `# Citable Items 2013 (%)` =
    paste0(sum(jdata$citable_2013, na.rm = TRUE), " (100%)"),
  `# Citable Items 2013, Remove PLoS One (%)` = paste0(
    sum(jdata$citable_2013[jdata$Journal != "PLoS One"], na.rm = TRUE),
    " (100%)"
  ),
  `Median # Citable Items per Journal 2014` =
    as.character(median(jdata$citable_2014, na.rm = TRUE)),
  `# Citable Items 2014 (%)` =
    paste0(sum(jdata$citable_2014, na.rm = TRUE), " (100%)"),
  `# Citable Items 2014, Remove PLoS One (%)` = paste0(
    sum(jdata$citable_2014[jdata$Journal != "PLoS One"], na.rm = TRUE),
    " (100%)"
  ),
  `Median # Citable Items per Journal 2015` =
    as.character(median(jdata$citable_2015, na.rm = TRUE)),
  `# Citable Items 2015 (%)` =
    paste0(sum(jdata$citable_2015, na.rm = TRUE), " (100%)"),
  `# Citable Items 2015, Remove PLoS One (%)` = paste0(
    sum(jdata$citable_2015[jdata$Journal != "PLoS One"], na.rm = TRUE),
    " (100%)"
  )
)
tmpt <- bind_rows(tmp, tmpt)
kable(tmpt)
DSM DSM Description # Journals (%) Median # Citable Items per Journal 2013 # Citable Items 2013 (%) # Citable Items 2013, Remove PLoS One (%) Median # Citable Items per Journal 2014 # Citable Items 2014 (%) # Citable Items 2014, Remove PLoS One (%) Median # Citable Items per Journal 2015 # Citable Items 2015 (%) # Citable Items 2015, Remove PLoS One (%)
1 Required as condition of publication, barring exceptions 38 (11.9%) 230.5 42669 (32.7%) 11173 (11.3%) 220 42794 (32.6%) 12754 (12.6%) 226.5 40870 (31.4%) 12756 (12.5%)
2 Required but no explicit statement regarding effect on publication/editorial decisions 29 (9.1%) 209 12138 (9.3%) 12138 (12.3%) 227 12436 (9.5%) 12436 (12.3%) 266 14233 (10.9%) 14233 (13.9%)
3 Explicitly encouraged/addressed, but not required. 74 (23.3%) 259.5 25519 (19.6%) 25519 (25.8%) 282.5 26026 (19.9%) 26026 (25.8%) 278 26731 (20.5%) 26731 (26.2%)
4 Mentioned indirectly 29 (9.1%) 256 8062 (6.2%) 8062 (8.2%) 225 7894 (6%) 7894 (7.8%) 225 7928 (6.1%) 7928 (7.8%)
5 Only protein, proteomic, and/or genomic data sharing are addressed. 47 (14.8%) 277 19339 (14.8%) 19339 (19.6%) 316 19080 (14.6%) 19080 (18.9%) 285 17734 (13.6%) 17734 (17.4%)
6 No mention 101 (31.8%) 211 22603 (17.3%) 22603 (22.9%) 213 22877 (17.4%) 22877 (22.6%) 206 22781 (17.5%) 22781 (22.3%)
NA Total 318 (100%) 243 130330 (100%) 98834 (100%) 237.5 131107 (100%) 101067 (100%) 240.5 130277 (100%) 102163 (100%)
# Set the order of the three bars via the factor levels of `type1`
tmplong$type1 <- factor(
  tmplong$type2,
  levels = c(
    "Citable Items without PLoS One",
    "Citable Items with PLoS One",
    "Journals"
  )
)

# Horizontal stacked bars of Percent by DSM, with each segment labelled
ggplot(tmplong, aes(x = type1, y = Percent, fill = dsm_fac)) +
  geom_bar(stat = "identity") +
  geom_text(
    aes(label = paste0(round(Percent, 1), "%"), y = Percent),
    position = position_stack(),
    hjust = 1.2,
    size = 3,
    color = "black",
    show.legend = FALSE
  ) +
  guides(
    alpha = guide_legend(title = "Percent of", ncol = 1),
    fill = guide_legend(title = "Data Sharing Mark", ncol = 1)
  ) +
  theme_minimal() +
  xlab("") +
  coord_flip() +
  theme(legend.position = "bottom")

2.7 FIGURE 1 - Percentage of journals per each data sharing mark (DSM).

The top bar shows the percentage of all journals for each data sharing mark. The middle bar shows the percentage of citable items from each journal (including PLoS One) for each data sharing mark. The lower bar shows the percentage of citable items for each journal (excluding PLoS One) for each data sharing mark. Because of the journal PLoS One’s high publishing activity, we analyzed the percentage of citable items for each data sharing mark including and excluding PLoS One. The shades from dark to light represent DSM 1-6.

# Set the order of the three bars via the factor levels of `type1`
tmplong$type1 <- factor(
  tmplong$type2,
  levels = c(
    "Citable Items without PLoS One",
    "Citable Items with PLoS One",
    "Journals"
  )
)
# DSM factor with its six levels in reverse order, used for the fill
tmplong$dsm_fac_reverse <- factor(
  tmplong$dsm_fac,
  levels = levels(tmplong$dsm_fac)[6:1]
)

# Horizontal stacked bars in shades of blue; the legend lists the DSM levels
# in their original order, each prefixed with its number
ggplot(tmplong, aes(x = type1, y = Percent, fill = dsm_fac_reverse)) +
  geom_bar(stat = "identity") +
  geom_text(
    aes(label = paste0(round(Percent, 1), "%"), y = Percent),
    position = position_stack(),
    hjust = 1.2,
    size = 3,
    color = "black",
    show.legend = FALSE
  ) +
  scale_fill_manual(
    values = RColorBrewer::brewer.pal(n = 7, name = "Blues")[1:6],
    breaks = levels(tmplong$dsm_fac),
    labels = paste(1:6, levels(tmplong$dsm_fac))
  ) +
  guides(fill = guide_legend(title = "Data Sharing Mark", ncol = 1)) +
  theme_minimal() +
  xlab("") +
  coord_flip() +
  theme(legend.position = "bottom")

2.8 TABLE RESULTS X: Number of journals & Citable Items per data sharing 2 categories

# Publishing volume by the two-category DSM (required vs. not required).
# For each year the table shows, per category: the number of journals, the
# median number of citable items per journal, and the total citable items
# with their share of the overall total -- computed once over all journals
# and once with PLoS One removed.
#
# Missing citable-item counts are ignored (na.rm = TRUE) for every year, so
# the three years are handled the same way.

# ---- 2013: summarize citable items ----
tmp1 <- jdata %>%
  group_by(dsm2, dsm2_fac) %>%
  summarize(
    N = n(),
    Percent = 100 * n() / nrow(jdata),
    Nc = sum(citable_2013, na.rm = TRUE),
    MedianC = median(citable_2013, na.rm = TRUE),
    PercentC = 100 * sum(citable_2013, na.rm = TRUE) /
      sum(jdata$citable_2013, na.rm = TRUE)
  ) %>%
  mutate(
    `# Journals (%)` = paste0(N, "\t(", round(Percent, 1), "%)"),
    # need a space or kable malfunctions
    `Median # Citable Items per Journal 2013` = paste(MedianC, " "),
    `# Citable Items 2013 (%)` = paste0(Nc, "\t(", round(PercentC, 1), "%)")
  ) %>%
  select(-N, -Nc, -Percent, -PercentC, -MedianC) %>%
  rename(DSM = dsm2, `DSM Description` = dsm2_fac)

# ---- 2013: summarize citable items, removing PLoS One ----
# do not need median since removing one journal will not change this except by .5
tmp2 <- jdata %>%
  filter(Journal != "PLoS One") %>%
  group_by(dsm2, dsm2_fac) %>%
  summarize(
    Nc = sum(citable_2013, na.rm = TRUE),
    PercentC = 100 * sum(citable_2013, na.rm = TRUE) /
      sum(jdata$citable_2013[jdata$Journal != "PLoS One"], na.rm = TRUE)
  ) %>%
  mutate(
    `# Citable Items 2013, Remove PLoS One (%)` =
      paste0(Nc, "\t(", round(PercentC, 1), "%)")
  ) %>%
  select(-Nc, -PercentC) %>%
  rename(DSM = dsm2, `DSM Description` = dsm2_fac)

tmp <- left_join(tmp1, tmp2, by = c("DSM", "DSM Description"))

# ---- 2014, all journals ----
tmp1 <- jdata %>%
  group_by(dsm2, dsm2_fac) %>%
  summarize(
    N = n(),
    Percent = 100 * n() / nrow(jdata),
    Nc = sum(citable_2014, na.rm = TRUE),
    MedianC = median(citable_2014, na.rm = TRUE),
    PercentC = 100 * sum(citable_2014, na.rm = TRUE) /
      sum(jdata$citable_2014, na.rm = TRUE)
  ) %>%
  mutate(
    `# Journals (%)` = paste0(N, "\t(", round(Percent, 1), "%)"),
    `Median # Citable Items per Journal 2014` = paste(MedianC, " "),
    `# Citable Items 2014 (%)` = paste0(Nc, "\t(", round(PercentC, 1), "%)")
  ) %>%
  select(-N, -Nc, -Percent, -PercentC, -MedianC) %>%
  rename(DSM = dsm2, `DSM Description` = dsm2_fac)

# ---- 2014, PLoS One removed ----
tmp2 <- jdata %>%
  filter(Journal != "PLoS One") %>%
  group_by(dsm2, dsm2_fac) %>%
  summarize(
    Nc = sum(citable_2014, na.rm = TRUE),
    PercentC = 100 * sum(citable_2014, na.rm = TRUE) /
      sum(jdata$citable_2014[jdata$Journal != "PLoS One"], na.rm = TRUE)
  ) %>%
  mutate(
    `# Citable Items 2014, Remove PLoS One (%)` =
      paste0(Nc, "\t(", round(PercentC, 1), "%)")
  ) %>%
  select(-Nc, -PercentC) %>%
  rename(DSM = dsm2, `DSM Description` = dsm2_fac)

tmp3 <- left_join(tmp1, tmp2, by = c("DSM", "DSM Description"))
# `# Journals (%)` is present (with identical values) in every year's table,
# so it is part of the key when the years are combined.
tmp <- left_join(
  tmp, tmp3,
  by = c("DSM", "DSM Description", "# Journals (%)")
)

# ---- 2015, all journals ----
tmp1 <- jdata %>%
  group_by(dsm2, dsm2_fac) %>%
  summarize(
    N = n(),
    Percent = 100 * n() / nrow(jdata),
    Nc = sum(citable_2015, na.rm = TRUE),
    MedianC = median(citable_2015, na.rm = TRUE),
    PercentC = 100 * sum(citable_2015, na.rm = TRUE) /
      sum(jdata$citable_2015, na.rm = TRUE)
  ) %>%
  mutate(
    `# Journals (%)` = paste0(N, "\t(", round(Percent, 1), "%)"),
    `Median # Citable Items per Journal 2015` = paste(MedianC, " "),
    `# Citable Items 2015 (%)` = paste0(Nc, "\t(", round(PercentC, 1), "%)")
  ) %>%
  select(-N, -Nc, -Percent, -PercentC, -MedianC) %>%
  rename(DSM = dsm2, `DSM Description` = dsm2_fac)

# ---- 2015, PLoS One removed ----
tmp2 <- jdata %>%
  filter(Journal != "PLoS One") %>%
  group_by(dsm2, dsm2_fac) %>%
  summarize(
    Nc = sum(citable_2015, na.rm = TRUE),
    PercentC = 100 * sum(citable_2015, na.rm = TRUE) /
      sum(jdata$citable_2015[jdata$Journal != "PLoS One"], na.rm = TRUE)
  ) %>%
  mutate(
    `# Citable Items 2015, Remove PLoS One (%)` =
      paste0(Nc, "\t(", round(PercentC, 1), "%)")
  ) %>%
  select(-Nc, -PercentC) %>%
  rename(DSM = dsm2, `DSM Description` = dsm2_fac)

tmp3 <- left_join(tmp1, tmp2, by = c("DSM", "DSM Description"))
tmp <- left_join(
  tmp, tmp3,
  by = c("DSM", "DSM Description", "# Journals (%)")
)

# Show the "required" row (indicator 1) first, then replace the 0/1 indicator
# with a readable label. The label is derived from the value itself rather
# than from the row position, so a missing or extra row cannot be mislabelled.
# ungroup() is needed because `DSM` is still a grouping column after
# summarize().
tmp <- tmp %>%
  ungroup() %>%
  arrange(desc(DSM)) %>%
  mutate(DSM = ifelse(DSM == 1, "DSM 1&2", "DSM 3-6"))
kable(tmp)
DSM DSM Description # Journals (%) Median # Citable Items per Journal 2013 # Citable Items 2013 (%) # Citable Items 2013, Remove PLoS One (%) Median # Citable Items per Journal 2014 # Citable Items 2014 (%) # Citable Items 2014, Remove PLoS One (%) Median # Citable Items per Journal 2015 # Citable Items 2015 (%) # Citable Items 2015, Remove PLoS One (%)
DSM 1&2 Required 67 (21.1%) 226 54807 (42.1%) 23311 (23.6%) 221 55230 (42.1%) 25190 (24.9%) 242 55103 (42.3%) 26989 (26.4%)
DSM 3-6 Not Required 251 (78.9%) 248 75523 (57.9%) 75523 (76.4%) 244 75877 (57.9%) 75877 (75.1%) 240 75174 (57.7%) 75174 (73.6%)

2.9 TABLE RESULTS X: Distribution of Open Access by Journal and Citable Item

# Publishing volume by open-access status.
# For each year the table shows, for subscription and open-access journals:
# the number of journals, the median number of citable items per journal, and
# the total citable items with their share of the overall total -- computed
# once over all journals and once with PLoS One removed.
#
# Missing citable-item counts are ignored (na.rm = TRUE) for every year, so
# the three years are handled the same way.

# ---- 2013, all journals ----
tmp1 <- jdata %>%
  group_by(oam_fac) %>%
  summarize(
    N = n(),
    Percent = 100 * n() / nrow(jdata),
    Nc = sum(citable_2013, na.rm = TRUE),
    MedianC = median(citable_2013, na.rm = TRUE),
    PercentC = 100 * sum(citable_2013, na.rm = TRUE) /
      sum(jdata$citable_2013, na.rm = TRUE)
  ) %>%
  mutate(
    `# Journals (%)` = paste0(N, "\t(", round(Percent, 1), "%)"),
    `Median # Citable Items per Journal 2013` = paste(MedianC, " "),
    `# Citable Items 2013 (%)` = paste0(Nc, "\t(", round(PercentC, 1), "%)")
  ) %>%
  select(-N, -Nc, -Percent, -PercentC, -MedianC) %>%
  rename(`Open Access` = oam_fac)

# ---- 2013, PLoS One removed (shares are relative to the reduced total) ----
tmp2 <- jdata %>%
  filter(Journal != "PLoS One") %>%
  group_by(oam_fac) %>%
  summarize(
    Nc = sum(citable_2013, na.rm = TRUE),
    PercentC = 100 * sum(citable_2013, na.rm = TRUE) /
      sum(jdata$citable_2013[jdata$Journal != "PLoS One"], na.rm = TRUE)
  ) %>%
  mutate(
    `# Citable Items 2013, Remove PLoS One (%)` =
      paste0(Nc, "\t(", round(PercentC, 1), "%)")
  ) %>%
  select(-Nc, -PercentC) %>%
  rename(`Open Access` = oam_fac)

tmp <- left_join(tmp1, tmp2, by = "Open Access")

# ---- 2014, all journals ----
tmp1 <- jdata %>%
  group_by(oam_fac) %>%
  summarize(
    N = n(),
    Percent = 100 * n() / nrow(jdata),
    Nc = sum(citable_2014, na.rm = TRUE),
    MedianC = median(citable_2014, na.rm = TRUE),
    PercentC = 100 * sum(citable_2014, na.rm = TRUE) /
      sum(jdata$citable_2014, na.rm = TRUE)
  ) %>%
  mutate(
    `# Journals (%)` = paste0(N, "\t(", round(Percent, 1), "%)"),
    # need a space or kable malfunctions for some reason
    `Median # Citable Items per Journal 2014` = paste(MedianC, " "),
    `# Citable Items 2014 (%)` = paste0(Nc, "\t(", round(PercentC, 1), "%)")
  ) %>%
  select(-N, -Nc, -Percent, -PercentC, -MedianC) %>%
  rename(`Open Access` = oam_fac)

# ---- 2014, PLoS One removed ----
tmp2 <- jdata %>%
  filter(Journal != "PLoS One") %>%
  group_by(oam_fac) %>%
  summarize(
    Nc = sum(citable_2014, na.rm = TRUE),
    PercentC = 100 * sum(citable_2014, na.rm = TRUE) /
      sum(jdata$citable_2014[jdata$Journal != "PLoS One"], na.rm = TRUE)
  ) %>%
  mutate(
    `# Citable Items 2014, Remove PLoS One (%)` =
      paste0(Nc, "\t(", round(PercentC, 1), "%)")
  ) %>%
  select(-Nc, -PercentC) %>%
  rename(`Open Access` = oam_fac)

tmp3 <- left_join(tmp1, tmp2, by = "Open Access")
# `# Journals (%)` is present (with identical values) in every year's table,
# so it is part of the key when the years are combined.
tmp <- left_join(tmp, tmp3, by = c("Open Access", "# Journals (%)"))

# ---- 2015, all journals ----
tmp1 <- jdata %>%
  group_by(oam_fac) %>%
  summarize(
    N = n(),
    Percent = 100 * n() / nrow(jdata),
    Nc = sum(citable_2015, na.rm = TRUE),
    MedianC = median(citable_2015, na.rm = TRUE),
    PercentC = 100 * sum(citable_2015, na.rm = TRUE) /
      sum(jdata$citable_2015, na.rm = TRUE)
  ) %>%
  mutate(
    `# Journals (%)` = paste0(N, "\t(", round(Percent, 1), "%)"),
    # need a space or kable malfunctions for some reason
    `Median # Citable Items per Journal 2015` = paste(MedianC, " "),
    `# Citable Items 2015 (%)` = paste0(Nc, "\t(", round(PercentC, 1), "%)")
  ) %>%
  select(-N, -Nc, -Percent, -PercentC, -MedianC) %>%
  rename(`Open Access` = oam_fac)

# ---- 2015, PLoS One removed ----
tmp2 <- jdata %>%
  filter(Journal != "PLoS One") %>%
  group_by(oam_fac) %>%
  summarize(
    Nc = sum(citable_2015, na.rm = TRUE),
    PercentC = 100 * sum(citable_2015, na.rm = TRUE) /
      sum(jdata$citable_2015[jdata$Journal != "PLoS One"], na.rm = TRUE)
  ) %>%
  mutate(
    `# Citable Items 2015, Remove PLoS One (%)` =
      paste0(Nc, "\t(", round(PercentC, 1), "%)")
  ) %>%
  select(-Nc, -PercentC) %>%
  rename(`Open Access` = oam_fac)

tmp3 <- left_join(tmp1, tmp2, by = "Open Access")
tmp <- left_join(tmp, tmp3, by = c("Open Access", "# Journals (%)"))

# Descending factor order puts the "Open Access" row first
tmp <- tmp %>% arrange(desc(`Open Access`))
kable(tmp)
Open Access # Journals (%) Median # Citable Items per Journal 2013 # Citable Items 2013 (%) # Citable Items 2013, Remove PLoS One (%) Median # Citable Items per Journal 2014 # Citable Items 2014 (%) # Citable Items 2014, Remove PLoS One (%) Median # Citable Items per Journal 2015 # Citable Items 2015 (%) # Citable Items 2015, Remove PLoS One (%)
Open Access 44 (13.8%) 199.5 43789 (33.6%) 12293 (12.4%) 207 45831 (35%) 15791 (15.6%) 222 47573 (36.5%) 19459 (19%)
Subscription 274 (86.2%) 246.5 86541 (66.4%) 86541 (87.6%) 240 85276 (65%) 85276 (84.4%) 242 82704 (63.5%) 82704 (81%)

2.10 TABLE X: DSM by Open Access

Open Access Mark (OAM) has two categories, Subscription and Open Access.

Here we present a table of the number of open access or subscription journals (number of citable items) within each DSM, as well as the percent of journals (percent of citable items) that are open access within each DSM. For DSM 1, we present the number and percent of citable items with and without PLoS One.

# Table of DSM by open-access status: journal counts with 2013 citable-item
# counts in parentheses, plus the percentage that is open access. For DSM 1
# the figures with PLoS One removed are appended and marked with "*".

# obtain open access counts at journal level
tmptable <- with(jdata, table(dsm, oam_fac))
# second column of the table is "Open Access"
tmptable <- cbind(
  tmptable,
  `% Open Access` = 100 * tmptable[, 2] / rowSums(tmptable)
)
tmptable <- as_tibble(tmptable)

# obtain open access counts at citable item level:
# total, subscription-only and open-access-only sums per DSM
jcit <- jdata %>%
  group_by(dsm) %>%
  summarize(tot_cit = sum(citable_2013)) %>%
  left_join(
    jdata %>%
      filter(oam_fac == "Subscription") %>%
      group_by(dsm) %>%
      summarize(sub_cit = sum(citable_2013)),
    by = "dsm"
  ) %>%
  left_join(
    jdata %>%
      filter(oam_fac == "Open Access") %>%
      group_by(dsm) %>%
      summarize(open_cit = sum(citable_2013)),
    by = "dsm"
  )
# a DSM with no open access journals has no row to join, so its count is 0
jcit$open_cit[is.na(jcit$open_cit)] <- 0
jcit$pct_open_cit <- 100 * jcit$open_cit / jcit$tot_cit

# without PLoS One (DSM 1 only)
# Both results are one-row, one-column data frames. The total is also given
# the column name `open_cit` so the two frames line up when they are divided.
open_cit <- jdata %>%
  filter(Journal != "PLoS One", oam_fac == "Open Access", dsm == 1) %>%
  summarize(open_cit = sum(citable_2013))
tot_cit <- jdata %>%
  filter(Journal != "PLoS One", dsm == 1) %>%
  summarize(open_cit = sum(citable_2013))
pct_cit <- round(100 * open_cit / tot_cit, 2)

# One row per DSM; relies on `tmptable` and `jcit` both being ordered by DSM
printtab <- tibble(
  DSM = dsm_labels,
  Subscription = paste0(tmptable$Subscription, " (", jcit$sub_cit, ")"),
  `Open Access` = paste0(tmptable$`Open Access`, " (", jcit$open_cit, ")"),
  `% Open Access` = paste0(
    round(tmptable$`% Open Access`, 2),
    "% (", round(jcit$pct_open_cit, 2), "%)"
  )
)
# add in without PLoS One #s: splice them in before the closing parenthesis.
# fixed = TRUE makes ")" a literal character instead of a regular expression.
printtab[1, 3] <- gsub(
  ")", paste0("; ", open_cit, "*)"), printtab[1, 3],
  fixed = TRUE
)
printtab[1, 4] <- gsub(
  ")", paste0("; ", pct_cit, "%*)"), printtab[1, 4],
  fixed = TRUE
)

kable(printtab)
DSM Subscription Open Access % Open Access
Required as condition of publication, barring exceptions 29 (7709) 9 (34960; 3464*) 23.68% (81.93%; 31%*)
Required but no explicit statement regarding effect on publication/editorial decisions 27 (11864) 2 (274) 6.9% (2.26%)
Explicitly encouraged/addressed, but not required. 63 (22884) 11 (2635) 14.86% (10.33%)
Mentioned indirectly 29 (8062) 0 (0) 0% (0%)
Only protein, proteomic, and/or genomic data sharing are addressed. 40 (17401) 7 (1938) 14.89% (10.02%)
No mention 86 (18621) 15 (3982) 14.85% (17.62%)

*After removing PLoS One

2.11 Table of 2 category DSM by Open Access

# Table of 2-category DSM by open access status.
# Every cell is printed as "<journal count> (<2013 citable items>)".

# Journal-level counts: one row per dsm2 value, one column per open access
# status, plus the percentage of journals in each row that are open access.
tmptable <- with(jdata, table(dsm2, oam_fac))
tmptable <- cbind(
  tmptable,
  `% Open Access` = 100 * tmptable[, "Open Access"] / rowSums(tmptable)
)
tmptable <- as_tibble(tmptable)

# Citable-item-level counts (2013) per dsm2 value; the join key is explicit.
jcit <- jdata %>%
  group_by(dsm2) %>%
  summarize(tot_cit = sum(citable_2013)) %>%
  left_join(
    jdata %>%
      filter(oam_fac == "Subscription") %>%
      group_by(dsm2) %>%
      summarize(sub_cit = sum(citable_2013)),
    by = "dsm2"
  ) %>%
  left_join(
    jdata %>%
      filter(oam_fac == "Open Access") %>%
      group_by(dsm2) %>%
      summarize(open_cit = sum(citable_2013)),
    by = "dsm2"
  )
# A category without any open access journal has no match in the last join.
jcit$open_cit[is.na(jcit$open_cit)] <- 0
jcit$pct_open_cit <- 100 * jcit$open_cit / jcit$tot_cit

# Same figures for the dsm2 == 1 journals after removing PLoS One.
open_cit <- jdata %>%
  filter(Journal != "PLoS One", oam_fac == "Open Access", dsm2 == 1) %>%
  summarize(open_cit = sum(citable_2013))
tot_cit <- jdata %>%
  filter(Journal != "PLoS One", dsm2 == 1) %>%
  summarize(tot_cit = sum(citable_2013))
# Divide the two scalars rather than two one-cell data frames.
pct_cit <- round(100 * open_cit$open_cit / tot_cit$tot_cit, 2)

printtab <- tibble(
  DSM = levels(jdata$dsm2_fac),
  Subscription = paste0(tmptable$Subscription, " (", jcit$sub_cit, ")"),
  `Open Access` = paste0(tmptable$`Open Access`, " (", jcit$open_cit, ")"),
  `% Open Access` = paste0(
    round(tmptable$`% Open Access`, 2), "% (",
    round(jcit$pct_open_cit, 2), "%)"
  )
)
# Sort by label in descending order so that "Required" becomes the first row.
printtab <- printtab %>% arrange(desc(DSM))

# Append the "without PLoS One" numbers inside the parentheses of the first
# row; fixed = TRUE makes ")" a literal character instead of a regex.
printtab$`Open Access`[1] <- sub(
  ")", paste0("; ", open_cit$open_cit, "*)"),
  printtab$`Open Access`[1],
  fixed = TRUE
)
printtab$`% Open Access`[1] <- sub(
  ")", paste0("; ", pct_cit, "%*)"),
  printtab$`% Open Access`[1],
  fixed = TRUE
)

kable(printtab)
DSM Subscription Open Access % Open Access
Required 56 (19573) 11 (35234; 3738*) 16.42% (64.29%; 16.04%*)
Not Required 218 (66968) 33 (8555) 13.15% (11.33%)

*After removing PLoS One

2.12 TABLE X: Frequencies of Data Sharing Policies

# Frequency table of the individual data sharing policy items.

# Move the "negative" category of each two-level item to the reference
# (first) position.
jdata$omic_data_sharing_fac <- relevel(jdata$omic_data_sharing_fac, ref = "No")
jdata$copyright_licensing_fac <- relevel(
  jdata$copyright_licensing_fac,
  ref = "No Mention"
)
jdata$archival_retention_fac <- relevel(
  jdata$archival_retention_fac,
  ref = "No Mention"
)
jdata$reproducibility_noted_fac <- relevel(
  jdata$reproducibility_noted_fac,
  ref = "No Mention"
)

varlist <- c(
  "omic_data_sharing_fac", "rec_pref_sharing_fac", "journal_hosts_fac",
  "copyright_licensing_fac", "archival_retention_fac",
  "reproducibility_noted_fac"
)
t1 <- CreateTableOne(varlist, data = jdata)
x1 <- print(t1, printToggle = FALSE, noSpaces = FALSE)

# The printed table carries the variable names as row names; turn them into
# a regular first column.
tmp <- rownames(x1)
tmptab <- add_column(as_data_frame(x1), Variable = tmp, .before = "Overall")

# Swap the internal column names for readable labels, one after another.
tableone_var_labels <- c(
  omic_data_sharing_fac = "Omic Data Sharing Required",
  rec_pref_sharing_fac = "Recommended Sharing Method",
  journal_hosts_fac = "Journal Hosting Limit",
  copyright_licensing_fac = "Copyright Licensing of Data",
  archival_retention_fac = "Archival Retention Policy",
  reproducibility_noted_fac = "Reproducibility or Analogous Concepts Noted as Purpose of Data Policy"
)
tmptab$Variable <- Reduce(
  function(txt, key) gsub(key, tableone_var_labels[[key]], txt),
  names(tableone_var_labels),
  tmptab$Variable
)

# Show the bracketed numbers as percentages.
tmptab$Overall <- gsub(")", "%)", tmptab$Overall)

kable(tmptab)
Variable Overall
n 318
Omic Data Sharing Required = Yes (%) 131 (41.2%)
Recommended Sharing Method (%)
Public Online Repository 125 (39.3%)
Journal Hosted 45 (14.2%)
By Reader Request to Authors 4 ( 1.3%)
Multiple methods equally recommended 11 ( 3.5%)
Unspecified 133 (41.8%)
Journal Hosting Limit (%)
Journal will host regardless of size 1 ( 0.8%)
Journal has data hosting file/s size limit 8 ( 6.6%)
Unspecified 112 (92.6%)
Copyright Licensing of Data = Explicitly stated or mentioned (%) 16 ( 5.0%)
Archival Retention Policy = Explicitly stated (%) 3 ( 0.9%)
Reproducibility or Analogous Concepts Noted as Purpose of Data Policy = Explicitly stated (%) 54 (17.0%)

2.16 Distribution of Impact Factor - Normality Tests

First we must determine whether IF or some transformation of IF is normally distributed. We can visually assess this by plotting histograms and density plots.

# Visual normality check of Impact Factor (IF), one colour per year:
# density and histogram on the raw scale, then again on the log scale.

# raw scale
ggplot(jdata_long, aes(impact_factor, fill = year)) +
  geom_density(alpha = 0.4) +
  theme_minimal() +
  xlab("Impact Factor") +
  ggtitle("Density of IF by Year")
ggplot(jdata_long, aes(impact_factor, fill = year)) +
  geom_histogram(alpha = 0.4, bins = 40) +
  theme_minimal() +
  xlab("Impact Factor") +
  ggtitle("Histogram of IF by Year")

# log scale
ggplot(jdata_long, aes(log(impact_factor), fill = year)) +
  geom_density(alpha = 0.4) +
  theme_minimal() +
  xlab("log(IF)") +
  ggtitle("Density of log(IF) by Year")
# The x axis shows log(IF), so it is labelled like the log density plot above.
ggplot(jdata_long, aes(log(impact_factor), fill = year)) +
  geom_histogram(alpha = 0.4, bins = 40) +
  theme_minimal() +
  xlab("log(IF)") +
  ggtitle("Histogram of log(IF) by Year")

We can also test the Normality assumption with the Shapiro Wilk’s Test. We can test IF by year, as well as log(IF). We can also test within each DSM group since the main assumption for ANOVA is that the outcome (IF) is normal within each group.

# Shapiro-Wilk tests of IF and log(IF) for 2013 and 2014 over all journals,
# one tidy result row per tested vector.
norm_test1 <- data.frame(
  Y = c("IF 2013", "IF 2014", "log(IF) 2013", "log(IF) 2014"),
  do.call(
    rbind,
    lapply(
      list(
        jdata$if_2013, jdata$if_2014,
        log(jdata$if_2013), log(jdata$if_2014)
      ),
      function(values) broom::tidy(shapiro.test(values))
    )
  )
)
# Keep the p-values in scientific notation with 2 significant digits by
# converting them to text before kable() rounds the numeric columns.
norm_test1$p.value <- as.character(signif(norm_test1$p.value, 2))
kable(norm_test1, digits = 4, caption = "Shapiro Wilk Normality Test p-values of IF")
Shapiro Wilk Normality Test p-values of IF
Y statistic p.value method
IF 2013 0.6353 1.7e-25 Shapiro-Wilk normality test
IF 2014 0.6315 1.6e-25 Shapiro-Wilk normality test
log(IF) 2013 0.9105 8.3e-13 Shapiro-Wilk normality test
log(IF) 2014 0.9467 2.8e-09 Shapiro-Wilk normality test
# Shapiro-Wilk p-values of IF and log(IF) (2013, 2014) within each DSM group.
norm_test <- jdata %>%
  group_by(dsm_fac) %>%
  summarize(
    if_2013.pvalue = shapiro.test(if_2013)$p.value,
    if_2014.pvalue = shapiro.test(if_2014)$p.value,
    log_if_2013.pvalue = shapiro.test(log(if_2013))$p.value,
    log_if_2014.pvalue = shapiro.test(log(if_2014))$p.value
  )
# Format every p-value column as text with 2 significant digits. The columns
# are converted one by one (lapply) instead of via apply(), which would first
# coerce the data frame to a matrix and hand a character matrix back to the
# tibble.
norm_test[, -1] <- lapply(
  norm_test[, -1],
  function(p) as.character(signif(p, 2))
)

kable(norm_test, digits = 4, caption = "Shapiro Wilk Normality Test p-values of IF within DSM Groups")
Shapiro Wilk Normality Test p-values of IF within DSM Groups
dsm_fac if_2013.pvalue if_2014.pvalue log_if_2013.pvalue log_if_2014.pvalue
Required as condition of publication, barring exceptions 6e-06 8.6e-06 0.75 0.95
Required but no explicit statement regarding effect on publication/editorial decisions 9.5e-05 4.4e-05 0.18 0.15
Explicitly encouraged/addressed, but not required. 1.4e-10 9.4e-11 0.0042 0.0042
Mentioned indirectly 0.0016 0.00063 0.78 0.66
Only protein, proteomic, and/or genomic data sharing are addressed. 0.00016 1e-04 0.036 0.095
No mention 3.2e-06 2.6e-07 1.7e-10 1.7e-07

The p-values from the Shapiro Wilk’s test are mostly very significant (hence we reject the normality assumption) and in general Impact Factor appears to be quite skewed, so nonparametric tests (Wilcoxon, Kruskal-Wallis) will be more appropriate. Taking the log helps somewhat but does not solve the problem. Furthermore, log-IF is much less interpretable. Hence, we use nonparametric tests to compare distributions.

2.17 Impact Factor vs. Data Sharing Mark

We wish to assess how Impact Factor differs between journals with different data sharing types.

2.17.1 Summarize Impact Factor Within Groups

We can describe the minimum, maximum, mean, and median of Impact factor as well as total citations for each DSM type.

# Per DSM level and year: number of journals plus min / mean / median / max
# of Impact Factor and of total cites. Missing values are dropped from each
# statistic; TRUE is spelled out because T is an ordinary, reassignable
# variable.
tmpsum <- jdata_long %>%
  group_by(dsm, dsm_fac, year) %>%
  summarize(
    `Number of Journals` = n(),
    min_IF = min(impact_factor, na.rm = TRUE),
    mean_IF = mean(impact_factor, na.rm = TRUE),
    median_IF = median(impact_factor, na.rm = TRUE),
    max_IF = max(impact_factor, na.rm = TRUE),
    min_TotalCites = min(total_cites, na.rm = TRUE),
    mean_TotalCites = mean(total_cites, na.rm = TRUE),
    median_TotalCites = median(total_cites, na.rm = TRUE),
    max_TotalCites = max(total_cites, na.rm = TRUE)
  )
tmpsum <- tmpsum %>% rename(DSM = dsm_fac)
kable(tmpsum, digits = 2)
dsm DSM year Number of Journals min_IF mean_IF median_IF max_IF min_TotalCites mean_TotalCites median_TotalCites max_TotalCites
1 Required as condition of publication, barring exceptions 2013 38 2.25 10.09 8.22 42.35 374 43076.95 19967.5 590324
1 Required as condition of publication, barring exceptions 2014 38 1.99 9.87 8.08 41.46 748 48221.05 23867.0 617363
1 Required as condition of publication, barring exceptions 2015 38 2.11 9.43 7.47 38.14 681 35612.34 15361.0 627846
2 Required but no explicit statement regarding effect on publication/editorial decisions 2013 29 2.00 9.66 6.34 33.12 979 62583.83 15492.0 565934
2 Required but no explicit statement regarding effect on publication/editorial decisions 2014 29 1.82 9.53 5.62 33.61 1448 65363.17 17231.0 586144
2 Required but no explicit statement regarding effect on publication/editorial decisions 2015 29 1.53 9.43 5.30 34.66 2080 55740.28 11614.0 593284
3 Explicitly encouraged/addressed, but not required. 2013 74 2.13 4.27 3.88 16.75 335 17758.84 9696.0 167915
3 Explicitly encouraged/addressed, but not required. 2014 74 2.12 4.24 3.82 17.57 577 18691.88 10228.5 173265
3 Explicitly encouraged/addressed, but not required. 2015 74 2.05 4.18 3.79 17.30 410 11499.77 8038.0 76694
4 Mentioned indirectly 2013 29 1.09 3.61 3.29 9.92 1491 11688.97 6787.0 47233
4 Mentioned indirectly 2014 29 1.24 3.49 3.10 9.57 1798 12351.10 7306.0 48946
4 Mentioned indirectly 2015 29 1.52 3.47 3.20 8.98 1975 8548.17 5989.0 37148
5 Only protein, proteomic, and/or genomic data sharing are addressed. 2013 47 1.32 5.65 5.01 13.91 440 38265.09 20478.0 406586
5 Only protein, proteomic, and/or genomic data sharing are addressed. 2014 47 1.52 5.52 4.66 12.52 557 38497.47 20513.0 396051
5 Only protein, proteomic, and/or genomic data sharing are addressed. 2015 47 1.68 5.20 4.32 12.48 676 18308.17 8389.0 143465
6 No mention 2013 101 0.07 3.94 3.48 11.98 115 8053.62 4693.0 46347
6 No mention 2014 101 0.22 3.90 3.50 12.41 384 8660.03 5078.0 45541
6 No mention 2015 101 0.22 3.85 3.48 14.81 410 6494.86 4832.0 32282
# Keep the 6-category summary under its own name; `tmpsum` is reassigned by
# the two-category summary further down.
tmpsum_IF_within_DSM <- tmpsum

We can also visualize the distribution with a boxplot of IF by DSM type.

Boxplot description: The lower and upper “hinges” correspond to the first and third quartiles (the 25th and 75th percentiles). The upper whisker extends from the hinge to the highest value that is within 1.5 * IQR of the hinge, where IQR is the inter-quartile range, or distance between the first and third quartiles. The lower whisker extends from the hinge to the lowest value within 1.5 * IQR of the hinge. Data beyond the end of the whiskers are outliers and plotted as points (as specified by Tukey). (from geom_boxplot help page in the R ggplot2 package.) The Triangle represents the mean.

# Boxplots of Impact Factor by DSM (x axis, fill colour) and year
# (transparency); the group mean is overlaid as an open triangle (shape 2).

# 2013 and 2014 only
jdata_long %>%
  filter(year != 2015) %>%
  ggplot(aes(x = dsm, y = impact_factor, fill = dsm_fac, alpha = year)) +
  geom_boxplot(position = position_dodge(0.9)) +
  theme_minimal() +
  scale_alpha_manual(values = c(0.3, 0.7)) +
  stat_summary(
    fun.y = mean,
    geom = "point",
    position = position_dodge(0.9),
    aes(x = dsm, y = impact_factor, shape = "triangle")
  ) +
  scale_shape_manual(values = 2) +
  guides(
    alpha = "none",
    shape = "none",
    fill = guide_legend(title = "Data Sharing Mark", ncol = 1)
  ) +
  xlab("Data Sharing Mark (1-6), Year (2013 light, 2014 dark)") +
  ylab("Impact Factor") +
  # ggtitle('Boxplot of Impact Factor by Data Sharing Mark')+
  scale_x_continuous(breaks = 1:6) +
  theme(legend.position = "bottom")

# all three years
jdata_long %>%
  ggplot(aes(x = dsm, y = impact_factor, fill = dsm_fac, alpha = year)) +
  geom_boxplot(position = position_dodge(0.9)) +
  theme_minimal() +
  scale_alpha_manual(values = c(0.3, 0.5, 0.8)) +
  stat_summary(
    fun.y = mean,
    geom = "point",
    position = position_dodge(0.9),
    aes(x = dsm, y = impact_factor, shape = "triangle")
  ) +
  scale_shape_manual(values = 2) +
  guides(
    alpha = "none",
    shape = "none",
    fill = guide_legend(title = "Data Sharing Mark", ncol = 1)
  ) +
  xlab("Data Sharing Mark (1-6), Year (2013-2015 light-dark)") +
  ylab("Impact Factor") +
  # ggtitle('Boxplot of Impact Factor by Data Sharing Mark')+
  scale_x_continuous(breaks = 1:6) +
  theme(legend.position = "bottom")

2.18 FIGURE 2: Impact factors were higher for journals with the strongest data sharing policies (DSM 1) compared to journals with no mention of data sharing (DSM 6).

The median Impact Factor was calculated for the journals with each data sharing mark for each report year (light color=2013, dark color=2014). The lower and upper hinges of the boxplots represent the first and third quartiles of journal Impact Factor, the horizontal line represents the median, the triangle represents the mean, and the upper and lower whiskers extend from the hinge to the highest (lowest) value that is within 1.5 times the interquartile range of the hinge, with journals outside this range represented as points.

# FIGURE 2: Impact Factor by DSM for 2013 and 2014, in shades of blue.
tmplong <- filter(jdata_long, year != 2015)
# Copy of the DSM factor with its six levels in reverse order; it is used for
# the fill so the manual palette below is assigned from level 6 to level 1.
tmplong <- tmplong %>%
  mutate(dsm_fac_reverse = factor(dsm_fac, levels = levels(dsm_fac)[6:1]))

tmplong %>%
  ggplot(aes(x = dsm, y = impact_factor, alpha = year, fill = dsm_fac_reverse)) +
  geom_boxplot(position = position_dodge(.9)) +
  theme_minimal() +
  scale_alpha_manual(values = c(0.7, 0.9)) +
  # group means drawn as open triangles (shape 2)
  stat_summary(
    fun.y = mean,
    geom = "point",
    position = position_dodge(.9),
    aes(x = dsm, y = impact_factor, shape = "triangle")
  ) +
  scale_shape_manual(values = 2) +
  # Entries 2 to 7 of the 7-colour "Blues" palette; the legend lists the
  # levels in their original order, prefixed with the DSM number.
  scale_fill_manual(
    values = RColorBrewer::brewer.pal(n = 7, name = "Blues")[2:7],
    # values=RColorBrewer::brewer.pal(n=6,name="Blues"),
    breaks = levels(tmplong$dsm_fac),
    labels = paste(1:6, levels(tmplong$dsm_fac))
  ) +
  guides(
    alpha = "none",
    shape = "none",
    fill = guide_legend(title = "Data Sharing Mark", ncol = 1)
  ) +
  xlab("Data Sharing Mark (1-6), Year (2013 light, 2014 dark)") +
  ylab("Impact Factor") +
  # ggtitle("Boxplot of Impact Factor by Data Sharing Mark")+
  scale_x_continuous(breaks = 1:6) +
  theme(legend.position = "bottom")

# Boxplot of citable items per journal by DSM and year, leaving out PLoS One;
# group means are overlaid as open triangles and no legend is drawn.
jdata_long %>%
  filter(Journal != "PLoS One") %>%
  ggplot(aes(x = dsm, y = citable_items, fill = dsm_fac, alpha = year)) +
  geom_boxplot(position = position_dodge(0.9)) +
  theme_minimal() +
  scale_alpha_manual(values = c(0.3, 0.5, 0.8)) +
  stat_summary(
    fun.y = mean,
    geom = "point",
    position = position_dodge(0.9),
    aes(x = dsm, y = citable_items, shape = "triangle")
  ) +
  scale_shape_manual(values = 2) +
  guides(alpha = "none", shape = "none", fill = "none") +
  xlab("Data Sharing Mark (1-6), Year (2013-2015 light-dark)") +
  # ggtitle('Number of Citable Items per Journal by Data Sharing Mark')+
  ylab("Number of Citable Items per Journal") +
  scale_x_continuous(breaks = 1:6) +
  theme(legend.position = "bottom")

Collapsing DSM into two categories:

# Per two-category DSM and year: number of journals plus min / mean / median
# / max of Impact Factor and of total cites. Missing values are dropped from
# each statistic; TRUE is spelled out because T is an ordinary, reassignable
# variable.
tmpsum <- jdata_long %>%
  group_by(dsm2_fac, year) %>%
  summarize(
    `Number of Journals` = n(),
    min_IF = min(impact_factor, na.rm = TRUE),
    mean_IF = mean(impact_factor, na.rm = TRUE),
    median_IF = median(impact_factor, na.rm = TRUE),
    max_IF = max(impact_factor, na.rm = TRUE),
    min_TotalCites = min(total_cites, na.rm = TRUE),
    mean_TotalCites = mean(total_cites, na.rm = TRUE),
    median_TotalCites = median(total_cites, na.rm = TRUE),
    max_TotalCites = max(total_cites, na.rm = TRUE)
  )
tmpsum <- tmpsum %>% rename(DSM = dsm2_fac)
kable(tmpsum, digits = 2)
DSM year Number of Journals min_IF mean_IF median_IF max_IF min_TotalCites mean_TotalCites median_TotalCites max_TotalCites
Not Required 2013 251 0.07 4.32 3.99 16.75 115 16992.07 7897 406586
Not Required 2014 251 0.22 4.26 3.88 17.57 384 17703.23 8490 396051
Not Required 2015 251 0.22 4.16 3.75 17.30 410 10463.00 6914 143465
Required 2013 67 2.00 9.91 6.79 42.35 374 51520.22 17144 590324
Required 2014 67 1.82 9.72 7.05 41.46 748 55640.78 18098 617363
Required 2015 67 1.53 9.43 6.40 38.14 681 44324.43 14322 627846
# Keep the two-category summary under its own name.
tmpsum_IF_within_DSM2 <- tmpsum

# Boxplot of Impact Factor by two-category DSM and year; group means are
# overlaid as open triangles (shape 2).
# NOTE(review): the boxes use dsm2_fac_flip on the x axis while the mean
# layer uses dsm2_fac - confirm both factors share the same level labels.
jdata_long %>%
  ggplot(aes(
    x = dsm2_fac_flip, y = impact_factor,
    fill = dsm2_fac_flip, alpha = year
  )) +
  geom_boxplot(position = position_dodge(0.9)) +
  theme_minimal() +
  scale_alpha_manual(values = c(0.3, 0.5, 0.8)) +
  stat_summary(
    fun.y = mean,
    geom = "point",
    position = position_dodge(0.9),
    aes(x = dsm2_fac, y = impact_factor, shape = "triangle")
  ) +
  scale_shape_manual(values = 2) +
  guides(
    alpha = "none",
    shape = "none",
    fill = guide_legend(title = "Data Sharing")
  ) +
  xlab("Data Sharing Required (No, Yes), Year (2013-2015 light-dark)") +
  # ggtitle('Boxplot of Impact Factor by Data Sharing Requirement')+
  ylab("Impact Factor")

# Same layout for citable items per journal, leaving out PLoS One.
jdata_long %>%
  filter(Journal != "PLoS One") %>%
  ggplot(aes(
    x = dsm2_fac_flip, y = citable_items,
    fill = dsm2_fac_flip, alpha = year
  )) +
  geom_boxplot(position = position_dodge(0.9)) +
  theme_minimal() +
  stat_summary(
    fun.y = mean,
    geom = "point",
    position = position_dodge(0.9),
    aes(x = dsm2_fac, y = citable_items, shape = "triangle")
  ) +
  scale_alpha_manual(values = c(0.3, 0.5, 0.8)) +
  scale_shape_manual(values = 2) +
  guides(
    alpha = "none",
    shape = "none",
    fill = guide_legend(title = "Data Sharing")
  ) +
  xlab("Data Sharing Required (No, Yes), Year (2013-2015 light-dark)") +
  # ggtitle('Number of Citable Items per Journal by Data Sharing
  # Requirement')+
  ylab("Number of Citable Items per Journal")

3 Analysis: Journal Impact Factor, Open Access, Data Sharing Mark

3.1 Methods

Continuous variables are summarized with medians and interquartile ranges (IQRs) denoting the 25th and 75th percentiles. Categorical variables are summarized with counts and percentages. The variables Impact Factor and total citable items are not normally distributed (Shapiro Wilk’s Test p-values < 0.001) so medians are presented instead of means, and nonparametric methods are used for statistical tests.

The association of Impact Factor (IF) with 6-level data sharing mark (DSM) was tested with a nonparametric Kruskal-Wallis one-way analysis of variance (ANOVA) of IF in 2013 and 2014 with DSM as a grouping factor. Post-hoc pairwise two-sample Wilcoxon tests were used to determine whether the median IF for journals differ between the two level data sharing policy (required vs. not required) categories. P-values from the Wilcoxon tests were adjusted for multiple comparisons with the Holm procedure.

Pearson’s chi-square test was used to test the association of data sharing policy (two levels: required vs not required) and open access status. Fisher’s Exact Test was used to test the association of the 6-level DSM with open access status. Fisher’s Test was used as opposed to Chi-square test due to the low number of Open Access journals within some DSM categories. To examine the association of open access status and data sharing weighted by publishing volume we examined the number of citable items in each category and tested for the association of open access and data sharing with Pearson’s chi-square test.

All statistical analyses were performed with R version 3.2.1 (2015-06-18). All code and data to reproduce these results can be found on github (https://github.com/OHSU-Ontology-Development-Group/DataSharingPolicies).

3.2 Notes

  • The Kruskal-Wallis test is a nonparametric version of ANOVA that tests whether the distribution of IF varies between DSM groups.
  • The Wilcoxon test is testing the difference in medians between two groups when the distributions of the outcome (IF in this case) are the same. Based on the boxplots and densities the distributions do look similar within DSM group so we are comfortable making inferences on the medians.

3.3 IF ~ DSM

We perform a Kruskal Wallis test for the difference in median JIF between 6 category DSM.

We can also collapse DSM into two categories and perform a Wilcoxon test for difference in median JIF between required vs not required data sharing.

# Kruskal-Wallis tests of IF across the six DSM levels, one per year.
k_if13 <- kruskal.test(if_2013 ~ dsm, data = jdata)
k_if14 <- kruskal.test(if_2014 ~ dsm, data = jdata)
k_if15 <- kruskal.test(if_2015 ~ dsm, data = jdata)

# Pairwise Wilcoxon tests between DSM levels (default p-value adjustment).
w_if13 <- with(jdata, pairwise.wilcox.test(if_2013, dsm))
w_if14 <- with(jdata, pairwise.wilcox.test(if_2014, dsm))
w_if15 <- with(jdata, pairwise.wilcox.test(if_2015, dsm))

# Wilcoxon rank sum tests of IF between the two collapsed DSM categories.
w_if13_2cat <- wilcox.test(if_2013 ~ dsm2, data = jdata)
w_if14_2cat <- wilcox.test(if_2014 ~ dsm2, data = jdata)
w_if15_2cat <- wilcox.test(if_2015 ~ dsm2, data = jdata)

# kw results: one tidy row per Kruskal-Wallis / two-category Wilcoxon test
kw_results <- data.frame(
  Y = c(
    "if_2013_6catDSM", "if_2014_6catDSM", "if_2015_6catDSM",
    "if_2013_2catDSM", "if_2014_2catDSM", "if_2015_2catDSM"
  ),
  bind_rows(lapply(
    list(k_if13, k_if14, k_if15, w_if13_2cat, w_if14_2cat, w_if15_2cat),
    broom::tidy
  ))
)
# p-values become text with 2 significant digits so kable() does not round
# them to zero.
kw_results$p.value <- as.character(signif(kw_results$p.value, 2))
kw_results <- rename(kw_results, `degrees of freedom` = parameter)
kable(kw_results, digits = 2)
Y statistic p.value degrees of freedom method alternative
if_2013_6catDSM 61.77 5.2e-12 5 Kruskal-Wallis rank sum test NA
if_2014_6catDSM 60.04 1.2e-11 5 Kruskal-Wallis rank sum test NA
if_2015_6catDSM 52.23 4.8e-10 5 Kruskal-Wallis rank sum test NA
if_2013_2catDSM 4339.00 1.2e-09 NA Wilcoxon rank sum test with continuity correction two.sided
if_2014_2catDSM 4279.50 9.5e-10 NA Wilcoxon rank sum test with continuity correction two.sided
if_2015_2catDSM 4268.00 1.2e-09 NA Wilcoxon rank sum test with continuity correction two.sided

The pairwise Wilcoxon test p-values are below (by DSM group number) for 2013, 2014, and 2015. The p-values are adjusted for multiple comparisons with the holm method.

Significant p-values suggest that the median JIF is different for journals in the two DSM categories.

Pairwise Wilcoxon p-values for IF 2013 between DSM:

# Pairwise Wilcoxon p-values for IF 2013, rounded to 2 significant digits.
signif_13 <- signif(w_if13$p.value, 2)
# Row/column positions of the cells with p < 0.05; these are printed in bold
# in the next pandoc table. TRUE is spelled out because T is reassignable.
signifind <- which(signif_13 < 0.05, arr.ind = TRUE)
emphasize.strong.cells(signifind)
# Display strings for very small p-values and for the empty (NA) cells; the
# assignment of a string turns the matrix into a character matrix.
signif_13[which(signif_13 < 0.001, arr.ind = TRUE)] <- "< 0.001"
signif_13[which(is.na(signif_13), arr.ind = TRUE)] <- ""
pandoc.table(signif_13)
  1 2 3 4 5
2 0.86
3 < 0.001 0.034
4 < 0.001 0.0072 0.23
5 0.04 0.86 0.0022 < 0.001
6 < 0.001 0.0033 0.69 0.86 < 0.001

Pairwise Wilcoxon p-values for IF 2014 between DSM:

# Pairwise Wilcoxon p-values for IF 2014, rounded to 2 significant digits.
signif_14 <- signif(w_if14$p.value, 2)
# Row/column positions of the cells with p < 0.05; these are printed in bold
# in the next pandoc table. TRUE is spelled out because T is reassignable.
signifind <- which(signif_14 < 0.05, arr.ind = TRUE)
emphasize.strong.cells(signifind)
# Display strings for very small p-values and for the empty (NA) cells; the
# assignment of a string turns the matrix into a character matrix.
signif_14[which(signif_14 < 0.001, arr.ind = TRUE)] <- "< 0.001"
signif_14[which(is.na(signif_14), arr.ind = TRUE)] <- ""
pandoc.table(signif_14)
  1 2 3 4 5
2 0.82
3 < 0.001 0.034
4 < 0.001 0.0058 0.17
5 0.016 0.82 0.0062 < 0.001
6 < 0.001 0.0035 0.82 0.82 < 0.001

Pairwise Wilcoxon p-values for IF 2015 between DSM:

# Pairwise Wilcoxon p-values for IF 2015, rounded to 2 significant digits.
signif_15 <- signif(w_if15$p.value, 2)
# Row/column positions of the cells with p < 0.05; these are printed in bold
# in the next pandoc table. TRUE is spelled out because T is reassignable.
signifind <- which(signif_15 < 0.05, arr.ind = TRUE)
emphasize.strong.cells(signifind)
# Display strings for very small p-values and for the empty (NA) cells; the
# assignment of a string turns the matrix into a character matrix.
signif_15[which(signif_15 < 0.001, arr.ind = TRUE)] <- "< 0.001"
signif_15[which(is.na(signif_15), arr.ind = TRUE)] <- ""
pandoc.table(signif_15)
  1 2 3 4 5
2 0.89
3 < 0.001 0.051
4 < 0.001 0.0062 0.15
5 0.0098 0.42 0.14 0.012
6 < 0.001 0.0051 0.42 0.89 0.0062

3.3.1 Results

Impact factor is significantly associated with the six category data sharing mark (Kruskal-Wallis rank sum test, 5 df, p < 0.001, 2013 and 2014). Examining pairwise differences between DSMs we see that journals with DSM 1 have significantly higher JIF than journals with DSM 3, 4, 5, or 6 (Wilcoxon test, p < 0.001, < 0.001, 0.04, < 0.001; 2013 data, 2014 similar). Journals with DSM 2 have significantly higher JIF than journals with DSM 3, 4, or 6 (Wilcoxon test, p = 0.034, 0.0072, 0.0033; 2013 data, 2014 similar). Journals with DSM 5 have significantly higher JIF than journals with DSM 3, 4, and 6 (Wilcoxon test, p = 0.0022, < 0.001, < 0.001; 2013 data, 2014 similar). In general, IF is not significantly different between DSM 1&2 and DSM 2&5, reflecting the similar JIF for journals with data sharing requirements, either full or partial sharing. After collapsing DSM into two categories, required (DSM 1-2) and not required (DSM 3-6) we still see a highly significant increase in JIF for journals with required data sharing (Wilcoxon Rank Sum Test, p < 0.001, 2013 and 2014 data). The median JIFs for DSM 1-6 are 8.22, 6.34, 3.88, 3.29, 5.01, 3.48 in 2013 and 8.08, 5.62, 3.82, 3.1, 4.66, 3.5 in 2014. The median JIFs for the collapsed two category DSM required and not required are 6.79, 3.99 in 2013 and 7.05, 3.88 in 2014.

Results for 2015 are similar, except DSM 3 and 5 are no longer significantly different.

3.4 Kruskal-Wallis test for year effect

A Kruskal-Wallis test was performed for year effect with impact factor and number of citable items. There were no significant differences in the distribution of impact factor or citable items by year, either in the entire data set or in subsets defined by DSM group.

3.4.1 Impact Factor

# Kruskal-Wallis test of Impact Factor across years: first on all journals,
# then separately within each DSM level 1-6.
tmp1 <- broom::tidy(
  kruskal.test(impact_factor ~ as.factor(year), data = jdata_long)
)

tmp2 <- bind_rows(lapply(1:6, function(level) {
  one_dsm <- filter(jdata_long, dsm == level)
  broom::tidy(kruskal.test(impact_factor ~ as.factor(year), data = one_dsm))
}))
tmp <- bind_rows(tmp1, tmp2)
# Label the rows: the overall test first, then one row per DSM level.
tmp <- data.frame(
  DSM = c("All", paste0("DSM = ", 1:6)),
  tmp,
  check.names = FALSE
)
kable(tmp, digits = 3)
DSM statistic p.value parameter method
All 1.286 0.526 2 Kruskal-Wallis rank sum test
DSM = 1 0.186 0.911 2 Kruskal-Wallis rank sum test
DSM = 2 0.055 0.973 2 Kruskal-Wallis rank sum test
DSM = 3 0.260 0.878 2 Kruskal-Wallis rank sum test
DSM = 4 0.225 0.894 2 Kruskal-Wallis rank sum test
DSM = 5 3.514 0.173 2 Kruskal-Wallis rank sum test
DSM = 6 0.207 0.902 2 Kruskal-Wallis rank sum test

3.4.2 Citable Items

# Kruskal-Wallis test of citable items across years: first on all journals,
# then separately within each DSM level 1-6.
tmp1 <- broom::tidy(
  kruskal.test(citable_items ~ as.factor(year), data = jdata_long)
)

tmp2 <- bind_rows(lapply(1:6, function(level) {
  one_dsm <- filter(jdata_long, dsm == level)
  broom::tidy(kruskal.test(citable_items ~ as.factor(year), data = one_dsm))
}))
tmp <- bind_rows(tmp1, tmp2)
# Label the rows: the overall test first, then one row per DSM level.
tmp <- data.frame(
  DSM = c("All", paste0("DSM = ", 1:6)),
  tmp,
  check.names = FALSE
)
kable(tmp, digits = 3)
DSM statistic p.value parameter method
All 0.019 0.990 2 Kruskal-Wallis rank sum test
DSM = 1 0.051 0.975 2 Kruskal-Wallis rank sum test
DSM = 2 0.135 0.935 2 Kruskal-Wallis rank sum test
DSM = 3 0.293 0.864 2 Kruskal-Wallis rank sum test
DSM = 4 0.584 0.747 2 Kruskal-Wallis rank sum test
DSM = 5 0.331 0.848 2 Kruskal-Wallis rank sum test
DSM = 6 0.002 0.999 2 Kruskal-Wallis rank sum test

3.5 Open Access vs. DSM

3.5.1 Fisher’s Exact Test for 6 category DSM vs. OAM

The Fisher’s Exact Test for DSM vs. OAM tests for the independence of the categories of DSM and OAM (unordered).

Table of counts for 6 category DSM and two category OAM.

# Journal counts: open access status (rows) by 6-category DSM (columns).
tab1 <- table(oam_fac = jdata$oam_fac, dsm = jdata$dsm)
kable(tab1)
1 2 3 4 5 6
Subscription 29 27 63 29 40 86
Open Access 9 2 11 0 7 15

Table of proportion with Open Access in each DSM category:

# Share of journals in each DSM category that fall in the second row of tab1,
# i.e. the second-row count divided by the column total.
tibble(
  DSM = dsm_labels,
  `Proportion Open Access` = round(prop.table(tab1, margin = 2)[2, ], 2)
) %>%
  kable()
DSM Proportion Open Access
Required as condition of publication, barring exceptions 0.24
Required but no explicit statement regarding effect on publication/editorial decisions 0.07
Explicitly encouraged/addressed, but not required. 0.15
Mentioned indirectly 0.00
Only protein, proteomic, and/or genomic data sharing are addressed. 0.15
No mention 0.15

We can test for the association of DSM and OAM with Fisher’s Exact Test. The Test result is below:

# Fisher's exact test on the open access by DSM count table, as a tidy row.
fishres <- tab1 %>%
  fisher.test() %>%
  broom::tidy()
kable(fishres)
p.value method alternative
0.0697445 Fisher’s Exact Test for Count Data two.sided

3.5.2 Chi-Square test for 2 category DSM vs. OAM

Collapsing the categories into a 2x2 table makes the test hypothesis and result easier to interpret. When we collapse DSM into two categories (required vs. not required) there are more counts in each cell so we do not need to use the Fisher’s Exact Test but instead can use a Chi-square test (commonly used for large samples).

The number of journals in each of the 2x2 table categories are below:

# Journal counts: open access status (rows) by two-category DSM (columns).
tab2 <- table(oam_fac = jdata$oam_fac, dsm2_fac = jdata$dsm2_fac)
kable(tab2)
Not Required Required
Subscription 218 56
Open Access 33 11

Table of proportion with Open Access in each DSM category:

# Mean of the `oam` indicator within each two-category DSM group.
jdata %>%
  group_by(DSM = dsm2_fac) %>%
  summarize(`Proportion  Open Access` = mean(oam)) %>%
  kable(digits = 3)
DSM Proportion Open Access
Not Required 0.131
Required 0.164

Table of proportion data sharing required in each Open Access category:

# Mean of the `dsm2` indicator within each open access category.
jdata %>%
  group_by(OAM = oam_fac) %>%
  summarize(`Proportion  DSM Required` = mean(dsm2)) %>%
  kable(digits = 3)
OAM Proportion DSM Required
Subscription 0.204
Open Access 0.250
# Chi-square test on the 2x2 open access by DSM-required table; the tidy
# `parameter` column (degrees of freedom) is shown as `df`.
chires <- tab2 %>%
  chisq.test() %>%
  broom::tidy() %>%
  rename(df = parameter)
kable(chires, digits = 3)
statistic p.value df method
0.24 0.624 1 Pearson’s Chi-squared test with Yates’ continuity correction

3.5.3 Results

The Fisher’s Exact test is testing the hypothesis that open access status is associated with data sharing mark. The test is not significant (Fisher’s Exact Test, p = 0.07) which suggests that the proportion of open access journals is not significantly different across data sharing mark categories.

The Chi-square test is testing the hypothesis that open access status is associated with data sharing requirement (two categories DSM 1-2 vs DSM 3-6). The test is not significant (Chi-square Test, df=1, p = 0.62) which suggests that journals with data sharing requirements are not any more likely to be open access than journals without data sharing requirement. Also, open access journals are not more likely to have data sharing requirements than subscription journals. This is further supported by the evidence that the proportion of open access journals is similar for data sharing “required” vs. “non-required” journals.

4 Publishing Volume

4.1 All Journals

Here we determine how data sharing and open access are related once incorporating publishing volume. In this case, we are considering the “citable item” as the unit of measurement as opposed to journal. In other words, we ask, if we are given a citable item that is open access, is it more likely to have data sharing requirements than a citable item that is subscription based?

4.1.1 Summary of Number of Citable Items by DSM and OAM

In 2013, the total number of citable items in the set of studied journals was 130330, in 2014 it was 131107 and in 2015 it was 130277.

Summary of citable items in 2013/2014/2015:

# Number of journals and total citable items per year and DSM level.
# Missing citable-item counts are ignored; TRUE is spelled out because T is
# an ordinary, reassignable variable.
tmp <- jdata_long %>%
  group_by(year, dsm) %>%
  summarize(
    `Num Journals` = n(),
    `Total Citable` = sum(citable_items, na.rm = TRUE)
  ) %>%
  rename(DSM = dsm)
kable(tmp, caption = "Citable Items by DSM")
Citable Items by DSM
year DSM Num Journals Total Citable
2013 1 38 42669
2013 2 29 12138
2013 3 74 25519
2013 4 29 8062
2013 5 47 19339
2013 6 101 22603
2014 1 38 42794
2014 2 29 12436
2014 3 74 26026
2014 4 29 7894
2014 5 47 19080
2014 6 101 22877
2015 1 38 40870
2015 2 29 14233
2015 3 74 26731
2015 4 29 7928
2015 5 47 17734
2015 6 101 22781
# Number of journals and total citable items per year and two-category DSM.
# Missing citable-item counts are ignored; TRUE is spelled out because T is
# an ordinary, reassignable variable.
tmp <- jdata_long %>%
  group_by(year, dsm2_fac) %>%
  summarize(
    `Num Journals` = n(),
    `Total Citable` = sum(citable_items, na.rm = TRUE)
  ) %>%
  rename(DSM = dsm2_fac)
kable(tmp, caption = "Citable Items by Required/Not Required DSM")
Citable Items by Required/Not Required DSM
year DSM Num Journals Total Citable
2013 Not Required 251 75523
2013 Required 67 54807
2014 Not Required 251 75877
2014 Required 67 55230
2015 Not Required 251 75174
2015 Required 67 55103
# Number of journals and total citable items per year and open access status.
# Missing citable-item counts are ignored; TRUE is spelled out because T is
# an ordinary, reassignable variable.
tmp <- jdata_long %>%
  group_by(year, oam_fac) %>%
  summarize(
    `Num Journals` = n(),
    `Total Citable` = sum(citable_items, na.rm = TRUE)
  ) %>%
  rename(OAM = oam_fac)
kable(tmp, caption = "Citable Items by Open Access")
Citable Items by Open Access
year OAM Num Journals Total Citable
2013 Subscription 274 86541
2013 Open Access 44 43789
2014 Subscription 274 85276
2014 Open Access 44 45831
2015 Subscription 274 82704
2015 Open Access 44 47573

Summary with proportions:

# Per year and DSM level: journals, total citable items, and the share of
# those citable items that come from journals with oam == 1.
# Missing counts are ignored; TRUE is spelled out because T is reassignable.
tmp <- jdata_long %>%
  group_by(year, dsm) %>%
  summarize(
    `Num Journals` = n(),
    `Total Citable` = sum(citable_items, na.rm = TRUE),
    `Proportion Citable Items Open Access` =
      sum(citable_items[oam == 1], na.rm = TRUE) /
        sum(citable_items, na.rm = TRUE)
  ) %>%
  rename(DSM = dsm)
kable(tmp, digits = 3)
year DSM Num Journals Total Citable Proportion Citable Items Open Access
2013 1 38 42669 0.819
2013 2 29 12138 0.023
2013 3 74 25519 0.103
2013 4 29 8062 0.000
2013 5 47 19339 0.100
2013 6 101 22603 0.176
2014 1 38 42794 0.818
2014 2 29 12436 0.086
2014 3 74 26026 0.136
2014 4 29 7894 0.000
2014 5 47 19080 0.117
2014 6 101 22877 0.174
2015 1 38 40870 0.812
2015 2 29 14233 0.232
2015 3 74 26731 0.149
2015 4 29 7928 0.000
2015 5 47 17734 0.115
2015 6 101 22781 0.223
# Per year x Required/Not Required DSM: totals plus the open-access share of
# citable items (rows with oam == 1).
tmp <- jdata_long %>%
  group_by(year, dsm2_fac) %>%
  summarize(
    `Num Journals` = n(),
    `Total Citable` = sum(citable_items, na.rm = TRUE),
    `Proportion Citable Items Open Access` =
      sum(citable_items[oam == 1], na.rm = TRUE) /
        sum(citable_items, na.rm = TRUE)
  ) %>%
  rename(DSM = dsm2_fac)
kable(tmp, digits = 3)
year DSM Num Journals Total Citable Proportion Citable Items Open Access
2013 Not Required 251 75523 0.113
2013 Required 67 54807 0.643
2014 Not Required 251 75877 0.129
2014 Required 67 55230 0.653
2015 Not Required 251 75174 0.148
2015 Required 67 55103 0.662
# Per year x access type: totals plus the share of citable items published
# under a required data-sharing policy (rows with dsm2 == 1).
tmp <- jdata_long %>%
  group_by(year, oam_fac) %>%
  summarize(
    `Num Journals` = n(),
    `Total Citable` = sum(citable_items, na.rm = TRUE),
    `Proportion Citable Items Required DSM` =
      sum(citable_items[dsm2 == 1], na.rm = TRUE) /
        sum(citable_items, na.rm = TRUE)
  ) %>%
  rename(OAM = oam_fac)
kable(tmp, digits = 3)
year OAM Num Journals Total Citable Proportion Citable Items Required DSM
2013 Subscription 274 86541 0.226
2013 Open Access 44 43789 0.805
2014 Subscription 274 85276 0.225
2014 Open Access 44 45831 0.787
2015 Subscription 274 82704 0.225
2015 Open Access 44 47573 0.767

4.1.2 Chi-square analysis - Citable Item

When we weight the number of open access journals and required data sharing journals by total citable items within each category, there is a significant association between open access and data sharing requirement at the citable item level. That is, a citable item that is open access is much more likely to also have a data sharing requirement. This is mainly because, although the number of journals that are open access or have data sharing requirements is smaller than the number of journals that do not, the total number of citable items within those journals is much larger. The p-values for the chi-square test at the citable item level are <2e-16, very significant.

Chi-square test results:

# 2 x 2 table of 2013 citable items: access type in rows, DSM requirement in
# columns. Takes the first four totals of `tmp` and fills column by column.
# NOTE(review): the printed table implies `tmp` is grouped by year, dsm2_fac
# and oam_fac here; the chunk that builds it is not shown in this section.
tmpm <- data.frame(matrix(tmp[["Total Citable"]][1:4], ncol = 2))
names(tmpm) <- levels(jdata$dsm2_fac)
row.names(tmpm) <- levels(jdata$oam_fac)
kable(tmpm, caption = "Citable Items by OAM and DSM, 2013")
Citable Items by OAM and DSM, 2013
Not Required Required
Subscription 66968 19573
Open Access 8555 35234
# Chi-square test on the table above; p-values below 2e-16 are shown as a
# label instead of a number.
tmpb <- broom::tidy(chisq.test(tmpm))
if (tmpb$p.value < 2e-16) {
  tmpb$p.value <- "< 2e-16"
}
kable(tmpb)
statistic p.value parameter method
39924.41 < 2e-16 1 Pearson’s Chi-squared test with Yates’ continuity correction
# 2 x 2 table of 2014 citable items: totals 5 to 8 of `tmp`, access type in
# rows and DSM requirement in columns.
tmpm <- data.frame(matrix(tmp[["Total Citable"]][5:8], ncol = 2))
names(tmpm) <- levels(jdata$dsm2_fac)
row.names(tmpm) <- levels(jdata$oam_fac)
kable(tmpm, caption = "Citable Items by OAM and DSM, 2014")
Citable Items by OAM and DSM, 2014
Not Required Required
Subscription 66115 19161
Open Access 9762 36069
# Chi-square test on the 2014 table; tiny p-values are replaced by a label.
tmpb <- broom::tidy(chisq.test(tmpm))
if (tmpb$p.value < 2e-16) {
  tmpb$p.value <- "< 2e-16"
}
kable(tmpb)
statistic p.value parameter method
38658.64 < 2e-16 1 Pearson’s Chi-squared test with Yates’ continuity correction
# 2 x 2 table of 2015 citable items: totals 9 to 12 of `tmp` (the third year
# block), access type in rows and DSM requirement in columns.
# The caption previously read "2014" although these rows are the 2015 counts.
tmpm <- data.frame(matrix(tmp$`Total Citable`[1:4 + 8], ncol = 2))
colnames(tmpm) <- levels(jdata$dsm2_fac)
rownames(tmpm) <- levels(jdata$oam_fac)
kable(tmpm, caption = "Citable Items by OAM and DSM, 2015")
Citable Items by OAM and DSM, 2015
Not Required Required
Subscription 64067 18637
Open Access 11107 36466
# Chi-square test on the third-year table; tiny p-values are replaced by a
# label.
tmpb <- broom::tidy(chisq.test(tmpm))
if (tmpb$p.value < 2e-16) {
  tmpb$p.value <- "< 2e-16"
}
kable(tmpb)
statistic p.value parameter method
36238.57 < 2e-16 1 Pearson’s Chi-squared test with Yates’ continuity correction

Chi-square test results for DSM1-6 vs Open Access:

2013:

# Citable items per year x DSM category (1-6) x access type.
#
# The grouped summary only returns combinations that occur in the data. The
# summary tables above show a 0.000 open-access share for DSM category 4 in
# all three years, so the DSM 4 / "Open Access" cell is absent. The tests
# below reshape each year's counts with matrix(..., nrow = 2) and need all
# 12 cells; with 11, the values are recycled and the table is misaligned.
# The zero row used to be added for 2013 and 2014 only, which left 2015
# misaligned. It is now added for every year that lacks the cell.
tmp <- jdata_long %>%
  group_by(year, dsm, oam_fac) %>%
  summarize(
    `Num Journals` = n(),
    `Total Citable` = sum(citable_items, na.rm = TRUE)
  ) %>%
  as.data.frame()

for (yr in c(2013, 2014, 2015)) {
  cell_present <- any(
    tmp$year == yr & tmp$dsm == 4 & tmp$oam_fac == "Open Access"
  )
  if (!cell_present) {
    tmp <- add_row(tmp,
      year = yr, dsm = 4, oam_fac = "Open Access",
      `Num Journals` = 0, `Total Citable` = 0
    )
  }
}

# Sort so each year's counts run DSM 1..6 with both access types adjacent.
tmp <- tmp %>% arrange(year, dsm, oam_fac)

# 2013 counts reshaped to two rows (one column per DSM category), then a
# chi-square test; tiny p-values are replaced by a label.
tmpm <- data.frame(matrix(
  tmp %>%
    filter(year == 2013) %>%
    ungroup() %>%
    select(`Total Citable`) %>%
    unlist(),
  nrow = 2
))
tmpb <- broom::tidy(chisq.test(tmpm))
if (tmpb$p.value < 2e-16) {
  tmpb$p.value <- "< 2e-16"
}
kable(tmpb)
statistic p.value parameter method
67706.64 < 2e-16 5 Pearson’s Chi-squared test

2014:

# 2014 counts reshaped to two rows (one column per DSM category), then a
# chi-square test; tiny p-values are replaced by a label.
tmpm <- data.frame(matrix(
  tmp %>%
    filter(year == 2014) %>%
    ungroup() %>%
    select(`Total Citable`) %>%
    unlist(),
  nrow = 2
))
tmpb <- broom::tidy(chisq.test(tmpm))
if (tmpb$p.value < 2e-16) {
  tmpb$p.value <- "< 2e-16"
}
kable(tmpb)
statistic p.value parameter method
62151.23 < 2e-16 5 Pearson’s Chi-squared test

2015:

# 2015 counts as an access-type x DSM contingency table, then a chi-square
# test.
#
# The table is built with xtabs() rather than matrix(<counts>, nrow = 2).
# Reshaping a plain vector is only correct when all 12 year-specific cells
# are present. For 2015 the DSM 4 / "Open Access" cell may be missing, which
# leaves 11 counts that get recycled into a misaligned table. xtabs() places
# each count by its labels and fills absent combinations with 0.
tmpm <- as.data.frame.matrix(
  xtabs(`Total Citable` ~ oam_fac + dsm, data = tmp %>% filter(year == 2015))
)
tmpb <- broom::tidy(chisq.test(tmpm))
if (tmpb$p.value < 2e-16) {
  tmpb$p.value <- "< 2e-16"
}
kable(tmpb)
statistic p.value parameter method
45573.45 < 2e-16 5 Pearson’s Chi-squared test

4.2 Trend test for proportion of citable items with required data sharing within open access categories across years

# Citable items per year x access type x required-DSM flag, using the current
# `jdata_long` (PLoS One still included at this point).
tmp <- jdata_long %>%
  group_by(year, oam_fac, dsm2) %>%
  summarize(
    `Num Journals` = n(),
    `Total Citable` = sum(citable_items, na.rm = TRUE)
  )

# within OA: counts of items under required (dsm2 == 1) and not-required
# (dsm2 == 0) policies, one value per year
tmpreq <- (tmp %>% filter(oam_fac == "Open Access", dsm2 == 1))[["Total Citable"]]
tmpnot <- (tmp %>% filter(oam_fac == "Open Access", dsm2 == 0))[["Total Citable"]]
tmptot <- tmpreq + tmpnot

tmpres_oa_trend <- prop.trend.test(tmpreq, tmptot)
tmpres_oa <- prop.test(tmpreq, tmptot)


# within subscription
tmpreq <- (tmp %>% filter(oam_fac == "Subscription", dsm2 == 1))[["Total Citable"]]
tmpnot <- (tmp %>% filter(oam_fac == "Subscription", dsm2 == 0))[["Total Citable"]]
tmptot <- tmpreq + tmpnot

tmpres_sub <- prop.test(tmpreq, tmptot)
tmpres_sub_trend <- prop.trend.test(tmpreq, tmptot)


# Stack subscription first, then open access.
tmpres_trend <- bind_rows(
  broom::tidy(tmpres_sub_trend),
  broom::tidy(tmpres_oa_trend)
)
tmpres <- bind_rows(broom::tidy(tmpres_sub), broom::tidy(tmpres_oa))

# The first three columns of the tidied prop.test output are relabelled by
# year and placed next to the trend-test columns.
colnames(tmpres)[1:3] <- c("2013", "2014", "2015")
tmpres_withPLOS <- cbind(tmpres[, 1:3], tmpres_trend)
# Same computation as the preceding chunk, with the journal "PLoS One"
# filtered out first.
tmp <- jdata_long %>%
  filter(Journal != "PLoS One") %>%
  group_by(year, oam_fac, dsm2) %>%
  summarize(
    `Num Journals` = n(),
    `Total Citable` = sum(citable_items, na.rm = TRUE)
  )

# within OA
tmpreq <- (tmp %>% filter(oam_fac == "Open Access", dsm2 == 1))[["Total Citable"]]
tmpnot <- (tmp %>% filter(oam_fac == "Open Access", dsm2 == 0))[["Total Citable"]]
tmptot <- tmpreq + tmpnot

tmpres_oa_trend <- prop.trend.test(tmpreq, tmptot)
tmpres_oa <- prop.test(tmpreq, tmptot)


# within subscription
tmpreq <- (tmp %>% filter(oam_fac == "Subscription", dsm2 == 1))[["Total Citable"]]
tmpnot <- (tmp %>% filter(oam_fac == "Subscription", dsm2 == 0))[["Total Citable"]]
tmptot <- tmpreq + tmpnot

tmpres_sub <- prop.test(tmpreq, tmptot)
tmpres_sub_trend <- prop.trend.test(tmpreq, tmptot)


# Stack subscription first, then open access.
tmpres_trend <- bind_rows(
  broom::tidy(tmpres_sub_trend),
  broom::tidy(tmpres_oa_trend)
)
tmpres <- bind_rows(broom::tidy(tmpres_sub), broom::tidy(tmpres_oa))

colnames(tmpres)[1:3] <- c("2013", "2014", "2015")
tmpres <- cbind(tmpres[, 1:3], tmpres_trend)

When looking at the open access journals including PLoS One, we can see that the proportion of data sharing required citable items is decreasing from 2013 to 2015 within open access journals (0.8, 0.79, 0.77, $p = 9.3 \times 10^{-45}$).

After removing PLoS One, we can see that the proportion of data sharing required citable items is increasing from 2013 to 2015 within open access journals (0.3, 0.38, 0.43, $p = 1.4 \times 10^{-108}$).

However, this proportion does not increase within subscription journals (0.23, 0.22, 0.23, p=0.68).

The results are below:

# Combined table: the two with-PLoS rows followed by the two without-PLoS
# rows, labelled in a leading PLOS column.
kable(
  cbind(
    PLOS = c("withPLOS", "withPLOS", "withoutPLOS", "withoutPLOS"),
    bind_rows(tmpres_withPLOS, tmpres)
  ),
  digits = 3
)
PLOS 2013 2014 2015 statistic p.value parameter method
withPLOS 0.226 0.225 0.225 0.171 0.679 1 Chi-squared Test for Trend in Proportions
withPLOS 0.805 0.787 0.767 197.023 0.000 1 Chi-squared Test for Trend in Proportions
withoutPLOS 0.226 0.225 0.225 0.171 0.679 1 Chi-squared Test for Trend in Proportions
withoutPLOS 0.304 0.382 0.429 490.105 0.000 1 Chi-squared Test for Trend in Proportions

4.3 Remove PLoS One

PLoS One may be skewing the results since it has such high volume, so we try the above analysis after removing this journal.

# Keep the full data under a new name, then drop PLoS One from `jdata_long`
# for the remainder of this section.
jdata_long0 <- jdata_long # keep old jdata_long object
jdata_long <- jdata_long %>%
  filter(Journal != "PLoS One")

4.3.1 Summary of Number of Citable Items by DSM and OAM

In 2013, the total number of citable items in the set of studied journals (removing PLoS One) was 98834, in 2014 it was 101067, and in 2015 it was 102163.

Summary of citable items in 2013/2014/2015:

# Journal count and summed citable items for each year x DSM category
# (PLoS One already removed from `jdata_long`).
tmp <- jdata_long %>%
  group_by(year, dsm) %>%
  summarize(
    `Num Journals` = n(),
    `Total Citable` = sum(citable_items, na.rm = TRUE)
  ) %>%
  rename(DSM = dsm)
kable(tmp, caption = "Citable Items by DSM")
Citable Items by DSM
year DSM Num Journals Total Citable
2013 1 37 11173
2013 2 29 12138
2013 3 74 25519
2013 4 29 8062
2013 5 47 19339
2013 6 101 22603
2014 1 37 12754
2014 2 29 12436
2014 3 74 26026
2014 4 29 7894
2014 5 47 19080
2014 6 101 22877
2015 1 37 12756
2015 2 29 14233
2015 3 74 26731
2015 4 29 7928
2015 5 47 17734
2015 6 101 22781
# Same summary by Required / Not Required DSM.
tmp <- jdata_long %>%
  group_by(year, dsm2_fac) %>%
  summarize(
    `Num Journals` = n(),
    `Total Citable` = sum(citable_items, na.rm = TRUE)
  ) %>%
  rename(DSM = dsm2_fac)
kable(tmp, caption = "Citable Items by Required/Not Required DSM")
Citable Items by Required/Not Required DSM
year DSM Num Journals Total Citable
2013 Not Required 251 75523
2013 Required 66 23311
2014 Not Required 251 75877
2014 Required 66 25190
2015 Not Required 251 75174
2015 Required 66 26989
# Same summary by access type.
tmp <- jdata_long %>%
  group_by(year, oam_fac) %>%
  summarize(
    `Num Journals` = n(),
    `Total Citable` = sum(citable_items, na.rm = TRUE)
  ) %>%
  rename(OAM = oam_fac)
kable(tmp, caption = "Citable Items by Open Access")
Citable Items by Open Access
year OAM Num Journals Total Citable
2013 Subscription 274 86541
2013 Open Access 43 12293
2014 Subscription 274 85276
2014 Open Access 43 15791
2015 Subscription 274 82704
2015 Open Access 43 19459

Summary with proportions:

# Per year x DSM category: totals plus the open-access share of citable items
# (rows with oam == 1).
tmp <- jdata_long %>%
  group_by(year, dsm) %>%
  summarize(
    `Num Journals` = n(),
    `Total Citable` = sum(citable_items, na.rm = TRUE),
    `Proportion Citable Items Open Access` =
      sum(citable_items[oam == 1], na.rm = TRUE) /
        sum(citable_items, na.rm = TRUE)
  ) %>%
  rename(DSM = dsm)
kable(tmp, digits = 3)
year DSM Num Journals Total Citable Proportion Citable Items Open Access
2013 1 37 11173 0.310
2013 2 29 12138 0.023
2013 3 74 25519 0.103
2013 4 29 8062 0.000
2013 5 47 19339 0.100
2013 6 101 22603 0.176
2014 1 37 12754 0.389
2014 2 29 12436 0.086
2014 3 74 26026 0.136
2014 4 29 7894 0.000
2014 5 47 19080 0.117
2014 6 101 22877 0.174
2015 1 37 12756 0.396
2015 2 29 14233 0.232
2015 3 74 26731 0.149
2015 4 29 7928 0.000
2015 5 47 17734 0.115
2015 6 101 22781 0.223
# Per year x Required/Not Required DSM: totals plus the open-access share of
# citable items (rows with oam == 1).
tmp <- jdata_long %>%
  group_by(year, dsm2_fac) %>%
  summarize(
    `Num Journals` = n(),
    `Total Citable` = sum(citable_items, na.rm = TRUE),
    `Proportion Citable Items Open Access` =
      sum(citable_items[oam == 1], na.rm = TRUE) /
        sum(citable_items, na.rm = TRUE)
  ) %>%
  rename(DSM = dsm2_fac)
kable(tmp, digits = 3)
year DSM Num Journals Total Citable Proportion Citable Items Open Access
2013 Not Required 251 75523 0.113
2013 Required 66 23311 0.160
2014 Not Required 251 75877 0.129
2014 Required 66 25190 0.239
2015 Not Required 251 75174 0.148
2015 Required 66 26989 0.309
# Per year x access type: totals plus the share of citable items under a
# required data-sharing policy (rows with dsm2 == 1).
tmp <- jdata_long %>%
  group_by(year, oam_fac) %>%
  summarize(
    `Num Journals` = n(),
    `Total Citable` = sum(citable_items, na.rm = TRUE),
    `Proportion Citable Items Required DSM` =
      sum(citable_items[dsm2 == 1], na.rm = TRUE) /
        sum(citable_items, na.rm = TRUE)
  ) %>%
  rename(OAM = oam_fac)
kable(tmp, digits = 3)
year OAM Num Journals Total Citable Proportion Citable Items Required DSM
2013 Subscription 274 86541 0.226
2013 Open Access 43 12293 0.304
2014 Subscription 274 85276 0.225
2014 Open Access 43 15791 0.382
2015 Subscription 274 82704 0.225
2015 Open Access 43 19459 0.429

4.3.2 Chi-square analysis - Citable Item

The results are still highly significant, likely due to the large number of citable items, but an open access article is still more likely to have been published under data sharing requirements than a subscription article. In 2013, 30.4% of open access articles had data sharing requirements as opposed to 22.6% for subscription articles. In 2014 and 2015 it was 38.2% vs. 22.5% and 42.9% vs. 22.5%.

# Yearly totals by access type.
tmp <- jdata_long %>%
  group_by(year, oam_fac) %>%
  summarize(
    `Num Journals` = n(),
    `Total Citable` = sum(citable_items, na.rm = TRUE)
  )
kable(tmp)
year oam_fac Num Journals Total Citable
2013 Subscription 274 86541
2013 Open Access 43 12293
2014 Subscription 274 85276
2014 Open Access 43 15791
2015 Subscription 274 82704
2015 Open Access 43 19459
# Yearly totals by DSM requirement.
tmp <- jdata_long %>%
  group_by(year, dsm2_fac) %>%
  summarize(
    `Num Journals` = n(),
    `Total Citable` = sum(citable_items, na.rm = TRUE)
  )
kable(tmp)
year dsm2_fac Num Journals Total Citable
2013 Not Required 251 75523
2013 Required 66 23311
2014 Not Required 251 75877
2014 Required 66 25190
2015 Not Required 251 75174
2015 Required 66 26989
# Yearly totals for every DSM-requirement x access-type cell. The 2 x 2
# tables below index this result four rows at a time.
tmp <- jdata_long %>%
  group_by(year, dsm2_fac, oam_fac) %>%
  summarize(
    `Num Journals` = n(),
    `Total Citable` = sum(citable_items, na.rm = TRUE)
  )
kable(tmp)
year dsm2_fac oam_fac Num Journals Total Citable
2013 Not Required Subscription 218 66968
2013 Not Required Open Access 33 8555
2013 Required Subscription 56 19573
2013 Required Open Access 10 3738
2014 Not Required Subscription 218 66115
2014 Not Required Open Access 33 9762
2014 Required Subscription 56 19161
2014 Required Open Access 10 6029
2015 Not Required Subscription 218 64067
2015 Not Required Open Access 33 11107
2015 Required Subscription 56 18637
2015 Required Open Access 10 8352

Chi-square test results:

# 2013 cells (rows 1 to 4 of `tmp`) as a 2 x 2 table: access type in rows,
# DSM requirement in columns, filled column by column.
tmpm <- data.frame(matrix(tmp[["Total Citable"]][1:4], ncol = 2))
names(tmpm) <- levels(jdata$dsm2_fac)
row.names(tmpm) <- levels(jdata$oam_fac)
kable(tmpm, caption = "Citable Items by OAM and DSM, 2013")
Citable Items by OAM and DSM, 2013
Not Required Required
Subscription 66968 19573
Open Access 8555 3738
# Chi-square test on the 2013 table; tiny p-values are replaced by a label.
tmpb <- broom::tidy(chisq.test(tmpm))
if (tmpb$p.value < 2e-16) {
  tmpb$p.value <- "< 2e-16"
}
kable(tmpb)
statistic p.value parameter method
362.0439 < 2e-16 1 Pearson’s Chi-squared test with Yates’ continuity correction
# 2014 cells (rows 5 to 8 of `tmp`) as a 2 x 2 table.
tmpm <- data.frame(matrix(tmp[["Total Citable"]][5:8], ncol = 2))
names(tmpm) <- levels(jdata$dsm2_fac)
row.names(tmpm) <- levels(jdata$oam_fac)
kable(tmpm, caption = "Citable Items by OAM and DSM, 2014")
Citable Items by OAM and DSM, 2014
Not Required Required
Subscription 66115 19161
Open Access 9762 6029
# Chi-square test on the 2014 table; tiny p-values are replaced by a label.
tmpb <- broom::tidy(chisq.test(tmpm))
if (tmpb$p.value < 2e-16) {
  tmpb$p.value <- "< 2e-16"
}
kable(tmpb)
statistic p.value parameter method
1756.647 < 2e-16 1 Pearson’s Chi-squared test with Yates’ continuity correction
# 2015 cells (rows 9 to 12 of `tmp`) as a 2 x 2 table.
tmpm <- data.frame(matrix(tmp[["Total Citable"]][9:12], ncol = 2))
names(tmpm) <- levels(jdata$dsm2_fac)
row.names(tmpm) <- levels(jdata$oam_fac)
kable(tmpm, caption = "Citable Items by OAM and DSM, 2015")
Citable Items by OAM and DSM, 2015
Not Required Required
Subscription 64067 18637
Open Access 11107 8352
# Chi-square test on the 2015 table; tiny p-values are replaced by a label.
tmpb <- broom::tidy(chisq.test(tmpm))
if (tmpb$p.value < 2e-16) {
  tmpb$p.value <- "< 2e-16"
}
kable(tmpb)
statistic p.value parameter method
3366.925 < 2e-16 1 Pearson’s Chi-squared test with Yates’ continuity correction

Chi-square test results for DSM1-6 vs Open Access:

2013:

# Citable items per year x DSM category (1-6) x access type, PLoS One removed.
#
# The grouped summary only returns combinations that occur in the data. The
# summary tables above show a 0.000 open-access share for DSM category 4 in
# all three years, so the DSM 4 / "Open Access" cell is absent. The tests
# below reshape each year's counts with matrix(..., nrow = 2) and need all
# 12 cells; with 11, the values are recycled and the table is misaligned.
# The zero row used to be added for 2013 and 2014 only, which left 2015
# misaligned. It is now added for every year that lacks the cell.
tmp <- jdata_long %>%
  group_by(year, dsm, oam_fac) %>%
  summarize(
    `Num Journals` = n(),
    `Total Citable` = sum(citable_items, na.rm = TRUE)
  ) %>%
  as.data.frame()

for (yr in c(2013, 2014, 2015)) {
  cell_present <- any(
    tmp$year == yr & tmp$dsm == 4 & tmp$oam_fac == "Open Access"
  )
  if (!cell_present) {
    tmp <- add_row(tmp,
      year = yr, dsm = 4, oam_fac = "Open Access",
      `Num Journals` = 0, `Total Citable` = 0
    )
  }
}

# Sort so each year's counts run DSM 1..6 with both access types adjacent.
tmp <- tmp %>% arrange(year, dsm, oam_fac)

# 2013 counts reshaped to two rows (one column per DSM category), then a
# chi-square test; tiny p-values are replaced by a label.
tmpm <- data.frame(matrix(
  tmp %>%
    filter(year == 2013) %>%
    ungroup() %>%
    select(`Total Citable`) %>%
    unlist(),
  nrow = 2
))
tmpb <- broom::tidy(chisq.test(tmpm))
if (tmpb$p.value < 2e-16) {
  tmpb$p.value <- "< 2e-16"
}
kable(tmpb)
statistic p.value parameter method
6601.23 < 2e-16 5 Pearson’s Chi-squared test

2014:

# 2014 counts reshaped to two rows (one column per DSM category), then a
# chi-square test; tiny p-values are replaced by a label.
tmpm <- data.frame(matrix(
  tmp %>%
    filter(year == 2014) %>%
    ungroup() %>%
    select(`Total Citable`) %>%
    unlist(),
  nrow = 2
))
tmpb <- broom::tidy(chisq.test(tmpm))
if (tmpb$p.value < 2e-16) {
  tmpb$p.value <- "< 2e-16"
}
kable(tmpb)
statistic p.value parameter method
7512.572 < 2e-16 5 Pearson’s Chi-squared test

2015:

# 2015 counts as an access-type x DSM contingency table, then a chi-square
# test.
#
# The table is built with xtabs() rather than matrix(<counts>, nrow = 2).
# Reshaping a plain vector is only correct when all 12 year-specific cells
# are present. For 2015 the DSM 4 / "Open Access" cell may be missing, which
# leaves 11 counts that get recycled into a misaligned table. xtabs() places
# each count by its labels and fills absent combinations with 0.
tmpm <- as.data.frame.matrix(
  xtabs(`Total Citable` ~ oam_fac + dsm, data = tmp %>% filter(year == 2015))
)
tmpb <- broom::tidy(chisq.test(tmpm))
if (tmpb$p.value < 2e-16) {
  tmpb$p.value <- "< 2e-16"
}
kable(tmpb)
statistic p.value parameter method
33259.23 < 2e-16 5 Pearson’s Chi-squared test

4.4 Results

Of the 318 journals examined, 38 (11.9%) required data sharing as a condition of publication, barring exceptions and 29 (9.1%) required data sharing but made no explicit statement regarding the effect on publication and editorial decisions. 74 (23.3%) journals explicitly encouraged or addressed data sharing, but did not require it. And, 47 (14.8%) journals only addressed data sharing for proteomic, genomic data, or other specific structural data.

In order to understand the potential influence of the policies on the published literature, we also evaluated the distribution of policies by publication volume. In 2013, the total number of citable items (papers) in the studied journals was 98834 (after removing PLoS One), in 2014 the total number of citable items was 101067, and in 2015 the total number of citable items was 102163,

with median numbers of citable items per journal of 242, 236, and 240, respectively.

5 Sample Representation of All Journals

We can compare sampled journals to non-sampled journals (“out of sample”) or to the entire set of journals (“All Journals”).

# Read the "2014 ESI Data" sheet of the workbook, treating "N/A" as missing.
jdata_all <- read_excel(
  "../Data-Sharing-Policies_2017-01-25.xlsx",
  sheet = "2014 ESI Data",
  na = "N/A"
)

# Rename its columns to the names used in `jdata`.
jdata_all <- jdata_all %>%
  rename(
    Journal = `Full Journal Title`,
    if_2014 = `Journal Impact Factor`,
    citable_2014 = `Citable Items`,
    total_cites_2014 = `Total Cites`
  )

# The matching columns of the sampled journals, tagged as "Sample".
jdata_sample <- jdata %>%
  select(Journal, ISSN, if_2014, citable_2014, total_cites_2014) %>%
  add_column(in_sample = "Sample")

Note, these journals are in the sampled data but not in the “full data”, but they have no data anyway:

# Count sampled journals whose title does / does not appear in the ESI sheet.
table(
  jdata_sample$Journal %in% jdata_all$Journal
)
## 
## FALSE  TRUE 
##     2   316
# List the sampled journals that have no title match in the ESI sheet.
jdata_sample %>%
  filter(!Journal %in% jdata_all$Journal) %>%
  kable()
Journal ISSN if_2014 citable_2014 total_cites_2014 in_sample
Neuron Glia Biology 1740-925X NA NA NA Sample
Journal of Pure and Applied Microbiology 0973-7510 NA NA NA Sample

Number of journals in and out of sample (removing journals with missing data):

# for comparing in sample to all journals: label every ESI row
# "All Journals", append the sample rows, and drop rows without a 2014
# total-cites value
jdata_all_long <- jdata_all %>%
  mutate(in_sample = "All Journals") %>%
  bind_rows(jdata_sample) %>%
  filter(!is.na(total_cites_2014))
table(jdata_all_long$in_sample)
## 
## All Journals       Sample 
##         1169          316
# for comparing in sample to out of sample: attach the sample flag to the ESI
# rows, label unmatched rows "Out of sample", and drop rows without a title.
# No `by` is given, so the join matches on every column the two tables share.
jdata_all <- jdata_all %>%
  left_join(jdata_sample %>% select(-ISSN)) %>%
  mutate(in_sample = ifelse(is.na(in_sample), "Out of sample", in_sample)) %>%
  filter(!is.na(Journal))
table(jdata_all$in_sample)
## 
## Out of sample        Sample 
##           853           316

5.1 Distribution of Impact Factor

# Impact factor: sample vs out-of-sample journals.
ggplot(jdata_all, aes(x = in_sample, y = if_2014, fill = in_sample)) +
  geom_boxplot() +
  theme_minimal() +
  ggtitle("Distribution of Impact Factor")

# Impact factor: sample vs all journals.
ggplot(jdata_all_long, aes(x = in_sample, y = if_2014, fill = in_sample)) +
  geom_boxplot() +
  theme_minimal() +
  ggtitle("Distribution of Impact Factor")

# Summary statistics per group, ignoring missing values.
jdata_all_long %>%
  group_by(in_sample) %>%
  summarize_at(
    vars(if_2014),
    funs(
      Min = "min",
      Q25 = quantile(., probs = 0.25),
      Mean = "mean",
      Median = "median",
      Q75 = quantile(., probs = 0.75),
      Max = "max"
    ),
    na.rm = TRUE
  ) %>%
  kable(digits = 2)
in_sample Min Q25 Mean Median Q75 Max
All Journals 0.00 1.46 3.44 2.50 3.83 41.46
Sample 0.22 2.91 5.42 4.16 5.77 41.46

5.2 Distribution of Citable Items

# Citable items: sample vs out-of-sample journals.
ggplot(jdata_all, aes(x = in_sample, y = citable_2014, fill = in_sample)) +
  geom_boxplot() +
  theme_minimal() +
  ggtitle("Distribution of Citable Items")

# Same plot restricted to journals with fewer than 20000 citable items.
ggplot(
  jdata_all %>% filter(citable_2014 < 20000),
  aes(x = in_sample, y = citable_2014, fill = in_sample)
) +
  geom_boxplot() +
  theme_minimal() +
  ggtitle("Distribution of Citable Items (remove Plos One)")

# Citable items: sample vs all journals.
ggplot(jdata_all_long, aes(x = in_sample, y = citable_2014, fill = in_sample)) +
  geom_boxplot() +
  theme_minimal() +
  ggtitle("Distribution of Citable Items")

# Same plot restricted to journals with fewer than 20000 citable items.
ggplot(
  jdata_all_long %>% filter(citable_2014 < 20000),
  aes(x = in_sample, y = citable_2014, fill = in_sample)
) +
  geom_boxplot() +
  theme_minimal() +
  ggtitle("Distribution of Citable Items (remove Plos One)")

# Summary statistics per group, ignoring missing values.
jdata_all_long %>%
  group_by(in_sample) %>%
  summarize_at(
    vars(citable_2014),
    funs(
      Total = "sum",
      Min = "min",
      Q25 = quantile(., probs = 0.25),
      Mean = "mean",
      Median = "median",
      Q75 = quantile(., probs = 0.75),
      Max = "max"
    ),
    na.rm = TRUE
  ) %>%
  kable(digits = 2)
in_sample Total Min Q25 Mean Median Q75 Max
All Journals 219640 0 51.0 187.89 94.0 184.00 30040
Sample 131107 0 148.5 414.90 237.5 356.25 30040

After removing PLoS One:

# Citable-item summary per group with the journal "PLoS One" excluded.
jdata_all_long %>%
  filter(Journal != "PLoS One") %>%
  group_by(in_sample) %>%
  summarize_at(
    vars(citable_2014),
    funs(
      Min = "min",
      Q25 = quantile(., probs = 0.25),
      Mean = "mean",
      Median = "median",
      Q75 = quantile(., probs = 0.75),
      Max = "max"
    ),
    na.rm = TRUE
  ) %>%
  kable(digits = 2)
in_sample Min Q25 Mean Median Q75 Max
All Journals 0 51 162.33 94 184 3931
Sample 0 148 320.85 236 355 3579

5.3 Distribution of Total Citations

# Total citations: sample vs out-of-sample journals.
ggplot(jdata_all, aes(x = in_sample, y = total_cites_2014, fill = in_sample)) +
  geom_boxplot() +
  theme_minimal() +
  ggtitle("Distribution of Total Citations")

# Total citations: sample vs all journals.
ggplot(
  jdata_all_long,
  aes(x = in_sample, y = total_cites_2014, fill = in_sample)
) +
  geom_boxplot() +
  theme_minimal() +
  ggtitle("Distribution of Total Citations")

# Summary statistics per group, ignoring missing values.
jdata_all_long %>%
  group_by(in_sample) %>%
  summarize_at(
    vars(total_cites_2014),
    funs(
      Total = "sum",
      Min = "min",
      Q25 = quantile(., probs = 0.25),
      Mean = "mean",
      Median = "median",
      Q75 = quantile(., probs = 0.75),
      Max = "max"
    ),
    na.rm = TRUE
  ) %>%
  kable(digits = 2)
in_sample Total Min Q25 Mean Median Q75 Max
All Journals 11132916 5 928 9523.45 2625.0 7394.00 617363
Sample 8136037 384 4517 25746.95 10126.5 22129.25 617363

After removing PLoS One:

# Total-citation summary per group with the journal "PLoS One" excluded.
jdata_all_long %>%
  filter(Journal != "PLoS One") %>%
  group_by(in_sample) %>%
  summarize_at(
    vars(total_cites_2014),
    funs(
      Total = "sum",
      Min = "min",
      Q25 = quantile(., probs = 0.25),
      Mean = "mean",
      Median = "median",
      Q75 = quantile(., probs = 0.75),
      Max = "max"
    ),
    na.rm = TRUE
  ) %>%
  kable(digits = 2)
in_sample Total Min Q25 Mean Median Q75 Max
All Journals 10800200 5 927.5 9246.75 2621 7392.5 617363
Sample 7803321 384 4505.0 24772.45 10099 21813.5 617363

6 Publisher Comparison

Number of journals for each publisher, sorted:

# Journals per publisher, ascending.
# NOTE(review): the printed result lists some publishers under several
# spellings or with trailing spaces (e.g. "Wolters Kluwer" / "Wolters Kluwer ",
# "Mary Ann Liebert" / "Mary Ann Lierbert" / "Mary Anne Liebert"), so their
# counts are split across rows. Consider normalizing Publisher upstream.
table(jdata$Publisher) %>%
  sort()
## 
##                                          ACS Publications 
##                                                         1 
##                     American Association of Immunologists 
##                                                         1 
##                        American Phytopathological Society 
##                                                         1 
##                         American Society for Cell Biology 
##                                                         1 
##                          American Society of Microbiology 
##                                                         1 
##                                 American Thoracic Society 
##                                                         1 
##                               AO Research Institute Davos 
##                                                         1 
##                   Associated Professional Sleep Societies 
##                                                         1 
##                                Cambridge University Press 
##                                                         1 
##                              Canadian Medical Association 
##                                                         1 
##                Centers for Disease Control and Prevention 
##                                                         1 
##                                                     eLife 
##                                                         1 
##                                          Emory University 
##                                                         1 
## Federation of American Societies for Experimental Biology 
##                                                         1 
##                 Feinstein Institutue for Medical Research 
##                                                         1 
##                           Genetics and Molecular Research 
##                                                         1 
##                                           Impact Journals 
##                                                         1 
##                                International AIDS Society 
##                                                         1 
##                                            IOP Publishing 
##                                                         1 
##                                                 IOS Press 
##                                                         1 
##                         Ivyspring International Publisher 
##                                                         1 
##                                         JMIR Publications 
##                                                         1 
##                  Journal of Pure and Applied Microbiology 
##                                                         1 
##         Korean Society for Microbiology and Biotechnology 
##                                                         1 
##                                              LWW Journals 
##                                                         1 
##                                          Mary Ann Liebert 
##                                                         1 
##                                         Mary Ann Lierbert 
##                                                         1 
##                             Microscopy Society of America 
##                                                         1 
##                                                 MIT Press 
##                                                         1 
##                                                       NAS 
##                                                         1 
##                               Radiation Research Society  
##                                                         1 
##                                               RNA Society 
##                                                         1 
##                                             Royal Society 
##                                                         1 
##                                SERBIAN BIOLOGICAL SOCIETY 
##                                                         1 
##                             Society for Leukocyte Biology 
##                                                         1 
##                       Society for Neuroscience / Highwire 
##                                                         1 
##                                 The Company of Biologists 
##                                                         1 
##                                           Wolters Kluwer  
##                                                         1 
##                            American Physiological Society 
##                                                         2 
##   American Society for Biochemistry and Molecular Biology 
##                                                         2 
##                                                      EMBO 
##                                                         2 
##                                           Future Medicine 
##                                                         2 
##                                         Mary Anne Liebert 
##                                                         2 
##                                      Microbiology Society 
##                                                         2 
##                                            Portland Press 
##                                                         2 
##                           The Rockefeller University Pres 
##                                                         2 
##                                                      AAAS 
##                                                         3 
##                                 American Chemical Society 
##                                                         3 
##                                 Cold Springs Harbor Press 
##                                                         3 
##                                     Company of Biologists 
##                                                         3 
##                            Hindawi Publishing Corporation 
##                                                         3 
##                                               IOP Science 
##                                                         3 
##                                                      PLoS 
##                                                         3 
##                                Royal Society of Chemistry 
##                                                         3 
##                                                 Frontiers 
##                                                         4 
##                                                      IEEE 
##                                                         4 
##                                                    Karger 
##                                                         4 
##                                         SAGE Publications 
##                                                         5 
##                                            Wolters Kluwer 
##                                                         5 
##                         American Society for Microbiology 
##                                                         6 
##                              The Royal Society Publishing 
##                                                         6 
##                                          Taylor & Francis 
##                                                         8 
##                                            BioMed Central 
##                                                         9 
##                                           Oxford Journals 
##                                                        13 
##                                   Nature Publishing Group 
##                                                        19 
##                                                  Springer 
##                                                        24 
##                                                     Wiley 
##                                                        43 
##                                                  Elsevier 
##                                                        90

Filter out publishers with 3 or fewer journals:

# Number of sampled journals per publisher.
tmp <- table(jdata$Publisher)
# Publishers with more than 3 journals, shown in increasing order of count.
sort(tmp[tmp > 3])
## 
##                         Frontiers                              IEEE 
##                                 4                                 4 
##                            Karger                 SAGE Publications 
##                                 4                                 5 
##                    Wolters Kluwer American Society for Microbiology 
##                                 5                                 6 
##      The Royal Society Publishing                  Taylor & Francis 
##                                 6                                 8 
##                    BioMed Central                   Oxford Journals 
##                                 9                                13 
##           Nature Publishing Group                          Springer 
##                                19                                24 
##                             Wiley                          Elsevier 
##                                43                                90
# Names of the publishers with more than 3 sampled journals
# (despite the variable name, these are publishers, not journals).
tmpjournals <- names(tmp)[as.vector(tmp > 3)]

Percent of sampled journals in each DSM category by publisher:

library(janitor)

# Journals belonging to the publishers retained above.
tmpdat <- filter(jdata, Publisher %in% tmpjournals)

# Row-wise distribution of the DSM categories within each publisher,
# and the number of journals per publisher.
tmpout <- crosstab(tmpdat, Publisher, dsm, "row")
tmpout1 <- tabyl(tmpdat, Publisher)

# Express the distribution on a 0-100 scale (first column is the publisher),
# then append the journal counts as the last column for display.
tmpout2 <- tmpout
tmpout2[, -1] <- 100 * tmpout2[, -1]
tmpout2 <- cbind(tmpout2, `# Journals` = tmpout1[, 2])
kable(tmpout2, digits = 1)
Publisher 1 2 3 4 5 6 # Journals
American Society for Microbiology 0.0 0.0 0.0 0.0 100.0 0.0 6
BioMed Central 11.1 0.0 77.8 0.0 11.1 0.0 9
Elsevier 0.0 12.2 53.3 28.9 2.2 3.3 90
Frontiers 0.0 0.0 100.0 0.0 0.0 0.0 4
IEEE 0.0 0.0 0.0 0.0 0.0 100.0 4
Karger 0.0 0.0 0.0 0.0 0.0 100.0 4
Nature Publishing Group 84.2 5.3 0.0 0.0 0.0 10.5 19
Oxford Journals 0.0 0.0 0.0 0.0 46.2 53.8 13
SAGE Publications 0.0 0.0 20.0 0.0 20.0 60.0 5
Springer 0.0 0.0 0.0 8.3 0.0 91.7 24
Taylor & Francis 0.0 0.0 12.5 0.0 62.5 25.0 8
The Royal Society Publishing 66.7 33.3 0.0 0.0 0.0 0.0 6
Wiley 9.3 9.3 18.6 2.3 14.0 46.5 43
Wolters Kluwer 0.0 0.0 0.0 0.0 20.0 80.0 5

Summary of maximum percentages for each publisher:

# Largest DSM-category percentage for each publisher.
# tmpout2 also carries the `# Journals` count column, which is not a
# percentage and must not enter the row-wise maximum: otherwise a publisher
# with more journals than its largest percentage (Elsevier: 90 journals vs.
# 53.3%) is summarised by its journal count instead.
pct_cols <- setdiff(names(tmpout2)[-1], "# Journals")
summary(apply(as.matrix(tmpout2[, pct_cols, drop = FALSE]), 1, max))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   46.51   60.62   78.89   76.89   97.92  100.00

Categorize into a 2-level data sharing mark:

# Journals belonging to the publishers retained above.
tmpdat <- filter(jdata, Publisher %in% tmpjournals)

# Journal counts per publisher, and the row-wise distribution of the
# 2-level data sharing mark (dsm2) within each publisher.
tmpout1 <- tabyl(tmpdat, Publisher)
tmpout <- crosstab(tmpdat, Publisher, dsm2, "row")

# Express the distribution on a 0-100 scale (first column is the publisher),
# then append the journal counts as the last column for display.
tmpout2 <- tmpout
tmpout2[, -1] <- 100 * tmpout2[, -1]
tmpout2 <- cbind(tmpout2, `# Journals` = tmpout1[, 2])
kable(tmpout2, digits = 1)
Publisher 0 1 # Journals
American Society for Microbiology 100.0 0.0 6
BioMed Central 88.9 11.1 9
Elsevier 87.8 12.2 90
Frontiers 100.0 0.0 4
IEEE 100.0 0.0 4
Karger 100.0 0.0 4
Nature Publishing Group 10.5 89.5 19
Oxford Journals 100.0 0.0 13
SAGE Publications 100.0 0.0 5
Springer 100.0 0.0 24
Taylor & Francis 100.0 0.0 8
The Royal Society Publishing 0.0 100.0 6
Wiley 81.4 18.6 43
Wolters Kluwer 100.0 0.0 5

Summary of maximum percentages for each publisher:

# Largest 2-level-DSM percentage for each publisher.
# tmpout2 also carries the `# Journals` count column, which is not a
# percentage and must not enter the row-wise maximum: otherwise a publisher
# with more journals than its largest percentage (Elsevier: 90 journals vs.
# 87.8%) is summarised by its journal count instead.
pct_cols <- setdiff(names(tmpout2)[-1], "# Journals")
summary(apply(as.matrix(tmpout2[, pct_cols, drop = FALSE]), 1, max))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   81.40   92.11  100.00   96.25  100.00  100.00

7 Curator Reliability

# Read the two scoring sheets from the same workbook and drop the first data
# row of each (for the "Data" sheet this is the line of info that the main
# read also removes; presumably the "Curator Data" sheet has the same layout).
# cdata1 is later treated as the new (curator) scores, cdata0 as the original.
# NOTE(review): unlike the main read, no `na = "N/A"` is passed here - confirm
# this is intended.
cdata1 <- read_excel("../Data-Sharing-Policies_2017-01-25.xlsx", sheet = "Curator Data")[-1, ]
cdata0 <- read_excel("../Data-Sharing-Policies_2017-01-25.xlsx", sheet = "Data")[-1, ]



# Strip leading/trailing whitespace (spaces, tabs, newlines) from one score
# column. Character vectors are trimmed directly; for factors the levels are
# trimmed, which merges levels that only differed by whitespace. Any other
# type is returned untouched.
strip_score_ws <- function(x) {
    if (is.factor(x)) {
        levels(x) <- trimws(levels(x))
        x
    } else if (is.character(x)) {
        trimws(x)
    } else {
        x
    }
}

# Prepare one scoring sheet for the reliability comparison:
#   - standardise the column names and drop fully empty rows,
#   - keep the journal name plus the scored marks,
#   - convert to a plain data.frame,
#   - clean whitespace typos in the marks (e.g. "a\n", "b\n", "a ").
# Previously only exact "a\n" / "b\n" cells were repaired, so a trailing-space
# typo such as "a " survived and would be counted as a different category from
# "a" whenever only one of the two sheets contained it.
# The journal column is deliberately left as is, since it is the join key.
prep_scores <- function(sheet) {
    scores <- sheet %>% clean_names() %>% remove_empty_rows() %>% select(journal, 
        open_access_mark, data_sharing_mark:reproducibility_noted_mark)
    scores <- data.frame(scores)
    mark_cols <- colnames(scores)[-1]
    scores[mark_cols] <- lapply(scores[mark_cols], strip_score_ws)
    scores
}

cdata1_all <- prep_scores(cdata1)
cdata0_all <- prep_scores(cdata0)

# Every scored variable, i.e. all columns except the journal name.
allvars <- colnames(cdata1_all)[-1]

For each scored variable we report a table of agreement, the % agreement, and Cohen’s kappa.

# Agreement between the new (curator) and the original scores, one scored
# variable at a time. For each variable the cross-table is printed and one
# summary row (number of journals, % agreement, Cohen's kappa) is collected;
# the rows are bound into `restable` once, after the loop.
res_list <- vector("list", length(allvars))
for (k in seq_along(allvars)) {
    tmpvar <- allvars[[k]]

    # Select the column by its exact name. contains() does case-insensitive
    # substring matching, so a variable whose name is part of another column
    # name would silently bring extra columns along.
    tmp0 <- cdata0_all[, c("journal", tmpvar)]
    tmp1 <- cdata1_all[, c("journal", tmpvar)]
    colnames(tmp0)[2] <- "var_orig"
    colnames(tmp1)[2] <- "var_new"

    # Keep every re-scored journal, attach its original score, and drop
    # journals for which either score is missing.
    newdat <- left_join(tmp1, tmp0, by = "journal")
    newdat <- na.omit(newdat)
    mytab <- table(newdat[, -1])
    print(tmpvar)
    print(mytab)

    # Compare the two scores journal by journal. Summing the diagonal of
    # `mytab` is only correct when both raters used exactly the same set of
    # categories; otherwise the table is not square/aligned.
    agree <- as.character(newdat$var_new) == as.character(newdat$var_orig)

    res_list[[k]] <- data_frame(Score = tmpvar, `# Journals` = nrow(newdat), 
        `% Agreement` = 100 * mean(agree), 
        `Cohen's Kappa` = kappa2(newdat[, -1])$value)
}
restable <- bind_rows(res_list)
## [1] "open_access_mark"
##        var_orig
## var_new  0  1
##       0 35  0
##       1  0  5
## [1] "data_sharing_mark"
##        var_orig
## var_new  1  2  3  4  5  6
##       1  7  2  0  0  1  0
##       2  0  2  0  0  0  0
##       3  0  0 10  0  0  0
##       4  0  0  0  1  0  0
##       5  0  0  0  0  9  0
##       6  0  0  0  0  0  8
## [1] "protein_proteomic_genomic_or_microaray_sequence_or_structural_data_sharing_addressed_required_with_deposit_to_specific_data_banks"
##        var_orig
## var_new  a  b
##       a 23  0
##       b  0 17
## [1] "recommended_preferred_sharing_mark"
##        var_orig
## var_new  A  B  D  E
##       A 21  0  0  0
##       B  0  7  0  0
##       D  1  0  1  0
##       E  0  0  0 10
## [1] "size_guidelines_if_journal_hosted_provided"
##        var_orig
## var_new  b  c
##       b  1  0
##       c  1 11
## [1] "copyright_licensing_mark"
##        var_orig
## var_new  a  b
##       a  1  0
##       b  0 39
## [1] "archival_retention_mark"
##        var_orig
## var_new a   b
##      a   1  1
##      b   0 38
## [1] "reproducibility_noted_mark"
##        var_orig
## var_new  a  b
##       a  6  1
##       b  2 31
# Turn the underscore-separated variable names into readable labels.
restable[["Score"]] <- gsub("_", " ", restable[["Score"]], fixed = TRUE)

# Render the agreement summary with three decimals.
kable(restable, digits = 3)
Score # Journals % Agreement Cohen’s Kappa
open access mark 40 100.000 1.000
data sharing mark 40 92.500 0.905
protein proteomic genomic or microaray sequence or structural data sharing addressed required with deposit to specific data banks 40 100.000 1.000
recommended preferred sharing mark 40 97.500 0.959
size guidelines if journal hosted provided 13 92.308 0.629
copyright licensing mark 40 100.000 1.000
archival retention mark 40 97.500 0.655
reproducibility noted mark 40 92.500 0.754

8 Session Info

This analysis was performed in R/RStudio using knitr, with the following session:

# Print the R version, platform settings and package versions of this session.
devtools::session_info()
##  setting  value                       
##  version  R version 3.2.1 (2015-06-18)
##  system   x86_64, darwin10.8.0        
##  ui       X11                         
##  language (EN)                        
##  collate  en_US.UTF-8                 
##  tz       America/Los_Angeles         
##  date     2017-02-02                  
## 
##  package      * version date       source                        
##  assertthat     0.1     2013-12-06 CRAN (R 3.2.0)                
##  backports      1.0.5   2017-01-18 CRAN (R 3.2.1)                
##  broom          0.4.1   2016-06-24 CRAN (R 3.2.5)                
##  colorspace     1.2-6   2015-03-11 CRAN (R 3.2.0)                
##  DBI            0.5-1   2016-09-10 CRAN (R 3.2.1)                
##  devtools       1.12.0  2016-06-24 CRAN (R 3.2.5)                
##  digest         0.6.10  2016-08-02 CRAN (R 3.2.5)                
##  dplyr        * 0.5.0   2016-06-24 CRAN (R 3.2.5)                
##  evaluate       0.9     2016-04-29 CRAN (R 3.2.5)                
##  formatR        1.4     2016-05-09 CRAN (R 3.2.5)                
##  ggplot2      * 2.2.0   2016-11-11 CRAN (R 3.2.1)                
##  gtable         0.2.0   2016-02-26 CRAN (R 3.2.3)                
##  highr          0.6     2016-05-09 CRAN (R 3.2.5)                
##  htmltools      0.3.5   2016-03-21 CRAN (R 3.2.4)                
##  irr          * 0.84    2012-07-16 CRAN (R 3.2.0)                
##  janitor      * 0.2.1   2016-10-31 CRAN (R 3.2.1)                
##  knitr        * 1.14    2016-08-13 CRAN (R 3.2.5)                
##  labeling       0.3     2014-08-23 CRAN (R 3.2.0)                
##  lattice        0.20-33 2015-07-14 CRAN (R 3.2.1)                
##  lazyeval       0.2.0   2016-06-12 CRAN (R 3.2.5)                
##  lpSolve      * 5.6.13  2015-09-19 CRAN (R 3.2.3)                
##  magrittr     * 1.5     2014-11-22 CRAN (R 3.2.0)                
##  Matrix         1.2-6   2016-05-02 CRAN (R 3.2.5)                
##  memoise        1.0.0   2016-01-29 CRAN (R 3.2.3)                
##  mnormt         1.5-4   2016-03-09 CRAN (R 3.2.4)                
##  munsell        0.4.3   2016-02-13 CRAN (R 3.2.3)                
##  nlme           3.1-128 2016-05-10 CRAN (R 3.2.5)                
##  pander       * 0.6.0   2015-11-23 CRAN (R 3.2.3)                
##  plyr           1.8.4   2016-06-08 CRAN (R 3.2.5)                
##  psych          1.6.6   2016-06-28 CRAN (R 3.2.5)                
##  purrr        * 0.2.2   2016-06-18 CRAN (R 3.2.5)                
##  R6             2.1.3   2016-08-19 CRAN (R 3.2.1)                
##  RColorBrewer   1.1-2   2014-12-07 CRAN (R 3.2.0)                
##  Rcpp           0.12.6  2016-07-19 CRAN (R 3.2.5)                
##  readr        * 1.0.0   2016-08-03 CRAN (R 3.2.1)                
##  readxl       * 0.1.1   2016-03-28 CRAN (R 3.2.4)                
##  reshape2     * 1.4.1   2014-12-06 CRAN (R 3.2.0)                
##  rmarkdown      1.3     2016-12-21 CRAN (R 3.2.1)                
##  rprojroot      1.2     2017-01-16 CRAN (R 3.2.1)                
##  scales         0.4.1   2016-11-09 CRAN (R 3.2.1)                
##  stringi        1.1.1   2016-05-27 CRAN (R 3.2.5)                
##  stringr        1.1.0   2016-08-19 CRAN (R 3.2.1)                
##  survey         3.31    2016-08-05 CRAN (R 3.2.5)                
##  survival       2.39-5  2016-06-26 CRAN (R 3.2.5)                
##  tableone     * 0.7.3   2015-11-11 CRAN (R 3.2.0)                
##  tibble       * 1.2-12  2016-09-02 Github (hadley/tibble@6d2bb08)
##  tidyr        * 0.6.0   2016-08-12 CRAN (R 3.2.5)                
##  tidyverse    * 1.0.0   2016-09-09 CRAN (R 3.2.1)                
##  withr          1.0.2   2016-06-20 CRAN (R 3.2.5)                
##  yaml           2.1.13  2014-06-12 CRAN (R 3.2.0)                
##  zoo            1.7-13  2016-05-03 CRAN (R 3.2.5)