1 Data Processing

Here we read in the data and do some name and format manipulation.

Note, the abbreviation DSM denotes “Data Sharing Mark” and OAM denotes “Open Access Mark.”

# Read the journal-level data from the "Data" sheet of the workbook.
# Cells containing "N/A" are read as missing values.
exceldat <- read_excel("../Data-Sharing-Policies_2017-01-25.xlsx", sheet = "Data",
                    na = "N/A")
jdata <- exceldat[-1, ] # remove line of info
# Keep only the rows that have a journal name. A logical mask is used instead
# of `-which(...)`: when no name is missing, `which()` returns an empty vector
# and a negative empty index selects zero rows, silently emptying the data.
jdata <- jdata[!is.na(jdata$Journal), ] # remove NAs

# Description labels for the six Data Sharing Mark (DSM) codes, in code order 1-6
dsm_labels <- c(
  "Required as condition of publication, barring exceptions",
  "Required but no explicit statement regarding effect on publication/editorial decisions",
  "Explicitly encouraged/addressed, but not required.",
  "Mentioned indirectly",
  "Only protein, proteomic, and/or genomic data sharing are addressed.",
  "No mention"
)

# Description labels for the Recommended Sharing Method codes
rsm_labels <- c(
  "Public Online Repository",
  "Journal Hosted",
  "By Reader Request to Authors",
  "Multiple methods equally recommended",
  "Unspecified"
)

# Description labels for the journal-hosting size-guideline codes
journal_host_labels <- c(
  "Journal will host regardless of size",
  "Journal has data hosting file/s size limit",
  "Unspecified"
)

# Fix typos in the single-letter mark columns: some cells carry a trailing
# newline or space after the letter, which is normalised to the bare letter.

# Trailing newline after 'b'
copyright_mark <- jdata$`Copyright/Licensing Mark`
copyright_mark[copyright_mark %in% "b\n"] <- "b"
jdata$`Copyright/Licensing Mark` <- copyright_mark

# Trailing newline after 'a'
reproducibility_mark <- jdata$`Reproducibility Noted Mark`
reproducibility_mark[reproducibility_mark %in% "a\n"] <- "a"
jdata$`Reproducibility Noted Mark` <- reproducibility_mark

# Trailing space after 'a'
archival_mark <- jdata$`Archival/Retention Mark`
archival_mark[archival_mark %in% "a "] <- "a"
jdata$`Archival/Retention Mark` <- archival_mark

# Either kind of trailing character, for both letters
omic_mark_col <- "Protein, Proteomic, Genomic, or Microaray (Sequence or Structural) Data Sharing Addressed/Required with Deposit to Specific Data Banks"
omic_mark <- jdata[[omic_mark_col]]
omic_mark[omic_mark %in% c("a\n", "a ")] <- "a"
omic_mark[omic_mark %in% c("b\n", "b ")] <- "b"
jdata[[omic_mark_col]] <- omic_mark

# easier shorter names
# Adds short-named copies of the spreadsheet columns plus labelled factor
# versions of the coded "mark" columns. The order in which columns are created
# matters: later code selects column ranges such as `dsm:if_2015` and
# `if_2013:if_2015`, which depend on the positions produced here.
jdata <- jdata%>%mutate(
  # NOTE(review): the `{2013}` in this column name differs from the pattern of
  # the 2014/2015 names -- confirm against the spreadsheet header.
  citable_2013 = `{2013} Citable Articles`,
  citable_2014 = `2014 Citable Items`,
  citable_2015 = `2015 Citable Articles`,
  # Data Sharing Mark as a number (1-6) and as a labelled factor
  dsm=as.numeric(`Data Sharing Mark`), 
  dsm_fac = factor(dsm,levels = 1:6,
                   labels=dsm_labels),
  # Collapsed DSM: 1 when DSM is 1 or 2 ("Required"), 0 otherwise
  dsm2 = 1*(dsm<3),
  dsm2_fac = factor(dsm2,levels=0:1,
                    labels=c("Not Required","Required")),
  dsm2_fac_flip = factor(dsm2,levels=c(1,0), # reverse levels for plotting only
                    labels=c("Required","Not Required")),
  # Open Access Mark: 0 = Subscription, 1 = Open Access
  oam=as.numeric(`Open Access Mark`),
  oam_fac = factor(oam,levels=0:1,
                   labels=c("Subscription","Open Access")),
  if_2013=`2013 Impact Factor`,
  if_2014=`2014 Impact Factor`,
  if_2015 = `2015 Impact Factor`,
  total_cites_2013 = `2013 Total Cites`,
  total_cites_2014 = `2014 Total Cites`,
  total_cites_2015 = `2015 Total Cites`,
  # Letter-coded policy marks converted to labelled factors; any value not
  # listed in `levels` becomes NA
  omic_data_sharing_fac = factor(`Protein, Proteomic, Genomic, or Microaray (Sequence or Structural) Data Sharing Addressed/Required with Deposit to Specific Data Banks`,
                                 levels=c("a","b"),
                                 labels=c("Yes","No")),
  rec_pref_sharing_fac = factor(`Recommended/Preferred Sharing Mark`,levels=c("A","B","C","D","E"),
                   labels=rsm_labels),
  journal_hosts_fac = factor(`Size Guidelines if Journal Hosted Provided`,levels=c("a","b","c"),labels=journal_host_labels),
  copyright_licensing_fac = factor(`Copyright/Licensing Mark`,levels=c("a","b"),
                                   labels=c("Explicitly stated or mentioned","No Mention")),
  archival_retention_fac = factor(`Archival/Retention Mark`,levels=c("a","b"),
                                  labels=c("Explicitly stated","No Mention")),
  reproducibility_noted_fac = factor(`Reproducibility Noted Mark`,levels=c("a","b"),
                                  labels=c("Explicitly stated","No Mention"))
)

# Reshape the per-year measures to long format (one row per journal and year)
# for plotting and grouped summaries.

# Impact factor, keeping the DSM / open-access columns alongside
jdata_if_long <- jdata %>%
  select(Journal, dsm:if_2015) %>%
  rename(`2013` = if_2013, `2014` = if_2014, `2015` = if_2015) %>%
  gather(year, impact_factor, `2013`:`2015`)

# Total cites
jdata_tc_long <- jdata %>%
  select(Journal, contains("total_cites")) %>%
  rename(
    `2013` = total_cites_2013,
    `2014` = total_cites_2014,
    `2015` = total_cites_2015
  ) %>%
  gather(year, total_cites, `2013`:`2015`)

# Citable items
jdata_cit_long <- jdata %>%
  select(Journal, contains("citable_")) %>%
  rename(`2013` = citable_2013, `2014` = citable_2014, `2015` = citable_2015) %>%
  gather(year, citable_items, `2013`:`2015`)

# Combine the three long tables; the joins use the columns the tables share
jdata_long <- jdata_if_long %>%
  left_join(jdata_tc_long) %>%
  left_join(jdata_cit_long)

2 Data Summaries and Distributions

We have data from 318 journals, including impact factor (IF), total number of citations, and number of citable items for the years 2013, 2014, and 2015.

Data Sharing Mark (DSM) has 6 categories, and we create a collapsed DSM variable that has two categories for “required” (DSM = 1 or 2) and “not required” (DSM = 3, 4, 5 or 6).

The labels for Data Sharing Mark (DSM) are as follows:

# Lookup table pairing each numeric DSM code (1-6) with its description
tibble(`DSM Numeric Value` = 1:6, `DSM Description Label` = dsm_labels) %>%
  kable()

DSM Numeric Value	DSM Description Label
1	Required as condition of publication, barring exceptions
2	Required but no explicit statement regarding effect on publication/editorial decisions
3	Explicitly encouraged/addressed, but not required.
4	Mentioned indirectly
5	Only protein, proteomic, and/or genomic data sharing are addressed.
6	No mention

2.1 Summary of Continuous Data

# summary() of impact factor, citable items and total cites for each year;
# `tmps` holds the resulting summary table, which is then printed.
tmps <- jdata %>%
  select(
    if_2013:if_2015,
    citable_2013:citable_2015,
    total_cites_2013:total_cites_2015
  ) %>%
  summary()
kable(tmps)

if_2013	if_2014	if_2015	citable_2013	citable_2014	citable_2015	total_cites_2013	total_cites_2014	total_cites_2015
Min. : 0.073	Min. : 0.220	Min. : 0.218	Min. : 0.0	Min. : 0.0	Min. : 0.0	Min. : 115	Min. : 384	Min. : 410
1st Qu.: 2.987	1st Qu.: 2.908	1st Qu.: 2.874	1st Qu.: 154.0	1st Qu.: 148.5	1st Qu.: 152.0	1st Qu.: 3940	1st Qu.: 4517	1st Qu.: 3838
Median : 4.242	Median : 4.157	Median : 4.077	Median : 243.0	Median : 237.5	Median : 240.5	Median : 9164	Median : 10126	Median : 7748
Mean : 5.495	Mean : 5.417	Mean : 5.285	Mean : 409.8	Mean : 414.9	Mean : 414.9	Mean : 24267	Mean : 25747	Mean : 17688
3rd Qu.: 5.785	3rd Qu.: 5.772	3rd Qu.: 5.567	3rd Qu.: 341.0	3rd Qu.: 356.2	3rd Qu.: 341.8	3rd Qu.: 21307	3rd Qu.: 22129	3rd Qu.: 13320
Max. :42.351	Max. :41.456	Max. :38.138	Max. :31496.0	Max. :30040.0	Max. :28114.0	Max. :590324	Max. :617363	Max. :627846
NA	NA’s :2	NA’s :4	NA	NA’s :2	NA’s :4	NA	NA’s :2	NA’s :4

2.2 TABLE METHODS X: Distribution of Impact Factors for Journal Titles

We include 2013 values in the manuscript and include 2014 values in the appendix.

This is a table categorizing journal impact factor (JIF) into ranges. The number of journals within each range and the percentage of the total number of journals are presented.

2.2.1 2013

# Bin edges and labels shared by the impact-factor categories of all three
# years. Bins are closed on the left ([0, 2), [2, 4), ...) and, because of
# include.lowest = TRUE, the top bin also contains its upper edge (43).
if_breaks <- c(0, 2, 4, 6, 8, 10, 30, 43)
if_labels <- c("<2", "2-3.99", "4-5.99", "6-7.99", "8-9.99", "10-29.99", "30-43")

jdata$if_2013_cat <- cut(
  jdata$if_2013,
  breaks = if_breaks, labels = if_labels,
  include.lowest = TRUE, right = FALSE
)
jdata$if_2014_cat <- cut(
  jdata$if_2014,
  breaks = if_breaks, labels = if_labels,
  include.lowest = TRUE, right = FALSE
)
jdata$if_2015_cat <- cut(
  jdata$if_2015,
  breaks = if_breaks, labels = if_labels,
  include.lowest = TRUE, right = FALSE
)

# Number of journals (and % of all journals) in each 2013 category
jdata %>%
  group_by(if_2013_cat) %>%
  summarize(Number = n()) %>%
  mutate(
    `# (%)` = paste0(Number, "\t(", round(100 * Number / nrow(jdata), 1), "%)")
  ) %>%
  select(-Number) %>%
  rename(`IF 2013 Category` = if_2013_cat) %>%
  kable()

IF 2013 Category	# (%)
<2	19 (6%)
2-3.99	125 (39.3%)
4-5.99	102 (32.1%)
6-7.99	25 (7.9%)
8-9.99	15 (4.7%)
10-29.99	29 (9.1%)
30-43	3 (0.9%)

2.2.2 2014, 2015

# Number of journals (and % of all journals) per impact-factor category, 2014
tmp <- jdata %>%
  group_by(if_2014_cat) %>%
  summarize(Number = n()) %>%
  mutate(
    `2014 # (%)` = paste0(Number, "\t(", round(100 * Number / nrow(jdata), 1), "%)")
  ) %>%
  select(-Number) %>%
  rename(`IF Category` = if_2014_cat)

# Same counts for 2015
tmp2 <- jdata %>%
  group_by(if_2015_cat) %>%
  summarize(Number = n()) %>%
  mutate(
    `2015 # (%)` = paste0(Number, "\t(", round(100 * Number / nrow(jdata), 1), "%)")
  ) %>%
  select(-Number) %>%
  rename(`IF Category` = if_2015_cat)

# Put the two years side by side; journals with a missing impact factor fall
# in the NA category, which is shown as "Not Reported"
left_join(tmp, tmp2) %>%
  mutate(
    `IF Category` = ifelse(
      is.na(`IF Category`),
      "Not Reported",
      as.character(`IF Category`)
    )
  ) %>%
  kable()

IF Category	2014 # (%)	2015 # (%)
<2	20 (6.3%)	21 (6.6%)
2-3.99	127 (39.9%)	133 (41.8%)
4-5.99	96 (30.2%)	97 (30.5%)
6-7.99	26 (8.2%)	18 (5.7%)
8-9.99	17 (5.3%)	16 (5%)
10-29.99	27 (8.5%)	26 (8.2%)
30-43	3 (0.9%)	3 (0.9%)
Not Reported	2 (0.6%)	4 (1.3%)

2.3 TABLE METHODS X: Distribution of Citable Items for Journal Titles by Category

The total number of citable items per journal is categorized into ranges, and the number of journals and the percentage of the total number of journals in each category are presented.

2.3.1 2013

# Categorise a numeric vector of citable-item counts into four ranges.
#
# Bins are closed on the left: [0, 100), [100, 500), [500, 1000) and
# [1000, 32000]. `include.lowest = TRUE` makes the top bin include its upper
# edge, so a count of exactly 32000 is labelled "1000-32000" instead of
# becoming NA (matching how the impact-factor categories are cut).
#
# x: numeric vector of counts.
# Returns a factor with levels "<100", "100-500", "500-1000", "1000-32000";
# values that are missing, negative, or above 32000 are NA.
my_cut_fun <- function(x) {
    cut(x,
        breaks = c(0, 100, 500, 1000, 32000),
        labels = c("<100", "100-500", "500-1000", "1000-32000"),
        right = FALSE,
        include.lowest = TRUE)
}

# Categorise the citable-item counts of each year with my_cut_fun()
jdata <- jdata %>%
  mutate(
    citable_2013_cat = my_cut_fun(citable_2013),
    citable_2014_cat = my_cut_fun(citable_2014),
    citable_2015_cat = my_cut_fun(citable_2015)
  )


# Number of journals (and % of all journals) in each 2013 category
jdata %>%
  group_by(citable_2013_cat) %>%
  summarize(Number = n()) %>%
  mutate(
    `# (%)` = paste0(Number, "\t(", round(100 * Number / nrow(jdata), 1), "%)")
  ) %>%
  select(-Number) %>%
  rename(`Total Citable Items 2013 Category` = citable_2013_cat) %>%
  kable()

Total Citable Items 2013 Category	# (%)
<100	42 (13.2%)
100-500	239 (75.2%)
500-1000	28 (8.8%)
1000-32000	9 (2.8%)

2.3.2 2014, 2015

# Number of journals (and % of all journals) per citable-item category, 2014
tmp1 <- jdata %>%
  group_by(citable_2014_cat) %>%
  summarize(Number = n()) %>%
  rename(`Total Citable Items Category` = citable_2014_cat) %>%
  mutate(
    `2014 # (%)` = paste0(Number, "\t(", round(100 * Number / nrow(jdata), 1), "%)")
  ) %>%
  select(-Number)

# Same counts for 2015
tmp2 <- jdata %>%
  group_by(citable_2015_cat) %>%
  summarize(Number = n()) %>%
  rename(`Total Citable Items Category` = citable_2015_cat) %>%
  mutate(
    `2015 # (%)` = paste0(Number, "\t(", round(100 * Number / nrow(jdata), 1), "%)")
  ) %>%
  select(-Number)

# Side-by-side table; the NA category is shown as "Not Reported"
left_join(tmp1, tmp2) %>%
  mutate(
    `Total Citable Items Category` = ifelse(
      is.na(`Total Citable Items Category`),
      "Not Reported",
      as.character(`Total Citable Items Category`)
    )
  ) %>%
  kable()

Total Citable Items Category	2014 # (%)	2015 # (%)
<100	42 (13.2%)	38 (11.9%)
100-500	235 (73.9%)	239 (75.2%)
500-1000	27 (8.5%)	23 (7.2%)
1000-32000	12 (3.8%)	14 (4.4%)
Not Reported	2 (0.6%)	4 (1.3%)

2.4 TABLE METHODS X: Distribution of Citable Items for Journal Titles

Min, 25th percentile, mean, median, 75th percentile, and max.

# Per-year distribution of citable items over all journals
tmp1 <- jdata_long %>%
  group_by(year) %>%
  summarize_at(
    vars(citable_items),
    funs(
      Total = "sum",
      Min = "min",
      Q25 = quantile(., probs = 0.25),
      Mean = "mean",
      Median = "median",
      Q75 = quantile(., probs = 0.75),
      Max = "max"
    ),
    na.rm = TRUE
  ) %>%
  add_column(Journals = "With PLoS One", .after = "year")

# Same summaries after dropping PLoS One
tmp2 <- jdata_long %>%
  filter(Journal != "PLoS One") %>%
  group_by(year) %>%
  summarize_at(
    vars(citable_items),
    funs(
      Total = "sum",
      Min = "min",
      Q25 = quantile(., probs = 0.25),
      Mean = "mean",
      Median = "median",
      Q75 = quantile(., probs = 0.75),
      Max = "max"
    ),
    na.rm = TRUE
  ) %>%
  add_column(Journals = "Remove PLoS One", .after = "year")

# Stack both versions: "With PLoS One" rows first, then by year
tmp <- bind_rows(tmp1, tmp2) %>%
  arrange(desc(Journals), year)
kable(tmp, caption = "Citable Items by Year", digits = 1)

Citable Items by Year
year	Journals	Total	Q25	Mean	Median	Q75	Max
2013	With PLoS One	130330	154.0	409.8	243.0	341.0	31496
2014	With PLoS One	131107	148.5	414.9	237.5	356.2	30040
2015	With PLoS One	130277	152.0	414.9	240.5	341.8	28114
2013	Remove PLoS One	98834	153.0	311.8	242.0	341.0	3901
2014	Remove PLoS One	101067	148.0	320.8	236.0	355.0	3579
2015	Remove PLoS One	102163	152.0	326.4	240.0	341.0	3281

2.5 TABLE: Number of journals per data sharing mark (DSM)

# Count of journals, and percent of all journals, for each DSM code
tmp <- jdata %>%
  group_by(dsm, dsm_fac) %>%
  summarize(
    N = n(),
    Percent = 100 * n() / nrow(jdata)
  )

# Format as "N (pct%)" and give the columns display names
tmp1 <- tmp %>%
  mutate(`# Journals (%)` = paste0(N, "\t(", round(Percent, 1), "%)")) %>%
  select(-N, -Percent) %>%
  rename(DSM = dsm, `DSM Description` = dsm_fac)
kable(tmp1)

DSM	DSM Description	# Journals (%)
1	Required as condition of publication, barring exceptions	38 (11.9%)
2	Required but no explicit statement regarding effect on publication/editorial decisions	29 (9.1%)
3	Explicitly encouraged/addressed, but not required.	74 (23.3%)
4	Mentioned indirectly	29 (9.1%)
5	Only protein, proteomic, and/or genomic data sharing are addressed.	47 (14.8%)
6	No mention	101 (31.8%)

2.6 TABLE: Publishing Volume by Data Sharing Mark

# 2013: per DSM code, the number of journals (% of all journals), the median
# citable items per journal, and the total citable items (% of all citable
# items), each formatted as display text.
tmp1 = jdata%>%group_by(dsm,dsm_fac)%>%
  summarize("N"=n(),"Percent"=100*n()/nrow(jdata),
            "Nc"=sum(citable_2013), 
            "MedianC"=median(citable_2013,na.rm=T),
            "PercentC" = 100*sum(citable_2013)/sum(jdata$citable_2013)
            )%>%
  mutate("# Journals (%)" = paste0(N, "\t(",round(Percent,1),"%)"),
         "Median # Citable Items per Journal 2013" = paste(MedianC," "), # need a space or kable malfunctions
         "# Citable Items 2013 (%)" = paste0(Nc, "\t(",round(PercentC,1),"%)") )%>%
  select(-N,-Nc,-Percent,-PercentC,-MedianC)%>%
  rename("DSM"=dsm,"DSM Description"=dsm_fac)

# 2013 citable items per DSM code with PLoS One excluded; the percentage uses
# the citable-item total without PLoS One as its denominator. N and Percent
# are computed here but dropped again by select().
tmp2 = jdata%>%filter(!Journal=="PLoS One")%>%group_by(dsm,dsm_fac)%>%
  summarize("N"=n(),"Percent"=100*n()/nrow(jdata),
            "Nc"=sum(citable_2013), 
            "PercentC" = 100*sum(citable_2013)/sum(jdata$citable_2013[!jdata$Journal=="PLoS One"])
            )%>%
  mutate("# Citable Items 2013, Remove PLoS One (%)" = 
           paste0(Nc, "\t(",round(PercentC,1),"%)") )%>%
  select(-N,-Nc,-Percent,-PercentC)%>%
  rename("DSM"=dsm,"DSM Description"=dsm_fac)

# Join on the shared columns to put both versions side by side
tmp = left_join(tmp1,tmp2)

# 2014: same per-DSM summary as for 2013, with missing citable-item counts
# ignored (na.rm=T) in the sums and medians.
tmp1 = jdata%>%group_by(dsm,dsm_fac)%>%
  summarize("N"=n(),"Percent"=100*n()/nrow(jdata),
            "Nc"=sum(citable_2014, na.rm=T), 
            "MedianC"=median(citable_2014,na.rm=T),
            "PercentC" = 100*sum(citable_2014,na.rm=T)/sum(jdata$citable_2014,na.rm=T)
            )%>%
  mutate("# Journals (%)" = paste0(N, "\t(",round(Percent,1),"%)"),
        "Median # Citable Items per Journal 2014" = paste(MedianC," "), # need a space or kable malfunctions
         "# Citable Items 2014 (%)" = paste0(Nc, "\t(",round(PercentC,1),"%)") )%>%
  select(-N,-Nc,-Percent,-PercentC,-MedianC)%>%
  rename("DSM"=dsm,"DSM Description"=dsm_fac)

# 2014 citable items per DSM code with PLoS One excluded
tmp2 = jdata%>%filter(!Journal=="PLoS One")%>%group_by(dsm,dsm_fac)%>%
  summarize("N"=n(),"Percent"=100*n()/nrow(jdata),
            "Nc"=sum(citable_2014,na.rm=T), 
            "PercentC" = 
              100*sum(citable_2014,na.rm=T)/sum(jdata$citable_2014[!jdata$Journal=="PLoS One"],na.rm=T)
            )%>%
  mutate("# Citable Items 2014, Remove PLoS One (%)" = paste0(Nc, "\t(",round(PercentC,1),"%)") )%>%
  select(-N,-Nc,-Percent,-PercentC)%>%
  rename("DSM"=dsm,"DSM Description"=dsm_fac)

# Append the 2014 columns to the table accumulated in `tmp`
tmp3 = left_join(tmp1,tmp2)
tmp = left_join(tmp,tmp3)


# 2015: same per-DSM summary as for the earlier years, with missing
# citable-item counts ignored (na.rm=T) in the sums and medians.
tmp1 = jdata%>%group_by(dsm,dsm_fac)%>%
  summarize("N"=n(),"Percent"=100*n()/nrow(jdata),
            "Nc"=sum(citable_2015, na.rm=T), 
            "MedianC"=median(citable_2015,na.rm=T),
            "PercentC" = 100*sum(citable_2015,na.rm=T)/sum(jdata$citable_2015,na.rm=T)
            )%>%
  mutate("# Journals (%)" = paste0(N, "\t(",round(Percent,1),"%)"),
        "Median # Citable Items per Journal 2015" = paste(MedianC," "), # need a space or kable malfunctions
         "# Citable Items 2015 (%)" = paste0(Nc, "\t(",round(PercentC,1),"%)") )%>%
  select(-N,-Nc,-Percent,-PercentC,-MedianC)%>%
  rename("DSM"=dsm,"DSM Description"=dsm_fac)

# 2015 citable items per DSM code with PLoS One excluded
tmp2 = jdata%>%filter(!Journal=="PLoS One")%>%group_by(dsm,dsm_fac)%>%
  summarize("N"=n(),"Percent"=100*n()/nrow(jdata),
            "Nc"=sum(citable_2015,na.rm=T), 
            "PercentC" = 
              100*sum(citable_2015,na.rm=T)/sum(jdata$citable_2015[!jdata$Journal=="PLoS One"],na.rm=T)
            )%>%
  mutate("# Citable Items 2015, Remove PLoS One (%)" = paste0(Nc, "\t(",round(PercentC,1),"%)") )%>%
  select(-N,-Nc,-Percent,-PercentC)%>%
  rename("DSM"=dsm,"DSM Description"=dsm_fac)

# Append the 2015 columns to the table accumulated in `tmp`
tmp3 = left_join(tmp1,tmp2)
tmp = left_join(tmp,tmp3)


#kable(tmp)

# One-row "Total" line for the bottom of the table: overall journal count,
# overall medians, and overall citable-item sums (with and without PLoS One)
# for each year. The column names must match those of `tmp` exactly so that
# bind_rows() lines the values up; no DSM column is supplied here, so that
# cell is NA in the Total row.
tmpt = data_frame("DSM Description"="Total",
           "# Journals (%)"=paste0(nrow(jdata)," (100%)"),
           "Median # Citable Items per Journal 2013"  =as.character(median(jdata$citable_2013)),
           "# Citable Items 2013 (%)"=paste0(sum(jdata$citable_2013)," (100%)"),
           "# Citable Items 2013, Remove PLoS One (%)"=
             paste0(sum(jdata$citable_2013[!jdata$Journal=="PLoS One"])," (100%)"),
           "Median # Citable Items per Journal 2014"  =as.character(median(jdata$citable_2014,na.rm=T)),
          "# Citable Items 2014 (%)"=paste0(sum(jdata$citable_2014,na.rm=T)," (100%)"),
           "# Citable Items 2014, Remove PLoS One (%)"=
             paste0(sum(jdata$citable_2014[!jdata$Journal=="PLoS One"],na.rm=T)," (100%)"),
            "Median # Citable Items per Journal 2015"  =as.character(median(jdata$citable_2015,na.rm=T)),
          "# Citable Items 2015 (%)"=paste0(sum(jdata$citable_2015,na.rm=T)," (100%)"),
           "# Citable Items 2015, Remove PLoS One (%)"=
             paste0(sum(jdata$citable_2015[!jdata$Journal=="PLoS One"],na.rm=T)," (100%)")
)
# Append the Total row below the per-DSM rows and print
tmpt = bind_rows(tmp,tmpt)
kable(tmpt)

DSM	DSM Description	# Journals (%)	Median # Citable Items per Journal 2013	# Citable Items 2013 (%)	# Citable Items 2013, Remove PLoS One (%)	Median # Citable Items per Journal 2014	# Citable Items 2014 (%)	# Citable Items 2014, Remove PLoS One (%)	Median # Citable Items per Journal 2015	# Citable Items 2015 (%)	# Citable Items 2015, Remove PLoS One (%)
1	Required as condition of publication, barring exceptions	38 (11.9%)	230.5	42669 (32.7%)	11173 (11.3%)	220	42794 (32.6%)	12754 (12.6%)	226.5	40870 (31.4%)	12756 (12.5%)
2	Required but no explicit statement regarding effect on publication/editorial decisions	29 (9.1%)	209	12138 (9.3%)	12138 (12.3%)	227	12436 (9.5%)	12436 (12.3%)	266	14233 (10.9%)	14233 (13.9%)
3	Explicitly encouraged/addressed, but not required.	74 (23.3%)	259.5	25519 (19.6%)	25519 (25.8%)	282.5	26026 (19.9%)	26026 (25.8%)	278	26731 (20.5%)	26731 (26.2%)
4	Mentioned indirectly	29 (9.1%)	256	8062 (6.2%)	8062 (8.2%)	225	7894 (6%)	7894 (7.8%)	225	7928 (6.1%)	7928 (7.8%)
5	Only protein, proteomic, and/or genomic data sharing are addressed.	47 (14.8%)	277	19339 (14.8%)	19339 (19.6%)	316	19080 (14.6%)	19080 (18.9%)	285	17734 (13.6%)	17734 (17.4%)
6	No mention	101 (31.8%)	211	22603 (17.3%)	22603 (22.9%)	213	22877 (17.4%)	22877 (22.6%)	206	22781 (17.5%)	22781 (22.3%)
NA	Total	318 (100%)	243	130330 (100%)	98834 (100%)	237.5	131107 (100%)	101067 (100%)	240.5	130277 (100%)	102163 (100%)

# NOTE(review): `tmplong` is not created in the visible part of this file;
# presumably it is built in a chunk that is not echoed -- confirm.
# Fix the order of the three bars.
tmplong$type1 <- factor(
  tmplong$type2,
  levels = c("Citable Items without PLoS One", "Citable Items with PLoS One", "Journals")
)

# Horizontal stacked bars of percent per DSM, each segment labelled with its value
ggplot(tmplong, aes(x = type1, y = Percent, fill = dsm_fac)) +
  geom_bar(stat = "identity") +
  geom_text(
    aes(label = paste0(round(Percent, 1), "%"), y = Percent),
    position = position_stack(),
    hjust = 1.2,
    size = 3,
    color = "black",
    show.legend = FALSE
  ) +
  guides(
    alpha = guide_legend(title = "Percent of", ncol = 1),
    fill = guide_legend(title = "Data Sharing Mark", ncol = 1)
  ) +
  theme_minimal() +
  xlab("") +
  coord_flip() +
  theme(legend.position = "bottom")

2.7 FIGURE 1 - Percentage of journals per each data sharing mark (DSM).

The top bar shows the percentage of all journals for each data sharing mark. The middle bar shows the percentage of citable items from each journal (including PLoS One) for each data sharing mark. The lower bar shows the percentage of citable items for each journal (excluding PLoS One) for each data sharing mark. Because of the journal PLoS One’s high publishing activity, we analyzed the percentage of citable items for each data sharing mark including and excluding PLoS One. The shades from dark to light represent DSM 1-6.

# NOTE(review): `tmplong` is not created in the visible part of this file;
# presumably it is built in a chunk that is not echoed -- confirm.
# Fix the order of the three bars.
tmplong$type1 <- factor(
  tmplong$type2,
  levels = c("Citable Items without PLoS One", "Citable Items with PLoS One", "Journals")
)
# Copy of the DSM factor with its six levels in reverse order; used as the
# fill variable so the stacking order of the segments is reversed.
tmplong$dsm_fac_reverse <- factor(
  tmplong$dsm_fac,
  levels = levels(tmplong$dsm_fac)[6:1]
)

# Horizontal stacked bars filled with the first six of seven "Blues" shades.
# The legend lists the DSM categories in their original order, each prefixed
# with its number.
ggplot(tmplong, aes(x = type1, y = Percent, fill = dsm_fac_reverse)) +
  geom_bar(stat = "identity") +
  geom_text(
    aes(label = paste0(round(Percent, 1), "%"), y = Percent),
    position = position_stack(),
    hjust = 1.2,
    size = 3,
    color = "black",
    show.legend = FALSE
  ) +
  scale_fill_manual(
    values = RColorBrewer::brewer.pal(n = 7, name = "Blues")[1:6],
    breaks = levels(tmplong$dsm_fac),
    labels = paste(1:6, levels(tmplong$dsm_fac))
  ) +
  guides(fill = guide_legend(title = "Data Sharing Mark", ncol = 1)) +
  theme_minimal() +
  xlab("") +
  coord_flip() +
  theme(legend.position = "bottom")

2.8 TABLE RESULTS X: Number of journals & Citable Items per data sharing 2 categories

#summarize citable items
# 2013, by collapsed DSM (required vs not required): number of journals (% of
# all journals), median citable items per journal, and total citable items
# (% of all citable items), each formatted as display text.
tmp1 = jdata%>%group_by(dsm2,dsm2_fac)%>%
  summarize("N"=n(),
            "Percent"=100*n()/nrow(jdata),
            "Nc"=sum(citable_2013), 
            "MedianC"=median(citable_2013,na.rm=T),
            "PercentC" = 100*sum(citable_2013)/sum(jdata$citable_2013)
            )%>%
  mutate("# Journals (%)" = paste0(N, "\t(",round(Percent,1),"%)"),
         "Median # Citable Items per Journal 2013" = paste(MedianC," "), # need a space or kable malfunctions
         "# Citable Items 2013 (%)" = paste0(Nc, "\t(",round(PercentC,1),"%)"))%>%
  select(-N,-Nc,-Percent,-PercentC,-MedianC)%>%
  rename("DSM"=dsm2,"DSM Description"=dsm2_fac)

#summarize citable items, removing PLoS One
#do not need median since removing one journal will not change this except by .5
# The percentage uses the citable-item total without PLoS One as denominator;
# N and Percent are computed but dropped again by select().
tmp2 = jdata%>%filter(!Journal=="PLoS One")%>%group_by(dsm2,dsm2_fac)%>%
  summarize("N"=n(),"Percent"=100*n()/nrow(jdata),
            "Nc"=sum(citable_2013), 
            "PercentC" = 100*sum(citable_2013)/sum(jdata$citable_2013[!jdata$Journal=="PLoS One"])
            )%>%
  mutate("# Citable Items 2013, Remove PLoS One (%)" = paste0(Nc, "\t(",round(PercentC,1),"%)") )%>%
  select(-N,-Nc,-Percent,-PercentC)%>%
  rename("DSM"=dsm2,"DSM Description"=dsm2_fac)

# Join on the shared columns to put both versions side by side
tmp = left_join(tmp1,tmp2)

# 2014, by collapsed DSM: same summary as for 2013, with missing citable-item
# counts ignored (na.rm=T) in the sums and medians.
tmp1 = jdata%>%group_by(dsm2,dsm2_fac)%>%
  summarize("N"=n(),"Percent"=100*n()/nrow(jdata),
            "Nc"=sum(citable_2014, na.rm=T), 
            "MedianC"=median(citable_2014,na.rm=T),
            "PercentC" = 100*sum(citable_2014,na.rm=T)/sum(jdata$citable_2014,na.rm=T)
            )%>%
  mutate("# Journals (%)" = paste0(N, "\t(",round(Percent,1),"%)"),
         "Median # Citable Items per Journal 2014" = paste(MedianC," "),
         "# Citable Items 2014 (%)" = paste0(Nc, "\t(",round(PercentC,1),"%)") 
         )%>%
  select(-N,-Nc,-Percent,-PercentC,-MedianC)%>%
  rename("DSM"=dsm2,"DSM Description"=dsm2_fac)

# 2014 citable items by collapsed DSM with PLoS One excluded
tmp2 = jdata%>%filter(!Journal=="PLoS One")%>%group_by(dsm2,dsm2_fac)%>%
  summarize("N"=n(),"Percent"=100*n()/nrow(jdata),
            "Nc"=sum(citable_2014,na.rm=T), 
            "PercentC" = 
              100*sum(citable_2014,na.rm=T)/sum(jdata$citable_2014[!jdata$Journal=="PLoS One"],na.rm=T)
            )%>%
  mutate("# Citable Items 2014, Remove PLoS One (%)" = paste0(Nc, "\t(",round(PercentC,1),"%)") )%>%
  select(-N,-Nc,-Percent,-PercentC)%>%
  rename("DSM"=dsm2,"DSM Description"=dsm2_fac)

# Append the 2014 columns to the table accumulated in `tmp`
tmp3 = left_join(tmp1,tmp2)
tmp = left_join(tmp,tmp3)


# 2015, by collapsed DSM: same summary as for the earlier years, with missing
# citable-item counts ignored (na.rm=T) in the sums and medians.
tmp1 = jdata%>%group_by(dsm2,dsm2_fac)%>%
  summarize("N"=n(),"Percent"=100*n()/nrow(jdata),
            "Nc"=sum(citable_2015, na.rm=T), 
            "MedianC"=median(citable_2015,na.rm=T),
            "PercentC" = 100*sum(citable_2015,na.rm=T)/sum(jdata$citable_2015,na.rm=T)
            )%>%
  mutate("# Journals (%)" = paste0(N, "\t(",round(Percent,1),"%)"),
         "Median # Citable Items per Journal 2015" = paste(MedianC," "),
         "# Citable Items 2015 (%)" = paste0(Nc, "\t(",round(PercentC,1),"%)") 
         )%>%
  select(-N,-Nc,-Percent,-PercentC,-MedianC)%>%
  rename("DSM"=dsm2,"DSM Description"=dsm2_fac)

# 2015 citable items by collapsed DSM with PLoS One excluded
tmp2 = jdata%>%filter(!Journal=="PLoS One")%>%group_by(dsm2,dsm2_fac)%>%
  summarize("N"=n(),"Percent"=100*n()/nrow(jdata),
            "Nc"=sum(citable_2015,na.rm=T), 
            "PercentC" = 
              100*sum(citable_2015,na.rm=T)/sum(jdata$citable_2015[!jdata$Journal=="PLoS One"],na.rm=T)
            )%>%
  mutate("# Citable Items 2015, Remove PLoS One (%)" = paste0(Nc, "\t(",round(PercentC,1),"%)") )%>%
  select(-N,-Nc,-Percent,-PercentC)%>%
  rename("DSM"=dsm2,"DSM Description"=dsm2_fac)

# Append the 2015 columns to the table accumulated in `tmp`
tmp3 = left_join(tmp1,tmp2)
tmp = left_join(tmp,tmp3)

# Sort so the "Required" group (DSM value 1) comes first, then replace the
# numeric codes with display labels. This relies on exactly that row order:
# row 1 = required (DSM 1 or 2), row 2 = not required (DSM 3-6).
tmp = tmp%>%arrange(desc(DSM))
tmp$DSM[1] = "DSM 1&2"
tmp$DSM[2] = "DSM 3-6"
kable(tmp)

DSM	DSM Description	# Journals (%)	Median # Citable Items per Journal 2013	# Citable Items 2013 (%)	# Citable Items 2013, Remove PLoS One (%)	Median # Citable Items per Journal 2014	# Citable Items 2014 (%)	# Citable Items 2014, Remove PLoS One (%)	Median # Citable Items per Journal 2015	# Citable Items 2015 (%)	# Citable Items 2015, Remove PLoS One (%)
DSM 1&2	Required	67 (21.1%)	226	54807 (42.1%)	23311 (23.6%)	221	55230 (42.1%)	25190 (24.9%)	242	55103 (42.3%)	26989 (26.4%)
DSM 3-6	Not Required	251 (78.9%)	248	75523 (57.9%)	75523 (76.4%)	244	75877 (57.9%)	75877 (75.1%)	240	75174 (57.7%)	75174 (73.6%)

2.9 TABLE RESULTS X: Distribution of Open Access by Journal and Citable Item

# 2013, by open-access status: number of journals (% of all journals), median
# citable items per journal, and total citable items (% of all citable
# items), each formatted as display text.
tmp1 = jdata%>%group_by(oam_fac)%>%
  summarize("N"=n(),"Percent"=100*n()/nrow(jdata),
            "Nc"=sum(citable_2013), 
            "MedianC"=median(citable_2013,na.rm=T),
            "PercentC" = 100*sum(citable_2013)/sum(jdata$citable_2013)
            )%>%
  mutate("# Journals (%)" = paste0(N, "\t(",round(Percent,1),"%)"),
         "Median # Citable Items per Journal 2013" = paste(MedianC," "),
         "# Citable Items 2013 (%)" = paste0(Nc, "\t(",round(PercentC,1),"%)") )%>%
  select(-N,-Nc,-Percent,-PercentC,-MedianC)%>%
  rename("Open Access"=oam_fac)

# 2013 citable items by open-access status with PLoS One excluded; the
# percentage uses the citable-item total without PLoS One as denominator.
# N and Percent are computed but dropped again by select().
tmp2 = jdata%>%filter(!Journal=="PLoS One")%>%group_by(oam_fac)%>%
  summarize("N"=n(),"Percent"=100*n()/nrow(jdata),
            "Nc"=sum(citable_2013), 
            "PercentC" = 100*sum(citable_2013)/sum(jdata$citable_2013[!jdata$Journal=="PLoS One"])
            )%>%
  mutate("# Citable Items 2013, Remove PLoS One (%)" = paste0(Nc, "\t(",round(PercentC,1),"%)") )%>%
  select(-N,-Nc,-Percent,-PercentC)%>%
  rename("Open Access"=oam_fac)

# Join on the shared column to put both versions side by side
tmp = left_join(tmp1,tmp2)



# 2014, by open-access status: same summary as for 2013, with missing
# citable-item counts ignored (na.rm=T) in the sums and medians.
tmp1 = jdata%>%group_by(oam_fac)%>%
  summarize("N"=n(),"Percent"=100*n()/nrow(jdata),
            "Nc"=sum(citable_2014, na.rm =T), 
            "MedianC"=median(citable_2014,na.rm=T),
            "PercentC" = 100*sum(citable_2014, na.rm =T)/sum(jdata$citable_2014, na.rm=T)
            )%>%
  mutate("# Journals (%)" = paste0(N, "\t(",round(Percent,1),"%)"),
         "Median # Citable Items per Journal 2014" = paste(MedianC," "), # need  a space or kable malfunctions for some reason
         "# Citable Items 2014 (%)" = paste0(Nc, "\t(",round(PercentC,1),"%)") )%>%
  select(-N,-Nc,-Percent,-PercentC,-MedianC)%>%
  rename("Open Access"=oam_fac)

# 2014 citable items by open-access status with PLoS One excluded
tmp2 = jdata%>%filter(!Journal=="PLoS One")%>%group_by(oam_fac)%>%
  summarize("N"=n(),"Percent"=100*n()/nrow(jdata),
            "Nc"=sum(citable_2014, na.rm =T), 
            "PercentC" = 100*sum(citable_2014, na.rm =T)/sum(jdata$citable_2014[!jdata$Journal=="PLoS One"], na.rm=T)
            )%>%
  mutate("# Citable Items 2014, Remove PLoS One (%)" = paste0(Nc, "\t(",round(PercentC,1),"%)") )%>%
  select(-N,-Nc,-Percent,-PercentC)%>%
  rename("Open Access"=oam_fac)

# Append the 2014 columns to the table accumulated in `tmp`
tmp3 = left_join(tmp1,tmp2)
tmp = left_join(tmp,tmp3)

# 2015, by open-access status: same summary as for the earlier years, with
# missing citable-item counts ignored (na.rm=T) in the sums and medians.
tmp1 = jdata%>%group_by(oam_fac)%>%
  summarize("N"=n(),"Percent"=100*n()/nrow(jdata),
            "Nc"=sum(citable_2015, na.rm =T), 
            "MedianC"=median(citable_2015,na.rm=T),
            "PercentC" = 100*sum(citable_2015, na.rm =T)/sum(jdata$citable_2015, na.rm=T)
            )%>%
  mutate("# Journals (%)" = paste0(N, "\t(",round(Percent,1),"%)"),
         "Median # Citable Items per Journal 2015" = paste(MedianC," "), # need  a space or kable malfunctions for some reason
         "# Citable Items 2015 (%)" = paste0(Nc, "\t(",round(PercentC,1),"%)") )%>%
  select(-N,-Nc,-Percent,-PercentC,-MedianC)%>%
  rename("Open Access"=oam_fac)

# 2015 citable items by open-access status with PLoS One excluded
tmp2 = jdata%>%filter(!Journal=="PLoS One")%>%group_by(oam_fac)%>%
  summarize("N"=n(),"Percent"=100*n()/nrow(jdata),
            "Nc"=sum(citable_2015, na.rm =T), 
            "PercentC" = 100*sum(citable_2015, na.rm =T)/sum(jdata$citable_2015[!jdata$Journal=="PLoS One"], na.rm=T)
            )%>%
  mutate("# Citable Items 2015, Remove PLoS One (%)" = paste0(Nc, "\t(",round(PercentC,1),"%)") )%>%
  select(-N,-Nc,-Percent,-PercentC)%>%
  rename("Open Access"=oam_fac)

# Append the 2015 columns to the table accumulated in `tmp`
tmp3 = left_join(tmp1,tmp2)
tmp = left_join(tmp,tmp3)


# Sort in descending factor order so "Open Access" is listed before
# "Subscription", then print
tmp = tmp%>%arrange(desc(`Open Access`))
kable(tmp)

Open Access	# Journals (%)	Median # Citable Items per Journal 2013	# Citable Items 2013 (%)	# Citable Items 2013, Remove PLoS One (%)	Median # Citable Items per Journal 2014	# Citable Items 2014 (%)	# Citable Items 2014, Remove PLoS One (%)	Median # Citable Items per Journal 2015	# Citable Items 2015 (%)	# Citable Items 2015, Remove PLoS One (%)
Open Access	44 (13.8%)	199.5	43789 (33.6%)	12293 (12.4%)	207	45831 (35%)	15791 (15.6%)	222	47573 (36.5%)	19459 (19%)
Subscription	274 (86.2%)	246.5	86541 (66.4%)	86541 (87.6%)	240	85276 (65%)	85276 (84.4%)	242	82704 (63.5%)	82704 (81%)

2.10 TABLE X: DSM by Open Access

Open Access Mark (OAM) has two categories, Subscription and Open Access.

Here we present a table of the number of open access or subscription journals (number of citable items) within each DSM, as well as the percent of journals (percent of citable items) that are open access within each DSM. For DSM 1, we present the number and percent of citable items with and without PLoS One.

# obtain open access counts at journal level
# Cross-tabulate DSM code (rows) by open-access status (columns) and add the
# row-wise percent of journals that are open access. Column 2 of the table is
# "Open Access", the second level of oam_fac.
tmptable = with(jdata, table(dsm, oam_fac))
tmptable = cbind(tmptable, `% Open Access` = 100 * tmptable[, 2]/rowSums(tmptable))
tmptable = as_data_frame(tmptable)
# NOTE(review): add_column() is called without any new columns, so this line
# looks like a no-op -- confirm before removing.
tmptable = tmptable %>% add_column()

# obtain open access counts at citable item level

# Per DSM code: total 2013 citable items, and the parts from subscription and
# from open-access journals. A DSM code without open-access journals has no
# row in the third table, so its open_cit is NA after the join and is set to 0.
jcit = left_join(left_join(jdata %>% group_by(dsm) %>% summarize(tot_cit = sum(citable_2013)), 
    jdata %>% filter(oam_fac == "Subscription") %>% group_by(dsm) %>% summarize(sub_cit = sum(citable_2013))), 
    jdata %>% filter(oam_fac == "Open Access") %>% group_by(dsm) %>% summarize(open_cit = sum(citable_2013)))
jcit$open_cit[is.na(jcit$open_cit)] = 0
jcit$pct_open_cit = 100 * jcit$open_cit/jcit$tot_cit


# without Plos One
# Open-access and total citable items for DSM 1 only, excluding PLoS One.
# Both results are one-row, one-column tables; the total's column is also
# named `open_cit`.
open_cit = jdata %>% filter(Journal != "PLoS One", oam_fac == "Open Access", 
    dsm == 1) %>% summarize(open_cit = sum(citable_2013))
tot_cit = jdata %>% filter(Journal != "PLoS One", dsm == 1) %>% summarize(open_cit = sum(citable_2013))
pct_cit = round(100 * open_cit/tot_cit, 2)


# Display table: each cell is "journals (citable items)". Rows are assumed to
# line up by position across dsm_labels, tmptable and jcit (all in DSM order
# 1-6) -- this holds only when every DSM code occurs in the data.
printtab = data_frame(DSM = dsm_labels, Subscription = paste0(tmptable$Subscription, 
    " (", jcit$sub_cit, ")"), `Open Access` = paste0(tmptable$`Open Access`, 
    " (", jcit$open_cit, ")"), `% Open Access` = paste0(round(tmptable$`% Open Access`, 
    2), "% (", round(jcit$pct_open_cit, 2), "%)"))
# add in without Plos One #s
# The closing ")" of the first row's cells is replaced by "; <value>*)" so the
# without-PLoS-One figure appears inside the parentheses, marked with "*".
printtab[1, 3] = gsub(")", paste0("; ", open_cit, "*)"), printtab[1, 3])
printtab[1, 4] = gsub(")", paste0("; ", pct_cit, "%*)"), printtab[1, 4])


kable(printtab)

DSM	Subscription	Open Access	% Open Access
Required as condition of publication, barring exceptions	29 (7709)	9 (34960; 3464*)	23.68% (81.93%; 31%*)
Required but no explicit statement regarding effect on publication/editorial decisions	27 (11864)	2 (274)	6.9% (2.26%)
Explicitly encouraged/addressed, but not required.	63 (22884)	11 (2635)	14.86% (10.33%)
Mentioned indirectly	29 (8062)	0 (0)	0% (0%)
Only protein, proteomic, and/or genomic data sharing are addressed.	40 (17401)	7 (1938)	14.89% (10.02%)
No mention	86 (18621)	15 (3982)	14.85% (17.62%)

*After removing PLoS One

2.11 Table of 2 category DSM by Open Access

# Table of 2-category DSM by open access status.
# Each cell shows the number of journals and, in parentheses, the number of
# 2013 citable items published by those journals.

# obtain open access counts at journal level
tmptable <- with(jdata, table(dsm2, oam_fac))
tmptable <- cbind(
  tmptable,
  `% Open Access` = 100 * tmptable[, "Open Access"] / rowSums(tmptable)
)
tmptable <- as_tibble(tmptable)

# obtain open access counts at citable item level
cit_total <- jdata %>%
  group_by(dsm2) %>%
  summarize(tot_cit = sum(citable_2013))
cit_sub <- jdata %>%
  filter(oam_fac == "Subscription") %>%
  group_by(dsm2) %>%
  summarize(sub_cit = sum(citable_2013))
cit_open <- jdata %>%
  filter(oam_fac == "Open Access") %>%
  group_by(dsm2) %>%
  summarize(open_cit = sum(citable_2013))
jcit <- cit_total %>%
  left_join(cit_sub, by = "dsm2") %>%
  left_join(cit_open, by = "dsm2")
# a group with no journals of a given access type has no row to join,
# which leaves NA; report those as 0 citable items
jcit$sub_cit[is.na(jcit$sub_cit)] <- 0
jcit$open_cit[is.na(jcit$open_cit)] <- 0
jcit$pct_open_cit <- 100 * jcit$open_cit / jcit$tot_cit

# without Plos One (dsm2 == 1 only); kept as plain numbers
req_no_plos <- jdata %>% filter(Journal != "PLoS One", dsm2 == 1)
open_cit <- sum(filter(req_no_plos, oam_fac == "Open Access")$citable_2013)
tot_cit <- sum(req_no_plos$citable_2013)
pct_cit <- round(100 * open_cit / tot_cit, 2)

printtab <- tibble(
  DSM = levels(jdata$dsm2_fac),
  Subscription = paste0(tmptable$Subscription, " (", jcit$sub_cit, ")"),
  `Open Access` = paste0(tmptable$`Open Access`, " (", jcit$open_cit, ")"),
  `% Open Access` = paste0(
    round(tmptable$`% Open Access`, 2), "% (",
    round(jcit$pct_open_cit, 2), "%)"
  )
)
# sort labels in descending order; the without-PLoS-One figures below are
# attached to whichever row ends up first
printtab <- printtab %>% arrange(desc(DSM))
# add in without Plos One #s; the closing parenthesis is matched literally
printtab$`Open Access`[1] <- gsub(
  ")", paste0("; ", open_cit, "*)"), printtab$`Open Access`[1],
  fixed = TRUE
)
printtab$`% Open Access`[1] <- gsub(
  ")", paste0("; ", pct_cit, "%*)"), printtab$`% Open Access`[1],
  fixed = TRUE
)

kable(printtab)

DSM	Subscription	Open Access	% Open Access
Required	56 (19573)	11 (35234; 3738*)	16.42% (64.29%; 16.04%*)
Not Required	218 (66968)	33 (8555)	13.15% (11.33%)

*After removing PLoS One

2.12 TABLE X: Frequencies of Data Sharing Policies

# TABLE X: frequencies of the data sharing policy variables.

# make the "absent" category the reference (first) level of each two-level
# policy factor; note this modifies jdata for all later code
jdata$omic_data_sharing_fac <- relevel(jdata$omic_data_sharing_fac, ref = "No")
jdata$copyright_licensing_fac <- relevel(jdata$copyright_licensing_fac, ref = "No Mention")
jdata$archival_retention_fac <- relevel(jdata$archival_retention_fac, ref = "No Mention")
jdata$reproducibility_noted_fac <- relevel(jdata$reproducibility_noted_fac, ref = "No Mention")

varlist <- c(
  "omic_data_sharing_fac", "rec_pref_sharing_fac", "journal_hosts_fac",
  "copyright_licensing_fac", "archival_retention_fac", "reproducibility_noted_fac"
)
t1 <- CreateTableOne(varlist, data = jdata)
# capture the formatted table instead of printing it
x1 <- print(t1, printToggle = FALSE, noSpaces = FALSE)
tmp <- rownames(x1)
tmptab <- as_tibble(x1) %>% add_column(Variable = tmp, .before = "Overall")

# replace internal variable names with display labels (literal matches)
var_labels <- c(
  omic_data_sharing_fac = "Omic Data Sharing Required",
  rec_pref_sharing_fac = "Recommended Sharing Method",
  journal_hosts_fac = "Journal Hosting Limit",
  copyright_licensing_fac = "Copyright Licensing of Data",
  archival_retention_fac = "Archival Retention Policy",
  reproducibility_noted_fac = "Reproducibility or Analogous Concepts Noted as Purpose of Data Policy"
)
for (vname in names(var_labels)) {
  tmptab$Variable <- gsub(vname, var_labels[[vname]], tmptab$Variable, fixed = TRUE)
}

# append a percent sign inside every closing parenthesis of the Overall column
tmptab$Overall <- gsub(")", "%)", tmptab$Overall, fixed = TRUE)

kable(tmptab)

Variable	Overall
n	318
Omic Data Sharing Required = Yes (%)	131 (41.2%)
Recommended Sharing Method (%)
Public Online Repository	125 (39.3%)
Journal Hosted	45 (14.2%)
By Reader Request to Authors	4 ( 1.3%)
Multiple methods equally recommended	11 ( 3.5%)
Unspecified	133 (41.8%)
Journal Hosting Limit (%)
Journal will host regardless of size	1 ( 0.8%)
Journal has data hosting file/s size limit	8 ( 6.6%)
Unspecified	112 (92.6%)
Copyright Licensing of Data = Explicitly stated or mentioned (%)	16 ( 5.0%)
Archival Retention Policy = Explicitly stated (%)	3 ( 0.9%)
Reproducibility or Analogous Concepts Noted as Purpose of Data Policy = Explicitly stated (%)	54 (17.0%)

2.13 TABLE: Recommended Data Sharing Method by DSM 1-5

# Counts (row percentages) of recommended sharing method for DSM 1-5 plus a
# total row. Built with plain matrix operations so that no row is appended to,
# and no string is written into a numeric column of, a tibble.
tmpcounts <- unclass(with(jdata, table(dsm_fac, rec_pref_sharing_fac)))
# drop the last DSM level (the last entry of dsm_labels is "No mention")
tmpcounts <- tmpcounts[-nrow(tmpcounts), , drop = FALSE]
tmpcounts <- rbind(tmpcounts, Total = colSums(tmpcounts))
# percentages are taken within each row, including the total row
tmppct <- round(100 * tmpcounts / rowSums(tmpcounts), 2)

tmpcells <- matrix(
  paste0(tmpcounts, " (", tmppct, "%)"),
  nrow = nrow(tmpcounts),
  dimnames = list(NULL, colnames(tmpcounts))
)
tmp <- data.frame(
  dsm = c(seq_len(nrow(tmpcounts) - 1), "Total"),
  tmpcells,
  check.names = FALSE,
  stringsAsFactors = FALSE
)
kable(tmp)

dsm	Public Online Repository	Journal Hosted	By Reader Request to Authors	Multiple methods equally recommended	Unspecified
1	34 (89.47%)	0 (0%)	0 (0%)	4 (10.53%)	0 (0%)
2	23 (79.31%)	0 (0%)	2 (6.9%)	3 (10.34%)	1 (3.45%)
3	22 (29.73%)	44 (59.46%)	0 (0%)	4 (5.41%)	4 (5.41%)
4	1 (3.45%)	0 (0%)	2 (6.9%)	0 (0%)	26 (89.66%)
5	45 (95.74%)	1 (2.13%)	0 (0%)	0 (0%)	1 (2.13%)
Total	125 (57.6%)	45 (20.74%)	4 (1.84%)	11 (5.07%)	32 (14.75%)

2.14 FIGURE: Recommended Data Sharing Method by DSM 1-5

# Heat map of recommended sharing method (rows) by DSM 1-5 plus a total column.
# The long-format plotting data is built from the count matrix directly, so it
# does not depend on appending rows to a tibble or on implicit join keys.
tmpcounts <- unclass(with(jdata, table(dsm_fac, rec_pref_sharing_fac)))
# drop the last DSM level (the last entry of dsm_labels is "No mention")
tmpcounts <- tmpcounts[-nrow(tmpcounts), , drop = FALSE]
tmpcounts <- rbind(tmpcounts, colSums(tmpcounts))
# percentage of journals within each DSM row, including the total row
tmppct <- 100 * tmpcounts / rowSums(tmpcounts)
tmpdsm <- c(seq_len(nrow(tmpcounts) - 1), "Total (DSM 1-5)")

# matrices unroll column by column: dsm varies fastest, sharing method slowest
tmplong <- data.frame(
  dsm = rep(tmpdsm, times = ncol(tmpcounts)),
  type = rep(colnames(tmpcounts), each = nrow(tmpcounts)),
  label = paste0(tmpcounts, "\n(", round(tmppct, 1), "%)"),
  n = as.vector(tmpcounts),
  Percent = as.vector(tmppct),
  stringsAsFactors = FALSE
)
# fix the order of the sharing methods on the y axis and capitalise one label
tmplong$type <- factor(tmplong$type,
  levels = c(
    "Unspecified", "Multiple methods equally recommended",
    "By Reader Request to Authors", "Journal Hosted", "Public Online Repository"
  ),
  labels = c(
    "Unspecified", "Multiple Methods Equally Recommended",
    "By Reader Request to Authors", "Journal Hosted", "Public Online Repository"
  )
)

ggplot(tmplong, aes(dsm, type, fill = Percent)) +
  geom_tile(show.legend = TRUE) +
  geom_text(aes(label = label), color = "white") +
  theme_minimal() +
  ylab("") +
  xlab("Data Sharing Mark") +
  scale_fill_continuous(limits = c(0, 100))

2.15 FIGURE 3: Recommended data sharing method by data sharing mark (DSM) 1-5.

The number (percent) of journals with each recommended data sharing method is represented by each tile, with darker shades denoting higher percentages of journals with the given data sharing method.

# Figure 3: same heat map with a light-to-dark blue fill. Tiles under 25% are
# light, so their labels are drawn in black; all other labels are white.
tmplong$lowpct <- factor(tmplong$Percent < 25, levels = c(FALSE, TRUE))
blues <- RColorBrewer::brewer.pal(n = 7, name = "Blues")
ggplot(tmplong, aes(dsm, type, fill = Percent)) +
  geom_tile(show.legend = TRUE) +
  geom_text(aes(label = label, color = lowpct), show.legend = FALSE) +
  theme_minimal() +
  ylab("") +
  xlab("Data Sharing Mark") +
  # colours are named so the mapping holds even if only one level occurs
  scale_color_manual(values = c(`FALSE` = "White", `TRUE` = "Black")) +
  guides(color = "none", label = "none") +
  scale_fill_continuous(limits = c(0, 100), low = blues[1], high = blues[7])

# frequency (percent) of journals by omic data sharing requirement
tmptable <- data.frame(table(jdata$omic_data_sharing_fac, useNA = "ifany"))
colnames(tmptable) <- c(
  "Protein, Proteomic, Genomic Data Sharing Required with Deposit to Specific Data Banks",
  "# (%)"
)
# percentages are formatted to one decimal place
tmptable[, 2] <- paste0(
  tmptable[, 2], " (",
  sprintf("%.1f", 100 * tmptable[, 2] / sum(tmptable[, 2])), "%)"
)
kable(tmptable)

Protein, Proteomic, Genomic Data Sharing Required with Deposit to Specific Data Banks	# (%)
No	187 (58.8%)
Yes	131 (41.2%)

# frequency (percent) of journals by recommended sharing method
tmptable <- data.frame(table(jdata$rec_pref_sharing_fac, useNA = "ifany"))
colnames(tmptable) <- c("Recommended Sharing Method", "# (%)")
# percentages are formatted to one decimal place
tmptable[, 2] <- paste0(
  tmptable[, 2], " (",
  sprintf("%.1f", 100 * tmptable[, 2] / sum(tmptable[, 2])), "%)"
)
kable(tmptable)

Recommended Sharing Method	# (%)
Public Online Repository	125 (39.3%)
Journal Hosted	45 (14.2%)
By Reader Request to Authors	4 (1.3%)
Multiple methods equally recommended	11 (3.5%)
Unspecified	133 (41.8%)

# frequency (percent) of journals by journal hosting limit, missing values included
tmptable <- data.frame(table(jdata$journal_hosts_fac, useNA = "ifany"))
tmptable[, 1] <- as.character(tmptable[, 1])
colnames(tmptable) <- c("Journal Hosting Limit", "# (%)")
# label the missing-value row, if there is one, instead of assuming it is last
tmptable[is.na(tmptable[, 1]), 1] <- "N/A"
# percentages are formatted to one decimal place
tmptable[, 2] <- paste0(
  tmptable[, 2], " (",
  sprintf("%.1f", 100 * tmptable[, 2] / sum(tmptable[, 2])), "%)"
)
kable(tmptable)

Journal Hosting Limit	# (%)
Journal will host regardless of size	1 (0.3%)
Journal has data hosting file/s size limit	8 (2.5%)
Unspecified	112 (35.2%)
N/A	197 (61.9%)

# frequency (percent) of journals by copyright/licensing of data
tmptable <- data.frame(table(jdata$copyright_licensing_fac, useNA = "ifany"))
colnames(tmptable) <- c("Copyright Licensing of Data", "# (%)")
# percentages are formatted to one decimal place
tmptable[, 2] <- paste0(
  tmptable[, 2], " (",
  sprintf("%.1f", 100 * tmptable[, 2] / sum(tmptable[, 2])), "%)"
)
kable(tmptable)

Copyright Licensing of Data	# (%)
No Mention	302 (95.0%)
Explicitly stated or mentioned	16 (5.0%)

# frequency (percent) of journals by archival/retention policy
tmptable <- data.frame(table(jdata$archival_retention_fac, useNA = "ifany"))
colnames(tmptable) <- c("Archival Retention Policy", "# (%)")
# percentages are formatted to one decimal place
tmptable[, 2] <- paste0(
  tmptable[, 2], " (",
  sprintf("%.1f", 100 * tmptable[, 2] / sum(tmptable[, 2])), "%)"
)
kable(tmptable)

Archival Retention Policy	# (%)
No Mention	315 (99.1%)
Explicitly stated	3 (0.9%)

# frequency (percent) of journals by whether reproducibility is noted
tmptable <- data.frame(table(jdata$reproducibility_noted_fac, useNA = "ifany"))
colnames(tmptable) <- c(
  "Reproducibility or Analogous Concepts Noted as Purpose of Data Policy",
  "# (%)"
)
# percentages are formatted to one decimal place
tmptable[, 2] <- paste0(
  tmptable[, 2], " (",
  sprintf("%.1f", 100 * tmptable[, 2] / sum(tmptable[, 2])), "%)"
)
kable(tmptable)

Reproducibility or Analogous Concepts Noted as Purpose of Data Policy	# (%)
No Mention	264 (83.0%)
Explicitly stated	54 (17.0%)

Journal Hosting by Recommended Sharing Policy

# cross-tabulate journal hosting limit (rows) against recommended sharing
# method (columns), keeping missing values as their own category
tmptable <- with(jdata, table(journal_hosts_fac, rec_pref_sharing_fac, useNA = "ifany"))
# label the missing-value row, if there is one, instead of assuming it is last
rownames(tmptable)[is.na(rownames(tmptable))] <- "N/A"
kable(tmptable)

	Public Online Repository	Journal Hosted	By Reader Request to Authors	Multiple methods equally recommended	Unspecified
Journal will host regardless of size	0	1	0	0	0
Journal has data hosting file/s size limit	2	4	0	2	0
Unspecified	17	38	2	8	47
N/A	106	2	2	1	86

2.16 Distribution of Impact Factor - Normality Tests

First we must determine whether IF or some transformation of IF is normally distributed. We can visually assess this by plotting histograms and density plots.

# check normality: density and histogram of raw impact factor, by year
ggplot(jdata_long, aes(impact_factor, fill = year)) +
  geom_density(alpha = 0.4) +
  theme_minimal() +
  xlab("Impact Factor") +
  ggtitle("Density of IF by Year")
ggplot(jdata_long, aes(impact_factor, fill = year)) +
  geom_histogram(alpha = 0.4, bins = 40) +
  theme_minimal() +
  xlab("Impact Factor") +
  ggtitle("Histogram of IF by Year")

# check normality of log: the same two plots for log(impact factor)
ggplot(jdata_long, aes(log(impact_factor), fill = year)) +
  geom_density(alpha = 0.4) +
  theme_minimal() +
  xlab("log(IF)") +
  ggtitle("Density of log(IF) by Year")
# the x axis shows log(IF), so it is labelled like the density plot above
ggplot(jdata_long, aes(log(impact_factor), fill = year)) +
  geom_histogram(alpha = 0.4, bins = 40) +
  theme_minimal() +
  xlab("log(IF)") +
  ggtitle("Histogram of log(IF) by Year")

We can also test the Normality assumption with the Shapiro Wilk’s Test. We can test IF by year, as well as log(IF). We can also test within each DSM group since the main assumption for ANOVA is that the outcome (IF) is normal within each group.

# Shapiro-Wilk normality tests of raw and log impact factor for 2013 and 2014,
# one row per tested series
if_series <- list(
  `IF 2013` = jdata$if_2013,
  `IF 2014` = jdata$if_2014,
  `log(IF) 2013` = log(jdata$if_2013),
  `log(IF) 2014` = log(jdata$if_2014)
)
shapiro_rows <- lapply(if_series, function(v) broom::tidy(shapiro.test(v)))
norm_test1 <- data.frame(
  Y = names(if_series),
  do.call(rbind, unname(shapiro_rows))
)
# store p-values as text with two significant digits
norm_test1$p.value <- as.character(signif(norm_test1$p.value, 2))
kable(norm_test1, digits = 4, caption = "Shapiro Wilk Normality Test p-values of IF")

Shapiro Wilk Normality Test p-values of IF
Y	statistic	p.value	method
IF 2013	0.6353	1.7e-25	Shapiro-Wilk normality test
IF 2014	0.6315	1.6e-25	Shapiro-Wilk normality test
log(IF) 2013	0.9105	8.3e-13	Shapiro-Wilk normality test
log(IF) 2014	0.9467	2.8e-09	Shapiro-Wilk normality test

# check normality within dsm groups: Shapiro-Wilk p-values of raw and log
# impact factor (2013, 2014) computed separately for each DSM category
norm_test <- jdata %>%
  group_by(dsm_fac) %>%
  summarize(
    if_2013.pvalue = shapiro.test(if_2013)$p.value,
    if_2014.pvalue = shapiro.test(if_2014)$p.value,
    log_if_2013.pvalue = shapiro.test(log(if_2013))$p.value,
    log_if_2014.pvalue = shapiro.test(log(if_2014))$p.value
  )
# convert every p-value column to text with two significant digits
norm_test[-1] <- lapply(norm_test[-1], function(p) as.character(signif(p, 2)))

kable(norm_test, digits = 4, caption = "Shapiro Wilk Normality Test p-values of IF within DSM Groups")

Shapiro Wilk Normality Test p-values of IF within DSM Groups
dsm_fac	if_2013.pvalue	if_2014.pvalue	log_if_2013.pvalue	log_if_2014.pvalue
Required as condition of publication, barring exceptions	6e-06	8.6e-06	0.75	0.95
Required but no explicit statement regarding effect on publication/editorial decisions	9.5e-05	4.4e-05	0.18	0.15
Explicitly encouraged/addressed, but not required.	1.4e-10	9.4e-11	0.0042	0.0042
Mentioned indirectly	0.0016	0.00063	0.78	0.66
Only protein, proteomic, and/or genomic data sharing are addressed.	0.00016	1e-04	0.036	0.095
No mention	3.2e-06	2.6e-07	1.7e-10	1.7e-07

The p-values from the Shapiro Wilk’s test are mostly very significant (hence we reject the normality assumption) and in general Impact Factor appears to be quite skewed so nonparametric tests (Wilcoxon, Kruskal-Wallis) will be more appropriate. Taking the log helps somewhat but does not solve the problem. Furthermore log-IF is much less interpretable. Hence, we use nonparametric tests to compare distributions.

2.17 Impact Factor vs. Data Sharing Mark

We wish to assess how Impact Factor differs between journals with different data sharing types.

2.17.1 Summarize Impact Factor Within Groups

We can describe the minimum, maximum, mean, and median of Impact factor as well as total citations for each DSM type.

# number of journals plus min / mean / median / max of impact factor and of
# total cites, for every DSM category and year (missing values ignored)
tmpsum <- jdata_long %>%
  group_by(dsm, dsm_fac, year) %>%
  summarize(
    `Number of Journals` = n(),
    min_IF = min(impact_factor, na.rm = TRUE),
    mean_IF = mean(impact_factor, na.rm = TRUE),
    median_IF = median(impact_factor, na.rm = TRUE),
    max_IF = max(impact_factor, na.rm = TRUE),
    min_TotalCites = min(total_cites, na.rm = TRUE),
    mean_TotalCites = mean(total_cites, na.rm = TRUE),
    median_TotalCites = median(total_cites, na.rm = TRUE),
    max_TotalCites = max(total_cites, na.rm = TRUE)
  ) %>%
  rename(DSM = dsm_fac)
kable(tmpsum, digits = 2)

dsm	DSM	year	Number of Journals	min_IF	mean_IF	median_IF	max_IF	min_TotalCites	mean_TotalCites	median_TotalCites	max_TotalCites
1	Required as condition of publication, barring exceptions	2013	38	2.25	10.09	8.22	42.35	374	43076.95	19967.5	590324
1	Required as condition of publication, barring exceptions	2014	38	1.99	9.87	8.08	41.46	748	48221.05	23867.0	617363
1	Required as condition of publication, barring exceptions	2015	38	2.11	9.43	7.47	38.14	681	35612.34	15361.0	627846
2	Required but no explicit statement regarding effect on publication/editorial decisions	2013	29	2.00	9.66	6.34	33.12	979	62583.83	15492.0	565934
2	Required but no explicit statement regarding effect on publication/editorial decisions	2014	29	1.82	9.53	5.62	33.61	1448	65363.17	17231.0	586144
2	Required but no explicit statement regarding effect on publication/editorial decisions	2015	29	1.53	9.43	5.30	34.66	2080	55740.28	11614.0	593284
3	Explicitly encouraged/addressed, but not required.	2013	74	2.13	4.27	3.88	16.75	335	17758.84	9696.0	167915
3	Explicitly encouraged/addressed, but not required.	2014	74	2.12	4.24	3.82	17.57	577	18691.88	10228.5	173265
3	Explicitly encouraged/addressed, but not required.	2015	74	2.05	4.18	3.79	17.30	410	11499.77	8038.0	76694
4	Mentioned indirectly	2013	29	1.09	3.61	3.29	9.92	1491	11688.97	6787.0	47233
4	Mentioned indirectly	2014	29	1.24	3.49	3.10	9.57	1798	12351.10	7306.0	48946
4	Mentioned indirectly	2015	29	1.52	3.47	3.20	8.98	1975	8548.17	5989.0	37148
5	Only protein, proteomic, and/or genomic data sharing are addressed.	2013	47	1.32	5.65	5.01	13.91	440	38265.09	20478.0	406586
5	Only protein, proteomic, and/or genomic data sharing are addressed.	2014	47	1.52	5.52	4.66	12.52	557	38497.47	20513.0	396051
5	Only protein, proteomic, and/or genomic data sharing are addressed.	2015	47	1.68	5.20	4.32	12.48	676	18308.17	8389.0	143465
6	No mention	2013	101	0.07	3.94	3.48	11.98	115	8053.62	4693.0	46347
6	No mention	2014	101	0.22	3.90	3.50	12.41	384	8660.03	5078.0	45541
6	No mention	2015	101	0.22	3.85	3.48	14.81	410	6494.86	4832.0	32282

# keep the six-category summary under its own name; tmpsum is reassigned by
# the two-category summary later in this file
tmpsum_IF_within_DSM = tmpsum

We can also visualize the distribution with a boxplot of IF by DSM type.

Boxplot description: The lower and upper “hinges” correspond to the first and third quartiles (the 25th and 75th percentiles). The upper whisker extends from the hinge to the highest value that is within 1.5 * IQR of the hinge, where IQR is the inter-quartile range, or distance between the first and third quartiles. The lower whisker extends from the hinge to the lowest value within 1.5 * IQR of the hinge. Data beyond the end of the whiskers are outliers and plotted as points (as specified by Tukey). (from geom_boxplot help page in the R ggplot2 package.) The Triangle represents the mean.

# Boxplot of impact factor by DSM for 2013 and 2014 only; boxes for the two
# years are dodged side by side and distinguished by transparency, and the
# open triangle (shape 2) marks the group mean.
jdata_long %>%
  filter(year != 2015) %>%
  ggplot(aes(x = dsm, y = impact_factor, fill = dsm_fac, alpha = year)) +
  geom_boxplot(position = position_dodge(0.9)) +
  theme_minimal() +
  scale_alpha_manual(values = c(0.3, 0.7)) +
  stat_summary(
    aes(x = dsm, y = impact_factor, shape = "triangle"),
    fun.y = mean, geom = "point", position = position_dodge(0.9)
  ) +
  scale_shape_manual(values = 2) +
  guides(
    alpha = "none", shape = "none",
    fill = guide_legend(title = "Data Sharing Mark", ncol = 1)
  ) +
  xlab("Data Sharing Mark (1-6), Year (2013 light, 2014 dark)") +
  ylab("Impact Factor") +
  # ggtitle('Boxplot of Impact Factor by Data Sharing Mark') +
  scale_x_continuous(breaks = 1:6) +
  theme(legend.position = "bottom")

# Same boxplot using all three years (2013-2015), hence three alpha values.
jdata_long %>%
  ggplot(aes(x = dsm, y = impact_factor, fill = dsm_fac, alpha = year)) +
  geom_boxplot(position = position_dodge(0.9)) +
  theme_minimal() +
  scale_alpha_manual(values = c(0.3, 0.5, 0.8)) +
  stat_summary(
    aes(x = dsm, y = impact_factor, shape = "triangle"),
    fun.y = mean, geom = "point", position = position_dodge(0.9)
  ) +
  scale_shape_manual(values = 2) +
  guides(
    alpha = "none", shape = "none",
    fill = guide_legend(title = "Data Sharing Mark", ncol = 1)
  ) +
  xlab("Data Sharing Mark (1-6), Year (2013-2015 light-dark)") +
  ylab("Impact Factor") +
  # ggtitle('Boxplot of Impact Factor by Data Sharing Mark') +
  scale_x_continuous(breaks = 1:6) +
  theme(legend.position = "bottom")

2.18 FIGURE 2: Impact factors were higher for journals with the strongest data sharing policies (DSM 1) compared to journals with no mention of data sharing (DSM 6).

The median Impact Factor was calculated for the journals with each data sharing mark for each report year (light color=2013, dark color=2014). The lower and upper hinges of the boxplots represent the first and third quartiles of journal Impact Factor, the horizontal line represents the median, the triangle represents the mean, and the upper and lower whiskers extend from the hinge to the highest (lowest) value that is within 1.5 times the interquartile range of the hinge, with journals outside this range represented as points.

# Figure 2: impact factor by DSM for 2013 and 2014, filled with a blue palette.
tmplong <- jdata_long %>% filter(year != 2015)
dsm_levels <- levels(tmplong$dsm_fac)
# reversed copy of the DSM factor, used as the fill variable so the six blues
# are assigned in the opposite order of the DSM levels
tmplong$dsm_fac_reverse <- factor(tmplong$dsm_fac, levels = rev(dsm_levels))
# the lightest of the seven blues is skipped
blues <- RColorBrewer::brewer.pal(n = 7, name = "Blues")[2:7]

ggplot(
  tmplong,
  aes(x = dsm, y = impact_factor, alpha = year, fill = dsm_fac_reverse)
) +
  geom_boxplot(position = position_dodge(0.9)) +
  theme_minimal() +
  scale_alpha_manual(values = c(0.7, 0.9)) +
  # open triangle (shape 2) marks the group mean
  stat_summary(
    aes(x = dsm, y = impact_factor, shape = "triangle"),
    fun.y = mean, geom = "point", position = position_dodge(0.9)
  ) +
  scale_shape_manual(values = 2) +
  # legend entries are listed in DSM order 1-6 and prefixed with the DSM number
  scale_fill_manual(
    values = blues,
    breaks = dsm_levels,
    labels = paste(seq_along(dsm_levels), dsm_levels)
  ) +
  guides(
    alpha = "none", shape = "none",
    fill = guide_legend(title = "Data Sharing Mark", ncol = 1)
  ) +
  xlab("Data Sharing Mark (1-6), Year (2013 light, 2014 dark)") +
  ylab("Impact Factor") +
  # ggtitle("Boxplot of Impact Factor by Data Sharing Mark") +
  scale_x_continuous(breaks = 1:6) +
  theme(legend.position = "bottom")

# Boxplot of citable items per journal by DSM and year, leaving out PLoS One;
# the open triangle (shape 2) marks the group mean and no legend is drawn.
jdata_long %>%
  filter(!Journal == "PLoS One") %>%
  ggplot(aes(x = dsm, y = citable_items, fill = dsm_fac, alpha = year)) +
  geom_boxplot(position = position_dodge(0.9)) +
  theme_minimal() +
  scale_alpha_manual(values = c(0.3, 0.5, 0.8)) +
  stat_summary(
    aes(x = dsm, y = citable_items, shape = "triangle"),
    fun.y = mean, geom = "point", position = position_dodge(0.9)
  ) +
  scale_shape_manual(values = 2) +
  guides(alpha = "none", shape = "none", fill = "none") +
  xlab("Data Sharing Mark (1-6), Year (2013-2015 light-dark)") +
  # ggtitle('Number of Citable Items per Journal by Data Sharing Mark') +
  ylab("Number of Citable Items per Journal") +
  scale_x_continuous(breaks = 1:6) +
  theme(legend.position = "bottom")

Collapsing DSM into two categories:

# number of journals plus min / mean / median / max of impact factor and of
# total cites, for each two-category DSM group and year (missing values ignored)
tmpsum <- jdata_long %>%
  group_by(dsm2_fac, year) %>%
  summarize(
    `Number of Journals` = n(),
    min_IF = min(impact_factor, na.rm = TRUE),
    mean_IF = mean(impact_factor, na.rm = TRUE),
    median_IF = median(impact_factor, na.rm = TRUE),
    max_IF = max(impact_factor, na.rm = TRUE),
    min_TotalCites = min(total_cites, na.rm = TRUE),
    mean_TotalCites = mean(total_cites, na.rm = TRUE),
    median_TotalCites = median(total_cites, na.rm = TRUE),
    max_TotalCites = max(total_cites, na.rm = TRUE)
  ) %>%
  rename(DSM = dsm2_fac)
kable(tmpsum, digits = 2)

DSM	year	Number of Journals	min_IF	mean_IF	median_IF	max_IF	min_TotalCites	mean_TotalCites	median_TotalCites	max_TotalCites
Not Required	2013	251	0.07	4.32	3.99	16.75	115	16992.07	7897	406586
Not Required	2014	251	0.22	4.26	3.88	17.57	384	17703.23	8490	396051
Not Required	2015	251	0.22	4.16	3.75	17.30	410	10463.00	6914	143465
Required	2013	67	2.00	9.91	6.79	42.35	374	51520.22	17144	590324
Required	2014	67	1.82	9.72	7.05	41.46	748	55640.78	18098	617363
Required	2015	67	1.53	9.43	6.40	38.14	681	44324.43	14322	627846

# keep the two-category summary under its own name, mirroring
# tmpsum_IF_within_DSM for the six-category summary
tmpsum_IF_within_DSM2 = tmpsum

# Boxplot of impact factor by two-category DSM and year (2013-2015).
# The mean markers (open triangle, shape 2) use the same x variable as the
# boxes, dsm2_fac_flip, so both layers are positioned by one factor.
# NOTE(review): dsm2_fac_flip is defined outside this section; presumably it is
# dsm2_fac with a different level order - confirm.
ggplot(
  jdata_long,
  aes(x = dsm2_fac_flip, y = impact_factor, fill = dsm2_fac_flip, alpha = year)
) +
  geom_boxplot(position = position_dodge(0.9)) +
  theme_minimal() +
  scale_alpha_manual(values = c(0.3, 0.5, 0.8)) +
  stat_summary(
    aes(x = dsm2_fac_flip, y = impact_factor, shape = "triangle"),
    fun.y = mean, geom = "point", position = position_dodge(0.9)
  ) +
  scale_shape_manual(values = 2) +
  guides(
    alpha = "none", shape = "none",
    fill = guide_legend(title = "Data Sharing")
  ) +
  xlab("Data Sharing Required (No, Yes), Year (2013-2015 light-dark)") +
  # ggtitle('Boxplot of Impact Factor by Data Sharing Requirement') +
  ylab("Impact Factor")

# Same layout for citable items per journal, leaving out PLoS One.
ggplot(
  jdata_long %>% filter(!Journal == "PLoS One"),
  aes(x = dsm2_fac_flip, y = citable_items, fill = dsm2_fac_flip, alpha = year)
) +
  geom_boxplot(position = position_dodge(0.9)) +
  theme_minimal() +
  stat_summary(
    aes(x = dsm2_fac_flip, y = citable_items, shape = "triangle"),
    fun.y = mean, geom = "point", position = position_dodge(0.9)
  ) +
  scale_alpha_manual(values = c(0.3, 0.5, 0.8)) +
  scale_shape_manual(values = 2) +
  guides(
    alpha = "none", shape = "none",
    fill = guide_legend(title = "Data Sharing")
  ) +
  xlab("Data Sharing Required (No, Yes), Year (2013-2015 light-dark)") +
  # ggtitle('Number of Citable Items per Journal by Data Sharing Requirement') +
  ylab("Number of Citable Items per Journal")

3 Analysis: Journal Impact Factor, Open Access, Data Sharing Mark

3.1 Methods

Continuous variables are summarized with medians and interquartile ranges (IQRs) denoting the 25th and 75th percentiles. Categorical variables are summarized with counts and percentages. The variables Impact Factor and total citable items are not normally distributed (Shapiro Wilk’s Test p-values < 0.001) so medians are presented instead of means, and nonparametric methods are used for statistical tests.

The association of Impact Factor (IF) with 6-level data sharing mark (DSM) was tested with a nonparametric Kruskal-Wallis one-way analysis of variance (ANOVA) of IF in 2013 and 2014 with DSM as a grouping factor. Post-hoc pairwise two-sample Wilcoxon tests were used to determine which pairs of DSM categories differ in median IF, and a two-sample Wilcoxon test was used to determine whether the median IF differs between the two-level data sharing policy (required vs. not required) categories. P-values from the pairwise Wilcoxon tests were adjusted for multiple comparisons with the Holm procedure.

Pearson’s chi-square test was used to test the association of data sharing policy (two levels: required vs not required) and open access status. Fisher’s Exact Test was used to test the association of the 6-level DSM with open access status. Fisher’s Test was used as opposed to Chi-square test due to the low number of Open Access journals within some DSM categories. To examine the association of open access status and data sharing weighted by publishing volume we examined the number of citable items in each category and tested for the association of open access and data sharing with Pearson’s chi-square test.

All statistical analyses were performed with R version 3.2.1 (2015-06-18). All code and data to reproduce these results can be found on github (https://github.com/OHSU-Ontology-Development-Group/DataSharingPolicies).

3.2 Notes

The Kruskal-Wallis test is a nonparametric version of ANOVA that tests whether the distribution of IF varies between DSM groups.
The Wilcoxon test is testing the difference in medians between two groups when the distributions of the outcome (IF in this case) are the same. Based on the boxplots and densities the distributions do look similar within DSM group so we are comfortable making inferences on the medians.

3.3 IF ~ DSM

We perform a Kruskal Wallis test for the difference in median JIF between 6 category DSM.

We can also collapse DSM into two categories and perform a Wilcoxon test for difference in median JIF between required vs not required data sharing.

# Kruskal-Wallis tests of impact factor across the six DSM categories, by year
k_if13 <- kruskal.test(if_2013 ~ dsm, data = jdata)
k_if14 <- kruskal.test(if_2014 ~ dsm, data = jdata)
k_if15 <- kruskal.test(if_2015 ~ dsm, data = jdata)

# pairwise Wilcoxon tests between DSM categories; "holm" is the function's
# default adjustment, spelled out here
w_if13 <- with(jdata, pairwise.wilcox.test(if_2013, dsm, p.adjust.method = "holm"))
w_if14 <- with(jdata, pairwise.wilcox.test(if_2014, dsm, p.adjust.method = "holm"))
w_if15 <- with(jdata, pairwise.wilcox.test(if_2015, dsm, p.adjust.method = "holm"))

# Wilcoxon tests of impact factor between the two collapsed DSM categories
w_if13_2cat <- wilcox.test(if_2013 ~ dsm2, data = jdata)
w_if14_2cat <- wilcox.test(if_2014 ~ dsm2, data = jdata)
w_if15_2cat <- wilcox.test(if_2015 ~ dsm2, data = jdata)

# kw results: one row per test (six-category Kruskal-Wallis first, then the
# two-category Wilcoxon tests)
if_tests <- list(
  if_2013_6catDSM = k_if13,
  if_2014_6catDSM = k_if14,
  if_2015_6catDSM = k_if15,
  if_2013_2catDSM = w_if13_2cat,
  if_2014_2catDSM = w_if14_2cat,
  if_2015_2catDSM = w_if15_2cat
)
kw_results <- data.frame(
  Y = names(if_tests),
  bind_rows(unname(lapply(if_tests, broom::tidy)))
)
# store p-values as text with two significant digits
kw_results$p.value <- as.character(signif(kw_results$p.value, 2))
kw_results <- rename(kw_results, `degrees of freedom` = parameter)
kable(kw_results, digits = 2)

Y	statistic	p.value	degrees of freedom	method	alternative
if_2013_6catDSM	61.77	5.2e-12	5	Kruskal-Wallis rank sum test	NA
if_2014_6catDSM	60.04	1.2e-11	5	Kruskal-Wallis rank sum test	NA
if_2015_6catDSM	52.23	4.8e-10	5	Kruskal-Wallis rank sum test	NA
if_2013_2catDSM	4339.00	1.2e-09	NA	Wilcoxon rank sum test with continuity correction	two.sided
if_2014_2catDSM	4279.50	9.5e-10	NA	Wilcoxon rank sum test with continuity correction	two.sided
if_2015_2catDSM	4268.00	1.2e-09	NA	Wilcoxon rank sum test with continuity correction	two.sided

The pairwise Wilcoxon test p-values are below (by DSM group number) for 2013, 2014, and 2015. The p-values are adjusted for multiple comparisons with the holm method.

Significant p-values suggest that the median JIF is different for journals in the two DSM categories.

Pairwise Wilcoxon p-values for IF 2013 between DSM:

# Pairwise Wilcoxon p-values for 2013, shown with two significant digits.
# Thresholds are checked on the unrounded p-values so that rounding cannot
# move a value across a cut-off.
pvals_13 <- w_if13$p.value
# bold the cells with p < 0.05
signifind <- which(pvals_13 < 0.05, arr.ind = TRUE)
emphasize.strong.cells(signifind)
signif_13 <- signif(pvals_13, 2)
# the first character assignment turns the whole matrix into text
signif_13[which(pvals_13 < 0.001, arr.ind = TRUE)] <- "< 0.001"
# blank out the cells that hold no p-value (NA in the p-value matrix)
signif_13[which(is.na(pvals_13), arr.ind = TRUE)] <- ""
pandoc.table(signif_13)

	1	2	3	4	5
2	0.86
3	< 0.001	0.034
4	< 0.001	0.0072	0.23
5	0.04	0.86	0.0022	< 0.001
6	< 0.001	0.0033	0.69	0.86	< 0.001

Pairwise Wilcoxon p-values for IF 2014 between DSM:

# Pairwise Wilcoxon p-values for 2014, shown with two significant digits.
# Thresholds are checked on the unrounded p-values so that rounding cannot
# move a value across a cut-off.
pvals_14 <- w_if14$p.value
# bold the cells with p < 0.05
signifind <- which(pvals_14 < 0.05, arr.ind = TRUE)
emphasize.strong.cells(signifind)
signif_14 <- signif(pvals_14, 2)
# the first character assignment turns the whole matrix into text
signif_14[which(pvals_14 < 0.001, arr.ind = TRUE)] <- "< 0.001"
# blank out the cells that hold no p-value (NA in the p-value matrix)
signif_14[which(is.na(pvals_14), arr.ind = TRUE)] <- ""
pandoc.table(signif_14)

	1	2	3	4	5
2	0.82
3	< 0.001	0.034
4	< 0.001	0.0058	0.17
5	0.016	0.82	0.0062	< 0.001
6	< 0.001	0.0035	0.82	0.82	< 0.001

Pairwise Wilcoxon p-values for IF 2015 between DSM:

# Pairwise Wilcoxon p-values for 2015, shown with two significant digits.
# Thresholds are checked on the unrounded p-values so that rounding cannot
# move a value across a cut-off.
pvals_15 <- w_if15$p.value
# bold the cells with p < 0.05
signifind <- which(pvals_15 < 0.05, arr.ind = TRUE)
emphasize.strong.cells(signifind)
signif_15 <- signif(pvals_15, 2)
# the first character assignment turns the whole matrix into text
signif_15[which(pvals_15 < 0.001, arr.ind = TRUE)] <- "< 0.001"
# blank out the cells that hold no p-value (NA in the p-value matrix)
signif_15[which(is.na(pvals_15), arr.ind = TRUE)] <- ""
pandoc.table(signif_15)

	1	2	3	4	5
2	0.89
3	< 0.001	0.051
4	< 0.001	0.0062	0.15
5	0.0098	0.42	0.14	0.012
6	< 0.001	0.0051	0.42	0.89	0.0062

3.3.1 Results

Impact factor is significantly associated with the six category data sharing mark (Kruskal-Wallis rank sum test, 5 df, p < 0.001, 2013 and 2014). Examining pairwise differences between DSMs we see that journals with DSM 1 have significantly higher JIF than journals with DSM 3, 4, 5, or 6 (Wilcoxon test, p < 0.001, < 0.001, 0.04, < 0.001; 2013 data, 2014 similar). Journals with DSM 2 have significantly higher JIF than journals with DSM 3, 4, or 6 (Wilcoxon test, p = 0.034, 0.0072, 0.0033; 2013 data, 2014 similar). Journals with DSM 5 have significantly higher JIF than journals with DSM 3, 4, and 6 (Wilcoxon test, p = 0.0022, < 0.001, < 0.001; 2013 data, 2014 similar). In general, IF is not significantly different between DSM 1&2 and DSM 2&5, reflecting the similar JIF for journals with data sharing requirements, either full or partial sharing. After collapsing DSM into two categories, required (DSM 1-2) and not required (DSM 3-6) we still see a highly significant increase in JIF for journals with required data sharing (Wilcoxon Rank Sum Test, p < 0.001, 2013 and 2014 data). The median JIFs for DSM 1-6 are 8.22, 6.34, 3.88, 3.29, 5.01, 3.48 in 2013 and 8.08, 5.62, 3.82, 3.1, 4.66, 3.5 in 2014. The median JIFs for the collapsed two category DSM required and not required are 6.79, 3.99 in 2013 and 7.05, 3.88 in 2014.

Results for 2015 are similar, except DSM 3 and 5 are no longer significantly different.

3.4 Kruskal-Wallis test for year effect

A Kruskal-Wallis test was performed for year effect with impact factor and number of citable items. There were no significant differences in the distribution of impact factor or citable items by year, either in the entire data set or in subsets defined by DSM group.

3.4.1 Impact Factor

# Kruskal-Wallis test of impact factor across years for one set of rows
kw_year_if <- function(d) {
  broom::tidy(kruskal.test(impact_factor ~ as.factor(year), data = d))
}

# whole data set first, then each DSM group 1-6 separately
tmp1 <- kw_year_if(jdata_long)
tmp2 <- bind_rows(lapply(1:6, function(k) kw_year_if(filter(jdata_long, dsm == k))))
tmp <- bind_rows(tmp1, tmp2)
tmp <- cbind(DSM = c("All", paste0("DSM = ", 1:6)), tmp)
kable(tmp, digits = 3)

DSM	statistic	p.value	parameter	method
All	1.286	0.526	2	Kruskal-Wallis rank sum test
DSM = 1	0.186	0.911	2	Kruskal-Wallis rank sum test
DSM = 2	0.055	0.973	2	Kruskal-Wallis rank sum test
DSM = 3	0.260	0.878	2	Kruskal-Wallis rank sum test
DSM = 4	0.225	0.894	2	Kruskal-Wallis rank sum test
DSM = 5	3.514	0.173	2	Kruskal-Wallis rank sum test
DSM = 6	0.207	0.902	2	Kruskal-Wallis rank sum test

3.4.2 Citable Items

# Kruskal-Wallis test of citable items across years for one set of rows
kw_year_cit <- function(d) {
  broom::tidy(kruskal.test(citable_items ~ as.factor(year), data = d))
}

# whole data set first, then each DSM group 1-6 separately
tmp1 <- kw_year_cit(jdata_long)
tmp2 <- bind_rows(lapply(1:6, function(k) kw_year_cit(filter(jdata_long, dsm == k))))
tmp <- bind_rows(tmp1, tmp2)
tmp <- cbind(DSM = c("All", paste0("DSM = ", 1:6)), tmp)
kable(tmp, digits = 3)

DSM	statistic	p.value	parameter	method
All	0.019	0.990	2	Kruskal-Wallis rank sum test
DSM = 1	0.051	0.975	2	Kruskal-Wallis rank sum test
DSM = 2	0.135	0.935	2	Kruskal-Wallis rank sum test
DSM = 3	0.293	0.864	2	Kruskal-Wallis rank sum test
DSM = 4	0.584	0.747	2	Kruskal-Wallis rank sum test
DSM = 5	0.331	0.848	2	Kruskal-Wallis rank sum test
DSM = 6	0.002	0.999	2	Kruskal-Wallis rank sum test

3.5 Open Access vs. DSM

3.5.1 Fisher’s Exact Test for 6 category DSM vs. OAM

The Fisher’s Exact Test for DSM vs. OAM tests for the independence of the categories of DSM and OAM (unordered).

Table of counts for 6 category DSM and two category OAM.

# Cross-tabulate open access status (rows) against the 6-category DSM (columns).
tab1 <- table(oam_fac = jdata$oam_fac, dsm = jdata$dsm)
tab1 %>% kable()

	1	2	3	4	5	6
Subscription	29	27	63	29	40	86
Open Access	9	2	11	0	7	15

Table of proportion with Open Access in each DSM category:

# Second row of tab1 divided by the column totals: proportion of journals in
# that row within each DSM category, rounded to two decimals.
kable(
  tibble(
    DSM = dsm_labels,
    `Proportion Open Access` = round(tab1[2, ] / colSums(tab1), 2)
  )
)

DSM	Proportion Open Access
Required as condition of publication, barring exceptions	0.24
Required but no explicit statement regarding effect on publication/editorial decisions	0.07
Explicitly encouraged/addressed, but not required.	0.15
Mentioned indirectly	0.00
Only protein, proteomic, and/or genomic data sharing are addressed.	0.15
No mention	0.15

We can test for the association of DSM and OAM with Fisher’s Exact Test. The Test result is below:

# Fisher's Exact Test on the 2 x 6 OAM-by-DSM table, tidied to one row.
fishres <- tab1 %>%
  fisher.test() %>%
  broom::tidy()
fishres %>% kable()

p.value	method	alternative
0.0697445	Fisher’s Exact Test for Count Data	two.sided

3.5.2 Chi-Square test for 2 category DSM vs. OAM

Collapsing the categories into a 2x2 table makes the test hypothesis and result easier to interpret. When we collapse DSM into two categories (required vs. not required) there are more counts in each cell so we do not need to use the Fisher’s Exact Test but instead can use a Chi-square test (commonly used for large samples).

The number of journals in each of the 2x2 table categories are below:

# 2 x 2 table: open access status (rows) by collapsed DSM requirement (columns).
tab2 <- table(oam_fac = jdata$oam_fac, dsm2_fac = jdata$dsm2_fac)
tab2 %>% kable()

	Not Required	Required
Subscription	218	56
Open Access	33	11

Table of proportion with Open Access in each DSM category:

# Mean of the oam indicator within each collapsed DSM category; grouping
# under the name DSM gives the display column its final header directly.
jdata %>%
  group_by(DSM = dsm2_fac) %>%
  summarize(`Proportion  Open Access` = mean(oam)) %>%
  kable(digits = 3)

DSM	Proportion Open Access
Not Required	0.131
Required	0.164

Table of proportion data sharing required in each Open Access category:

# Mean of the dsm2 indicator within each open access category; grouping
# under the name OAM gives the display column its final header directly.
jdata %>%
  group_by(OAM = oam_fac) %>%
  summarize(`Proportion  DSM Required` = mean(dsm2)) %>%
  kable(digits = 3)

OAM	Proportion DSM Required
Subscription	0.204
Open Access	0.250

# Chi-square test on the 2 x 2 table; `parameter` holds the degrees of freedom.
chires <- tab2 %>%
  chisq.test() %>%
  broom::tidy() %>%
  rename(df = parameter)
chires %>% kable(digits = 3)

statistic	p.value	df	method
0.24	0.624	1	Pearson’s Chi-squared test with Yates’ continuity correction

3.5.3 Results

The Fisher’s Exact test is testing the hypothesis that open access status is associated with data sharing mark. The test is not significant (Fisher’s Exact Test, p = 0.07), which suggests that the proportion of open access journals is not significantly different across data sharing mark categories.

The Chi-square test is testing the hypothesis that open access status is associated with data sharing requirement (two categories DSM 1-2 vs DSM 3-6). The test is not significant (Chi-square Test, df=1, p = 0.62), which suggests that journals with data sharing requirements are not any more likely to be open access than journals without data sharing requirements. Also, open access journals are not more likely to have data sharing requirements than subscription journals. This is further supported by the evidence that the proportion of open access journals is similar for data sharing “required” vs. “non-required” journals.

4 Publishing Volume

4.1 All Journals

Here we determine how data sharing and open access are related once incorporating publishing volume. In this case, we are considering the “citable item” as the unit of measurement as opposed to journal. In other words, we ask, if we are given a citable item that is open access, is it more likely to have data sharing requirements than a citable item that is subscription based?

4.1.1 Summary of Number of Citable Items by DSM and OAM

In 2013, the total number of citable items in the set of studied journals was 130330, in 2014 it was 131107 and in 2015 it was 130277.

Summary of citable items in 2013/2014/2015:

# Number of journals and total citable items per year and 6-category DSM.
tmp <- jdata_long %>%
  group_by(year, DSM = dsm) %>%
  summarize(
    `Num Journals` = n(),
    `Total Citable` = sum(citable_items, na.rm = TRUE)
  )
kable(tmp, caption = "Citable Items by DSM")

Citable Items by DSM
year	DSM	Num Journals	Total Citable
2013	1	38	42669
2013	2	29	12138
2013	3	74	25519
2013	4	29	8062
2013	5	47	19339
2013	6	101	22603
2014	1	38	42794
2014	2	29	12436
2014	3	74	26026
2014	4	29	7894
2014	5	47	19080
2014	6	101	22877
2015	1	38	40870
2015	2	29	14233
2015	3	74	26731
2015	4	29	7928
2015	5	47	17734
2015	6	101	22781

# Number of journals and total citable items per year and collapsed DSM.
tmp <- jdata_long %>%
  group_by(year, DSM = dsm2_fac) %>%
  summarize(
    `Num Journals` = n(),
    `Total Citable` = sum(citable_items, na.rm = TRUE)
  )
kable(tmp, caption = "Citable Items by Required/Not Required DSM")

Citable Items by Required/Not Required DSM
year	DSM	Num Journals	Total Citable
2013	Not Required	251	75523
2013	Required	67	54807
2014	Not Required	251	75877
2014	Required	67	55230
2015	Not Required	251	75174
2015	Required	67	55103

# Number of journals and total citable items per year and open access status.
tmp <- jdata_long %>%
  group_by(year, OAM = oam_fac) %>%
  summarize(
    `Num Journals` = n(),
    `Total Citable` = sum(citable_items, na.rm = TRUE)
  )
kable(tmp, caption = "Citable Items by Open Access")

Citable Items by Open Access
year	OAM	Num Journals	Total Citable
2013	Subscription	274	86541
2013	Open Access	44	43789
2014	Subscription	274	85276
2014	Open Access	44	45831
2015	Subscription	274	82704
2015	Open Access	44	47573

Summary with proportions:

# Per year and DSM: journal count, total citable items, and the share of
# those citable items that come from rows with oam == 1.
tmp <- jdata_long %>%
  group_by(year, DSM = dsm) %>%
  summarize(
    `Num Journals` = n(),
    `Total Citable` = sum(citable_items, na.rm = TRUE),
    `Proportion Citable Items Open Access` =
      sum(citable_items[oam == 1], na.rm = TRUE) /
        sum(citable_items, na.rm = TRUE)
  )
kable(tmp, digits = 3)

year	DSM	Num Journals	Total Citable	Proportion Citable Items Open Access
2013	1	38	42669	0.819
2013	2	29	12138	0.023
2013	3	74	25519	0.103
2013	4	29	8062	0.000
2013	5	47	19339	0.100
2013	6	101	22603	0.176
2014	1	38	42794	0.818
2014	2	29	12436	0.086
2014	3	74	26026	0.136
2014	4	29	7894	0.000
2014	5	47	19080	0.117
2014	6	101	22877	0.174
2015	1	38	40870	0.812
2015	2	29	14233	0.232
2015	3	74	26731	0.149
2015	4	29	7928	0.000
2015	5	47	17734	0.115
2015	6	101	22781	0.223

# Per year and collapsed DSM: journal count, total citable items, and the
# share of those citable items that come from rows with oam == 1.
tmp <- jdata_long %>%
  group_by(year, DSM = dsm2_fac) %>%
  summarize(
    `Num Journals` = n(),
    `Total Citable` = sum(citable_items, na.rm = TRUE),
    `Proportion Citable Items Open Access` =
      sum(citable_items[oam == 1], na.rm = TRUE) /
        sum(citable_items, na.rm = TRUE)
  )
kable(tmp, digits = 3)

year	DSM	Num Journals	Total Citable	Proportion Citable Items Open Access
2013	Not Required	251	75523	0.113
2013	Required	67	54807	0.643
2014	Not Required	251	75877	0.129
2014	Required	67	55230	0.653
2015	Not Required	251	75174	0.148
2015	Required	67	55103	0.662

# Per year and open access status: journal count, total citable items, and
# the share of those citable items that come from rows with dsm2 == 1.
tmp <- jdata_long %>%
  group_by(year, OAM = oam_fac) %>%
  summarize(
    `Num Journals` = n(),
    `Total Citable` = sum(citable_items, na.rm = TRUE),
    `Proportion Citable Items Required DSM` =
      sum(citable_items[dsm2 == 1], na.rm = TRUE) /
        sum(citable_items, na.rm = TRUE)
  )
kable(tmp, digits = 3)

year	OAM	Num Journals	Total Citable	Proportion Citable Items Required DSM
2013	Subscription	274	86541	0.226
2013	Open Access	44	43789	0.805
2014	Subscription	274	85276	0.225
2014	Open Access	44	45831	0.787
2015	Subscription	274	82704	0.225
2015	Open Access	44	47573	0.767

4.1.2 Chi-square analysis - Citable Item

When we weight the number of open access journals and required data sharing journals by total citable items within each category there is a significant association between open access and data sharing requirement at the citable item level. That is, a citable item that is open access is much more likely to also have a data sharing requirement. This is mainly due to the fact that although the number of journals who have these open access or data sharing requirements is smaller than the number of journals that do not, the total citable articles within those journals is much larger. The p-values for the chi-square test at the citable item level are <2e-16, very significant.

Chi-square test results:

# First four totals in tmp, filled column-wise into a 2 x 2 table
# (rows = OAM levels, columns = DSM requirement levels).
# NOTE(review): assumes rows 1-4 of tmp are the 2013 totals ordered by
# dsm2_fac then oam_fac -- confirm how tmp is built before this chunk.
tmpm <- data.frame(matrix(tmp$`Total Citable`[1:4], ncol = 2))
dimnames(tmpm) <- list(levels(jdata$oam_fac), levels(jdata$dsm2_fac))
tmpm %>% kable(caption = "Citable Items by OAM and DSM, 2013")

Citable Items by OAM and DSM, 2013
	Not Required	Required
Subscription	66968	19573
Open Access	8555	35234

# Chi-square test on the 2 x 2 table; p-values below 2e-16 are shown as a
# "< 2e-16" label instead of a number.
tmpb <- tmpm %>%
  chisq.test() %>%
  broom::tidy()
if (tmpb$p.value < 2e-16) {
  tmpb$p.value <- "< 2e-16"
}
tmpb %>% kable()

statistic	p.value	parameter	method
39924.41	< 2e-16	1	Pearson’s Chi-squared test with Yates’ continuity correction

# Totals 5-8 in tmp, filled column-wise into a 2 x 2 table
# (rows = OAM levels, columns = DSM requirement levels).
# NOTE(review): assumes rows 5-8 of tmp are the 2014 totals ordered by
# dsm2_fac then oam_fac -- confirm how tmp is built before this chunk.
tmpm <- data.frame(matrix(tmp$`Total Citable`[1:4 + 4], ncol = 2))
dimnames(tmpm) <- list(levels(jdata$oam_fac), levels(jdata$dsm2_fac))
tmpm %>% kable(caption = "Citable Items by OAM and DSM, 2014")

Citable Items by OAM and DSM, 2014
	Not Required	Required
Subscription	66115	19161
Open Access	9762	36069

# Chi-square test on the 2 x 2 table; p-values below 2e-16 are shown as a
# "< 2e-16" label instead of a number.
tmpb <- tmpm %>%
  chisq.test() %>%
  broom::tidy()
if (tmpb$p.value < 2e-16) {
  tmpb$p.value <- "< 2e-16"
}
tmpb %>% kable()

statistic	p.value	parameter	method
38658.64	< 2e-16	1	Pearson’s Chi-squared test with Yates’ continuity correction

# Totals 9-12 in tmp, filled column-wise into a 2 x 2 table
# (rows = OAM levels, columns = DSM requirement levels).
# NOTE(review): assumes rows 9-12 of tmp are the 2015 totals ordered by
# dsm2_fac then oam_fac -- confirm how tmp is built before this chunk.
tmpm <- data.frame(matrix(tmp$`Total Citable`[1:4 + 8], ncol = 2))
colnames(tmpm) <- levels(jdata$dsm2_fac)
rownames(tmpm) <- levels(jdata$oam_fac)
# Caption corrected: this is the third year block (offset 8), i.e. 2015;
# it was previously labelled 2014.
kable(tmpm, caption = "Citable Items by OAM and DSM, 2015")

Citable Items by OAM and DSM, 2015
	Not Required	Required
Subscription	64067	18637
Open Access	11107	36466

# Chi-square test on the 2 x 2 table; p-values below 2e-16 are shown as a
# "< 2e-16" label instead of a number.
tmpb <- tmpm %>%
  chisq.test() %>%
  broom::tidy()
if (tmpb$p.value < 2e-16) {
  tmpb$p.value <- "< 2e-16"
}
tmpb %>% kable()

statistic	p.value	parameter	method
36238.57	< 2e-16	1	Pearson’s Chi-squared test with Yates’ continuity correction

Chi-square test results for DSM1-6 vs Open Access:

2013:

# Citable-item totals for every observed year x DSM (1-6) x OAM combination.
tmp <- jdata_long %>%
  group_by(year, dsm, oam_fac) %>%
  summarize(
    `Num Journals` = n(),
    `Total Citable` = sum(citable_items, na.rm = TRUE)
  ) %>%
  as.data.frame()

# No open access journal has DSM 4, so that combination is absent from the
# summary in every year. Add an explicit zero row for each year. Previously
# only 2013 and 2014 were padded, leaving 2015 with 11 values instead of 12;
# matrix(nrow = 2) then recycled them and misaligned the 2 x 6 table.
for (yr in c(2013, 2014, 2015)) {
  tmp <- add_row(tmp,
    year = yr, dsm = 4, oam_fac = "Open Access",
    `Num Journals` = 0, `Total Citable` = 0
  )
}
tmp <- tmp %>% arrange(year, dsm, oam_fac)

# 2013: 2 x 6 table, one DSM category per column, one OAM level per row.
tmpm <- data.frame(matrix(tmp %>% filter(year == 2013) %>% ungroup %>%
  select(`Total Citable`) %>% unlist, nrow = 2))
tmpb <- broom::tidy(chisq.test(tmpm))
# Show p-values below 2e-16 as a label instead of a number.
if (tmpb$p.value < 2e-16) tmpb$p.value <- "< 2e-16"
kable(tmpb)

statistic	p.value	parameter	method
67706.64	< 2e-16	5	Pearson’s Chi-squared test

2014:

# 2014: citable-item totals reshaped to a 2 x 6 table (two rows per DSM
# column, in the row order of tmp), then tested with a chi-square test.
cells_2014 <- tmp %>%
  filter(year == 2014) %>%
  ungroup() %>%
  select(`Total Citable`) %>%
  unlist()
tmpm <- data.frame(matrix(cells_2014, nrow = 2))
tmpb <- tmpm %>%
  chisq.test() %>%
  broom::tidy()
# Show p-values below 2e-16 as a label instead of a number.
if (tmpb$p.value < 2e-16) {
  tmpb$p.value <- "< 2e-16"
}
tmpb %>% kable()

statistic	p.value	parameter	method
62151.23	< 2e-16	5	Pearson’s Chi-squared test

2015:

# 2015: build the OAM x DSM table with xtabs() so that any combination
# missing from tmp (no open access journal has DSM 4) becomes an explicit
# zero cell. Reshaping the raw vector with matrix(nrow = 2) recycled a
# too-short vector here and misaligned the cells.
tmpm <- xtabs(`Total Citable` ~ oam_fac + dsm,
  data = tmp %>% filter(year == 2015)
)
tmpb <- broom::tidy(chisq.test(tmpm))
# Show p-values below 2e-16 as a label instead of a number.
if (tmpb$p.value < 2e-16) tmpb$p.value <- "< 2e-16"
kable(tmpb)

statistic	p.value	parameter	method
45573.45	< 2e-16	5	Pearson’s Chi-squared test

4.2 Trend test for proportion of citable items with required data sharing within open access categories across years

# Citable-item totals per year, open access status and DSM requirement flag.
tmp <- jdata_long %>%
  group_by(year, oam_fac, dsm2) %>%
  summarize(
    `Num Journals` = n(),
    `Total Citable` = sum(citable_items, na.rm = TRUE)
  )

# within OA: per-year totals with (dsm2 == 1) and without (dsm2 == 0)
# required data sharing, then equality-of-proportions and trend tests.
tmpreq <- with(filter(tmp, oam_fac == "Open Access", dsm2 == 1), `Total Citable`)
tmpnot <- with(filter(tmp, oam_fac == "Open Access", dsm2 == 0), `Total Citable`)
tmptot <- tmpreq + tmpnot

tmpres_oa <- prop.test(tmpreq, tmptot)
tmpres_oa_trend <- prop.trend.test(tmpreq, tmptot)


# within subscription: same two tests.
tmpreq <- with(filter(tmp, oam_fac == "Subscription", dsm2 == 1), `Total Citable`)
tmpnot <- with(filter(tmp, oam_fac == "Subscription", dsm2 == 0), `Total Citable`)
tmptot <- tmpreq + tmpnot

tmpres_sub_trend <- prop.trend.test(tmpreq, tmptot)
tmpres_sub <- prop.test(tmpreq, tmptot)


# Row 1 = subscription, row 2 = open access.
tmpres <- bind_rows(broom::tidy(tmpres_sub), broom::tidy(tmpres_oa))
tmpres_trend <- bind_rows(
  broom::tidy(tmpres_sub_trend),
  broom::tidy(tmpres_oa_trend)
)

# The first three tidy columns are relabelled by year and kept next to the
# trend-test results; this is the "with PLoS One" version.
colnames(tmpres)[1:3] <- c("2013", "2014", "2015")
tmpres_withPLOS <- cbind(tmpres[, 1:3], tmpres_trend)

# Same totals as the preceding analysis, with PLoS One excluded.
tmp <- jdata_long %>%
  filter(Journal != "PLoS One") %>%
  group_by(year, oam_fac, dsm2) %>%
  summarize(
    `Num Journals` = n(),
    `Total Citable` = sum(citable_items, na.rm = TRUE)
  )

# within OA: per-year totals with (dsm2 == 1) and without (dsm2 == 0)
# required data sharing, then equality-of-proportions and trend tests.
tmpreq <- with(filter(tmp, oam_fac == "Open Access", dsm2 == 1), `Total Citable`)
tmpnot <- with(filter(tmp, oam_fac == "Open Access", dsm2 == 0), `Total Citable`)
tmptot <- tmpreq + tmpnot

tmpres_oa <- prop.test(tmpreq, tmptot)
tmpres_oa_trend <- prop.trend.test(tmpreq, tmptot)


# within subscription: same two tests.
tmpreq <- with(filter(tmp, oam_fac == "Subscription", dsm2 == 1), `Total Citable`)
tmpnot <- with(filter(tmp, oam_fac == "Subscription", dsm2 == 0), `Total Citable`)
tmptot <- tmpreq + tmpnot

tmpres_sub_trend <- prop.trend.test(tmpreq, tmptot)
tmpres_sub <- prop.test(tmpreq, tmptot)


# Row 1 = subscription, row 2 = open access.
tmpres <- bind_rows(broom::tidy(tmpres_sub), broom::tidy(tmpres_oa))
tmpres_trend <- bind_rows(
  broom::tidy(tmpres_sub_trend),
  broom::tidy(tmpres_oa_trend)
)

# The first three tidy columns are relabelled by year and kept next to the
# trend-test results.
colnames(tmpres)[1:3] <- c("2013", "2014", "2015")
tmpres <- cbind(tmpres[, 1:3], tmpres_trend)

When looking at the open access journals including PLoS One, we can see that the proportion of data sharing required citable items is decreasing from 2013 to 2015 within open access journals (0.8, 0.79, 0.77, $p = 9.3 \times 10^{-45}$).

After removing PLoS One, we can see that the proportion of data sharing required citable items is increasing from 2013 to 2015 within open access journals (0.3, 0.38, 0.43, $p = 1.4 \times 10^{-108}$).

However, this proportion does not increase within subscription journals (0.23, 0.22, 0.23, p=0.68).

The results are below:

# Stack the with/without PLoS One results (two rows each) and label them.
plos_label <- rep(c("withPLOS", "withoutPLOS"), each = 2)
trend_rows <- bind_rows(tmpres_withPLOS, tmpres)
kable(cbind(PLOS = plos_label, trend_rows), digits = 3)

PLOS	2013	2014	2015	statistic	p.value	parameter	method
withPLOS	0.226	0.225	0.225	0.171	0.679	1	Chi-squared Test for Trend in Proportions
withPLOS	0.805	0.787	0.767	197.023	0.000	1	Chi-squared Test for Trend in Proportions
withoutPLOS	0.226	0.225	0.225	0.171	0.679	1	Chi-squared Test for Trend in Proportions
withoutPLOS	0.304	0.382	0.429	490.105	0.000	1	Chi-squared Test for Trend in Proportions

4.3 Remove PLoS One

PLoS One may be skewing the results since it has such high volume, so we try the above analysis after removing this journal.

# Keep the full long-format data under a new name, then drop PLoS One from
# the working copy used by the rest of this section.
jdata_long0 <- jdata_long
jdata_long <- jdata_long0 %>% filter(Journal != "PLoS One")

4.3.1 Summary of Number of Citable Items by DSM and OAM

In 2013, the total number of citable items in the set of studied journals (removing PLoS One) was 98834, in 2014 it was 101067, and in 2015 it was 102163.

Summary of citable items in 2013/2014/2015:

# Number of journals and total citable items per year and 6-category DSM.
tmp <- jdata_long %>%
  group_by(year, DSM = dsm) %>%
  summarize(
    `Num Journals` = n(),
    `Total Citable` = sum(citable_items, na.rm = TRUE)
  )
kable(tmp, caption = "Citable Items by DSM")

Citable Items by DSM
year	DSM	Num Journals	Total Citable
2013	1	37	11173
2013	2	29	12138
2013	3	74	25519
2013	4	29	8062
2013	5	47	19339
2013	6	101	22603
2014	1	37	12754
2014	2	29	12436
2014	3	74	26026
2014	4	29	7894
2014	5	47	19080
2014	6	101	22877
2015	1	37	12756
2015	2	29	14233
2015	3	74	26731
2015	4	29	7928
2015	5	47	17734
2015	6	101	22781

# Number of journals and total citable items per year and collapsed DSM.
tmp <- jdata_long %>%
  group_by(year, DSM = dsm2_fac) %>%
  summarize(
    `Num Journals` = n(),
    `Total Citable` = sum(citable_items, na.rm = TRUE)
  )
kable(tmp, caption = "Citable Items by Required/Not Required DSM")

Citable Items by Required/Not Required DSM
year	DSM	Num Journals	Total Citable
2013	Not Required	251	75523
2013	Required	66	23311
2014	Not Required	251	75877
2014	Required	66	25190
2015	Not Required	251	75174
2015	Required	66	26989

# Number of journals and total citable items per year and open access status.
tmp <- jdata_long %>%
  group_by(year, OAM = oam_fac) %>%
  summarize(
    `Num Journals` = n(),
    `Total Citable` = sum(citable_items, na.rm = TRUE)
  )
kable(tmp, caption = "Citable Items by Open Access")

Citable Items by Open Access
year	OAM	Num Journals	Total Citable
2013	Subscription	274	86541
2013	Open Access	43	12293
2014	Subscription	274	85276
2014	Open Access	43	15791
2015	Subscription	274	82704
2015	Open Access	43	19459

Summary with proportions:

# Per year and DSM: journal count, total citable items, and the share of
# those citable items that come from rows with oam == 1.
tmp <- jdata_long %>%
  group_by(year, DSM = dsm) %>%
  summarize(
    `Num Journals` = n(),
    `Total Citable` = sum(citable_items, na.rm = TRUE),
    `Proportion Citable Items Open Access` =
      sum(citable_items[oam == 1], na.rm = TRUE) /
        sum(citable_items, na.rm = TRUE)
  )
kable(tmp, digits = 3)

year	DSM	Num Journals	Total Citable	Proportion Citable Items Open Access
2013	1	37	11173	0.310
2013	2	29	12138	0.023
2013	3	74	25519	0.103
2013	4	29	8062	0.000
2013	5	47	19339	0.100
2013	6	101	22603	0.176
2014	1	37	12754	0.389
2014	2	29	12436	0.086
2014	3	74	26026	0.136
2014	4	29	7894	0.000
2014	5	47	19080	0.117
2014	6	101	22877	0.174
2015	1	37	12756	0.396
2015	2	29	14233	0.232
2015	3	74	26731	0.149
2015	4	29	7928	0.000
2015	5	47	17734	0.115
2015	6	101	22781	0.223

# Per year and collapsed DSM: journal count, total citable items, and the
# share of those citable items that come from rows with oam == 1.
tmp <- jdata_long %>%
  group_by(year, DSM = dsm2_fac) %>%
  summarize(
    `Num Journals` = n(),
    `Total Citable` = sum(citable_items, na.rm = TRUE),
    `Proportion Citable Items Open Access` =
      sum(citable_items[oam == 1], na.rm = TRUE) /
        sum(citable_items, na.rm = TRUE)
  )
kable(tmp, digits = 3)

year	DSM	Num Journals	Total Citable	Proportion Citable Items Open Access
2013	Not Required	251	75523	0.113
2013	Required	66	23311	0.160
2014	Not Required	251	75877	0.129
2014	Required	66	25190	0.239
2015	Not Required	251	75174	0.148
2015	Required	66	26989	0.309

# Per year and open access status: journal count, total citable items, and
# the share of those citable items that come from rows with dsm2 == 1.
tmp <- jdata_long %>%
  group_by(year, OAM = oam_fac) %>%
  summarize(
    `Num Journals` = n(),
    `Total Citable` = sum(citable_items, na.rm = TRUE),
    `Proportion Citable Items Required DSM` =
      sum(citable_items[dsm2 == 1], na.rm = TRUE) /
        sum(citable_items, na.rm = TRUE)
  )
kable(tmp, digits = 3)

year	OAM	Num Journals	Total Citable	Proportion Citable Items Required DSM
2013	Subscription	274	86541	0.226
2013	Open Access	43	12293	0.304
2014	Subscription	274	85276	0.225
2014	Open Access	43	15791	0.382
2015	Subscription	274	82704	0.225
2015	Open Access	43	19459	0.429

4.3.2 Chi-square analysis - Citable Item

The results are still highly significant, likely due to the large number of citable items, but an open access article is still more likely to have been published under data sharing requirements than a subscription article. In 2013, 30.4% of open access articles had data sharing requirements as opposed to 22.6% for subscription articles. In 2014 and 2015 it was 38.2% vs. 22.5% and 42.9% vs. 22.5%.

# Journal counts and citable-item totals per year and open access status.
tmp <- jdata_long %>%
  group_by(year, oam_fac) %>%
  summarize(
    `Num Journals` = n(),
    `Total Citable` = sum(citable_items, na.rm = TRUE)
  )
tmp %>% kable()

year	oam_fac	Num Journals	Total Citable
2013	Subscription	274	86541
2013	Open Access	43	12293
2014	Subscription	274	85276
2014	Open Access	43	15791
2015	Subscription	274	82704
2015	Open Access	43	19459

# Journal counts and citable-item totals per year and collapsed DSM.
tmp <- jdata_long %>%
  group_by(year, dsm2_fac) %>%
  summarize(
    `Num Journals` = n(),
    `Total Citable` = sum(citable_items, na.rm = TRUE)
  )
tmp %>% kable()

year	dsm2_fac	Num Journals	Total Citable
2013	Not Required	251	75523
2013	Required	66	23311
2014	Not Required	251	75877
2014	Required	66	25190
2015	Not Required	251	75174
2015	Required	66	26989

# Journal counts and citable-item totals per year, collapsed DSM and open
# access status. The chi-square chunks that follow index tmp by row position
# (four rows per year).
tmp <- jdata_long %>%
  group_by(year, dsm2_fac, oam_fac) %>%
  summarize(
    `Num Journals` = n(),
    `Total Citable` = sum(citable_items, na.rm = TRUE)
  )
tmp %>% kable()

year	dsm2_fac	oam_fac	Num Journals	Total Citable
2013	Not Required	Subscription	218	66968
2013	Not Required	Open Access	33	8555
2013	Required	Subscription	56	19573
2013	Required	Open Access	10	3738
2014	Not Required	Subscription	218	66115
2014	Not Required	Open Access	33	9762
2014	Required	Subscription	56	19161
2014	Required	Open Access	10	6029
2015	Not Required	Subscription	218	64067
2015	Not Required	Open Access	33	11107
2015	Required	Subscription	56	18637
2015	Required	Open Access	10	8352

Chi-square test results:

# Rows 1-4 of tmp (2013), filled column-wise into a 2 x 2 table:
# rows = OAM levels, columns = DSM requirement levels.
tmpm <- data.frame(matrix(tmp$`Total Citable`[1:4], ncol = 2))
dimnames(tmpm) <- list(levels(jdata$oam_fac), levels(jdata$dsm2_fac))
tmpm %>% kable(caption = "Citable Items by OAM and DSM, 2013")

Citable Items by OAM and DSM, 2013
	Not Required	Required
Subscription	66968	19573
Open Access	8555	3738

# Chi-square test on the 2 x 2 table; p-values below 2e-16 are shown as a
# "< 2e-16" label instead of a number.
tmpb <- tmpm %>%
  chisq.test() %>%
  broom::tidy()
if (tmpb$p.value < 2e-16) {
  tmpb$p.value <- "< 2e-16"
}
tmpb %>% kable()

statistic	p.value	parameter	method
362.0439	< 2e-16	1	Pearson’s Chi-squared test with Yates’ continuity correction

# Rows 5-8 of tmp (2014), filled column-wise into a 2 x 2 table:
# rows = OAM levels, columns = DSM requirement levels.
tmpm <- data.frame(matrix(tmp$`Total Citable`[1:4 + 4], ncol = 2))
dimnames(tmpm) <- list(levels(jdata$oam_fac), levels(jdata$dsm2_fac))
tmpm %>% kable(caption = "Citable Items by OAM and DSM, 2014")

Citable Items by OAM and DSM, 2014
	Not Required	Required
Subscription	66115	19161
Open Access	9762	6029

# Chi-square test on the 2 x 2 table; p-values below 2e-16 are shown as a
# "< 2e-16" label instead of a number.
tmpb <- tmpm %>%
  chisq.test() %>%
  broom::tidy()
if (tmpb$p.value < 2e-16) {
  tmpb$p.value <- "< 2e-16"
}
tmpb %>% kable()

statistic	p.value	parameter	method
1756.647	< 2e-16	1	Pearson’s Chi-squared test with Yates’ continuity correction

# Rows 9-12 of tmp (2015), filled column-wise into a 2 x 2 table:
# rows = OAM levels, columns = DSM requirement levels.
tmpm <- data.frame(matrix(tmp$`Total Citable`[1:4 + 8], ncol = 2))
dimnames(tmpm) <- list(levels(jdata$oam_fac), levels(jdata$dsm2_fac))
tmpm %>% kable(caption = "Citable Items by OAM and DSM, 2015")

Citable Items by OAM and DSM, 2015
	Not Required	Required
Subscription	64067	18637
Open Access	11107	8352

# Chi-square test on the 2 x 2 table; p-values below 2e-16 are shown as a
# "< 2e-16" label instead of a number.
tmpb <- tmpm %>%
  chisq.test() %>%
  broom::tidy()
if (tmpb$p.value < 2e-16) {
  tmpb$p.value <- "< 2e-16"
}
tmpb %>% kable()

statistic	p.value	parameter	method
3366.925	< 2e-16	1	Pearson’s Chi-squared test with Yates’ continuity correction

Chi-square test results for DSM1-6 vs Open Access:

2013:

# Citable-item totals for every observed year x DSM (1-6) x OAM combination.
tmp <- jdata_long %>%
  group_by(year, dsm, oam_fac) %>%
  summarize(
    `Num Journals` = n(),
    `Total Citable` = sum(citable_items, na.rm = TRUE)
  ) %>%
  as.data.frame()

# No open access journal has DSM 4, so that combination is absent from the
# summary in every year. Add an explicit zero row for each year. Previously
# only 2013 and 2014 were padded, leaving 2015 with 11 values instead of 12;
# matrix(nrow = 2) then recycled them and misaligned the 2 x 6 table.
for (yr in c(2013, 2014, 2015)) {
  tmp <- add_row(tmp,
    year = yr, dsm = 4, oam_fac = "Open Access",
    `Num Journals` = 0, `Total Citable` = 0
  )
}
tmp <- tmp %>% arrange(year, dsm, oam_fac)

# 2013: 2 x 6 table, one DSM category per column, one OAM level per row.
tmpm <- data.frame(matrix(tmp %>% filter(year == 2013) %>% ungroup %>%
  select(`Total Citable`) %>% unlist, nrow = 2))
tmpb <- broom::tidy(chisq.test(tmpm))
# Show p-values below 2e-16 as a label instead of a number.
if (tmpb$p.value < 2e-16) tmpb$p.value <- "< 2e-16"
kable(tmpb)

statistic	p.value	parameter	method
6601.23	< 2e-16	5	Pearson’s Chi-squared test

2014:

# 2014: citable-item totals reshaped to a 2 x 6 table (two rows per DSM
# column, in the row order of tmp), then tested with a chi-square test.
cells_2014 <- tmp %>%
  filter(year == 2014) %>%
  ungroup() %>%
  select(`Total Citable`) %>%
  unlist()
tmpm <- data.frame(matrix(cells_2014, nrow = 2))
tmpb <- tmpm %>%
  chisq.test() %>%
  broom::tidy()
# Show p-values below 2e-16 as a label instead of a number.
if (tmpb$p.value < 2e-16) {
  tmpb$p.value <- "< 2e-16"
}
tmpb %>% kable()

statistic	p.value	parameter	method
7512.572	< 2e-16	5	Pearson’s Chi-squared test

2015:

# 2015: build the OAM x DSM table with xtabs() so that any combination
# missing from tmp (no open access journal has DSM 4) becomes an explicit
# zero cell. Reshaping the raw vector with matrix(nrow = 2) recycled a
# too-short vector here and misaligned the cells.
tmpm <- xtabs(`Total Citable` ~ oam_fac + dsm,
  data = tmp %>% filter(year == 2015)
)
tmpb <- broom::tidy(chisq.test(tmpm))
# Show p-values below 2e-16 as a label instead of a number.
if (tmpb$p.value < 2e-16) tmpb$p.value <- "< 2e-16"
kable(tmpb)

statistic	p.value	parameter	method
33259.23	< 2e-16	5	Pearson’s Chi-squared test

4.4 Results

Of the 318 journals examined, 38 (11.9%) required data sharing as a condition of publication, barring exceptions and 29 (9.1%) required data sharing but made no explicit statement regarding the effect on publication and editorial decisions. 74 (23.3%) journals explicitly encouraged or addressed data sharing, but did not require it. And, 47 (14.8%) journals only addressed data sharing for proteomic, genomic data, or other specific structural data.

In order to understand the potential influence of the policies on the published literature, we also evaluated the distribution of policies by publication volume. In 2013, the total number of citable items (papers) in the studied journals was 98834 (after removing PLoS One), in 2014 the total number of citable items was 101067 and in 2015 the total number of citable items was 102163,

with median numbers of citable items per journal of 242, 236, and 240, respectively.

5 Sample Representation of All Journals

We can compare sampled journals to non-sampled journals (“out of sample”) or to the entire set of journals (“All Journals”).

# Load the "2014 ESI Data" sheet; cells containing "N/A" are read as missing.
jdata_all <- read_excel(
  "../Data-Sharing-Policies_2017-01-25.xlsx",
  sheet = "2014 ESI Data",
  na = "N/A"
)

# Rename columns to match the names used for the sampled journals.
jdata_all <- jdata_all %>%
  rename(
    Journal = `Full Journal Title`,
    if_2014 = `Journal Impact Factor`,
    citable_2014 = `Citable Items`,
    total_cites_2014 = `Total Cites`
  )

# Sampled journals with the same measures, tagged as "Sample".
jdata_sample <- jdata %>%
  select(Journal, ISSN, if_2014, citable_2014, total_cites_2014) %>%
  mutate(in_sample = "Sample")

Note, these journals are in the sampled data but not in the “full data”, but they have no data anyway:

# Count sampled journals that do / do not appear in the full journal list.
table(is.element(jdata_sample$Journal, jdata_all$Journal))

## 
## FALSE  TRUE 
##     2   316

# List the sampled journals that are absent from the full journal list.
kable(filter(jdata_sample, !(Journal %in% jdata_all$Journal)))

Journal	ISSN	if_2014	citable_2014	total_cites_2014	in_sample
Neuron Glia Biology	1740-925X	NA	NA	NA	Sample
Journal of Pure and Applied Microbiology	0973-7510	NA	NA	NA	Sample

Number of journals in and out of sample (removing journals with missing data):

# for comparing in sample to all journals: tag every row of the full list as
# "All Journals", stack the sampled journals underneath, and drop rows with
# no 2014 total-cites value.
jdata_all_long <- jdata_all %>%
  mutate(in_sample = "All Journals") %>%
  bind_rows(jdata_sample) %>%
  filter(!is.na(total_cites_2014))
table(jdata_all_long$in_sample)

## 
## All Journals       Sample 
##         1169          316

# for comparing in sample to out of sample
# Join on Journal only. Without an explicit `by`, left_join() matches on
# every shared column (Journal plus the three numeric 2014 measures), so any
# small numeric difference between the two sheets would silently turn a
# sampled journal into "Out of sample".
jdata_all <- left_join(
  jdata_all,
  jdata_sample %>% select(Journal, in_sample),
  by = "Journal"
)
# Journals with no match in the sample get the complementary label.
jdata_all$in_sample[is.na(jdata_all$in_sample)] <- "Out of sample"
jdata_all <- jdata_all %>% filter(!is.na(Journal))
table(jdata_all$in_sample)

## 
## Out of sample        Sample 
##           853           316

5.1 Distribution of Impact Factor

# Impact factor: sample vs. out of sample.
ggplot(data = jdata_all, mapping = aes(in_sample, if_2014, fill = in_sample)) +
  geom_boxplot() +
  theme_minimal() +
  ggtitle("Distribution of Impact Factor")

# Impact factor: sample vs. all journals.
ggplot(data = jdata_all_long, mapping = aes(in_sample, if_2014, fill = in_sample)) +
  geom_boxplot() +
  theme_minimal() +
  ggtitle("Distribution of Impact Factor")

# Summary statistics of 2014 impact factor by group, ignoring missing values.
jdata_all_long %>%
  group_by(in_sample) %>%
  summarize(
    Min = min(if_2014, na.rm = TRUE),
    Q25 = quantile(if_2014, probs = 0.25, na.rm = TRUE),
    Mean = mean(if_2014, na.rm = TRUE),
    Median = median(if_2014, na.rm = TRUE),
    Q75 = quantile(if_2014, probs = 0.75, na.rm = TRUE),
    Max = max(if_2014, na.rm = TRUE)
  ) %>%
  kable(digits = 2)

in_sample	Min	Q25	Mean	Median	Q75	Max
All Journals	0.00	1.46	3.44	2.50	3.83	41.46
Sample	0.22	2.91	5.42	4.16	5.77	41.46

5.2 Distribution of Citable Items

# Citable items: sample vs. out of sample.
ggplot(data = jdata_all, mapping = aes(in_sample, citable_2014, fill = in_sample)) +
  geom_boxplot() +
  theme_minimal() +
  ggtitle("Distribution of Citable Items")

# Same comparison, keeping only journals with fewer than 20000 citable items.
ggplot(
  data = jdata_all %>% filter(citable_2014 < 20000),
  mapping = aes(in_sample, citable_2014, fill = in_sample)
) +
  geom_boxplot() +
  theme_minimal() +
  ggtitle("Distribution of Citable Items (remove Plos One)")

# Citable items: sample vs. all journals.
ggplot(data = jdata_all_long, mapping = aes(in_sample, citable_2014, fill = in_sample)) +
  geom_boxplot() +
  theme_minimal() +
  ggtitle("Distribution of Citable Items")

# Same comparison, keeping only journals with fewer than 20000 citable items.
ggplot(
  data = jdata_all_long %>% filter(citable_2014 < 20000),
  mapping = aes(in_sample, citable_2014, fill = in_sample)
) +
  geom_boxplot() +
  theme_minimal() +
  ggtitle("Distribution of Citable Items (remove Plos One)")

# Summary statistics of 2014 citable items by group, ignoring missing values.
jdata_all_long %>%
  group_by(in_sample) %>%
  summarize(
    Total = sum(citable_2014, na.rm = TRUE),
    Min = min(citable_2014, na.rm = TRUE),
    Q25 = quantile(citable_2014, probs = 0.25, na.rm = TRUE),
    Mean = mean(citable_2014, na.rm = TRUE),
    Median = median(citable_2014, na.rm = TRUE),
    Q75 = quantile(citable_2014, probs = 0.75, na.rm = TRUE),
    Max = max(citable_2014, na.rm = TRUE)
  ) %>%
  kable(digits = 2)

in_sample	Total	Min	Q25	Mean	Median	Q75	Max
All Journals	219640	0	51.0	187.89	94.0	184.00	30040
Sample	131107	0	148.5	414.90	237.5	356.25	30040

After removing PLoS One:

# Same summary of 2014 citable items, with PLoS One excluded.
jdata_all_long %>%
  filter(Journal != "PLoS One") %>%
  group_by(in_sample) %>%
  summarize(
    Min = min(citable_2014, na.rm = TRUE),
    Q25 = quantile(citable_2014, probs = 0.25, na.rm = TRUE),
    Mean = mean(citable_2014, na.rm = TRUE),
    Median = median(citable_2014, na.rm = TRUE),
    Q75 = quantile(citable_2014, probs = 0.75, na.rm = TRUE),
    Max = max(citable_2014, na.rm = TRUE)
  ) %>%
  kable(digits = 2)

in_sample	Min	Q25	Mean	Median	Q75	Max
All Journals	0	51	162.33	94	184	3931
Sample	0	148	320.85	236	355	3579

5.3 Distribution of Total Citations

# Total citations: sample vs. out of sample.
ggplot(data = jdata_all, mapping = aes(in_sample, total_cites_2014, fill = in_sample)) +
  geom_boxplot() +
  theme_minimal() +
  ggtitle("Distribution of Total Citations")

# Total citations: sample vs. all journals.
ggplot(data = jdata_all_long, mapping = aes(in_sample, total_cites_2014, fill = in_sample)) +
  geom_boxplot() +
  theme_minimal() +
  ggtitle("Distribution of Total Citations")

# Summary statistics of 2014 total citations by group, ignoring missing values.
jdata_all_long %>%
  group_by(in_sample) %>%
  summarize(
    Total = sum(total_cites_2014, na.rm = TRUE),
    Min = min(total_cites_2014, na.rm = TRUE),
    Q25 = quantile(total_cites_2014, probs = 0.25, na.rm = TRUE),
    Mean = mean(total_cites_2014, na.rm = TRUE),
    Median = median(total_cites_2014, na.rm = TRUE),
    Q75 = quantile(total_cites_2014, probs = 0.75, na.rm = TRUE),
    Max = max(total_cites_2014, na.rm = TRUE)
  ) %>%
  kable(digits = 2)

in_sample	Total	Min	Q25	Mean	Median	Q75	Max
All Journals	11132916	5	928	9523.45	2625.0	7394.00	617363
Sample	8136037	384	4517	25746.95	10126.5	22129.25	617363