Load the data

Let’s reload our packages and dataset (no need to reload if you have it saved from part 1).

library(quanteda); library(tidyverse); library(RColorBrewer); library(stm)

dataset <- read_csv("./articles-sample.csv")

#create corpus
myCorpus <- corpus(dataset$Abstract)

Create the dfm (pre-processing)

This time, let’s add a pre-made list of “generic” words to our original stop list. These are common research terms that tell us little about the subject matter itself.

stopWords <- c("can","use","uses","used","using","study","studies","first","second","third","also","across","results","result","resulted","may","however","one","two","three","four","five","among","well","within","many","related","i.e","e.g","find","finding","finds","found","increase","increases","increasing","increased","decreased","decrease","decreases","decreasing","propose","proposal","proposals","proposes","proposed","new","old","differ","differs","different","difference","differences","positive","negative","findings","reports","report","reported","state","states","article","articles","examines","examine","suggest","research","researches","researchers","need","needs","show","shows","association","associations","associated","discuss","discusses","discussed","will","likely","unlikely","paper","method","methods","methodology","compared","specifically","approach","impact","impacts","examine","examined","examines","includes","include","included","including","measure","measures","measured","analysis","analyze","analyses","complete","completes","completed","indicate","indicated","indicates","high","higher","low","lower","follow","follows","following","significant","significance","approach","approaches","approached","model","models","demonstrate","demonstrated","demonstrates","yet","best","worst","better","large","small","larger","smaller","several","few","much","less","given","via","long","short","often","years","along","whether","potential","significantly","influence","influenced","influences","develop","develops","developed","good","bad","based","p","group","groups","effect","affect","affects","effects","sample","samples","relationship","relationships","change","changes","m","k","conclusion","conclusions","present","presents")

dfm <- dfm(myCorpus,
           remove = c(stopwords("english"), stopWords),
           ngrams = 1L,
           stem = FALSE,
           remove_numbers = TRUE, 
           remove_punct = TRUE,
           remove_symbols = TRUE)

vdfm <- dfm_trim(dfm, min_count = 10, min_docfreq = 5)
# min_count: remove words that appear fewer than 10 times in total
# min_docfreq: remove words that appear in fewer than 5 documents
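
If you want to see how much the trimming shrank the vocabulary, a quick check is to compare the dimensions (documents x features) of the two objects; the exact numbers will depend on your data:

dim(dfm)   # before trimming
dim(vdfm)  # after trimming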

Let’s explore the top 50 words.

topfeatures(vdfm, n = 50)
##         data     students       health       social  information 
##         1553          954          871          825          651 
##         time      network       system       design  development 
##          627          616          562          520          507 
##      provide    important      support         care  performance 
##          489          484          475          469          466 
##     patients      factors     learning      control         work 
##          450          439          432          418          413 
##    education participants       number      problem       school 
##          412          408          408          396          391 
## implications      process     multiple      systems         risk 
##          383          380          377          375          358 
##    framework     networks     activity    community        users 
##          354          352          352          351          351 
##        women    knowledge         user        level     outcomes 
##          337          333          331          329          326 
##     security       levels        image     physical      purpose 
##          320          315          313          308          307 
##       public   management      quality       future      spatial 
##          304          302          301          300          298

Let’s plot two word clouds: one with the raw term frequencies and one with TF-IDF.

plot(vdfm,  scale=c(3.5, .75), colors=brewer.pal(8, "Dark2"), 
     random.order = F, rot.per=0.1, max.words=250, main = "Raw Counts")

plot(tfidf(vdfm),  scale=c(3.5, .75), colors=brewer.pal(8, "Dark2"), 
     random.order = F, rot.per=0.1, max.words=250, main = "TF-IDF")

Let’s now create a dendrogram to get an idea of how the words are clustering.

numWords <- 50

wordDfm <- dfm_sort(tfidf(vdfm))    # weight by TF-IDF and sort features by weight
wordDfm <- t(wordDfm)[1:numWords,]  # keep the top numWords words
wordDistMat <- dist(wordDfm)
wordCluster <- hclust(wordDistMat)
plot(wordCluster, xlab="", main="TF-IDF Frequency weighting")
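
If you want to go beyond eyeballing the dendrogram, base R’s cutree() can cut it into a fixed number of groups; the choice of five groups below is arbitrary, not part of the original analysis:

# cut the tree into 5 groups (arbitrary) and list the words in each
wordGroups <- cutree(wordCluster, k = 5)
split(names(wordGroups), wordGroups)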

Correlated Topic Model (CTM) with stm package

For this part (and the next), we’re going to use the stm package.

library(stm)

# use quanteda converter to convert our Dfm
stmdfm <- convert(dfm, to = "stm")

Unlike the topicmodels package, stm has built-in features to help analysts remove sparse terms (via the lower.thresh argument to prepDocuments).

plotRemoved(stmdfm$documents, lower.thresh = seq(1, 100, by = 10))

out <- prepDocuments(stmdfm$documents, stmdfm$vocab, stmdfm$meta, lower.thresh = 5)
## Removing 18741 of 23827 terms (31117 of 182292 tokens) due to frequency 
## Your corpus now has 2879 documents, 5086 terms and 151175 tokens.

This time, let’s consider running a 40-topic model. The code below simply loads a previously fitted model from file; you can fit the model yourself (which will take several minutes) by uncommenting the code.

k <- 40
load(file = "./ctmfit.RData")

#ctmFit <- stm(out$documents, out$vocab, K = k,
#              max.em.its = 150, data = out$meta, init.type = "Spectral", seed = 300)

#save(ctmFit, file = "./ctmfit.RData")
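
As an aside, if you’re not sure 40 is a sensible number of topics, stm’s searchK() function can compare several candidate values of K on diagnostics such as held-out likelihood and semantic coherence. This is not part of the original workflow and takes a long time to run, so it is shown commented out as a sketch:

#kResult <- searchK(out$documents, out$vocab, K = c(20, 30, 40, 50),
#                   data = out$meta, init.type = "Spectral")
#plot(kResult)  # plots the diagnostics by K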

Exploring the results through stm’s visualizations

Let’s explore the topics.

plot(ctmFit, 
         type = "summary", 
         xlim = c(0,.16), 
         n = 5, 
         labeltype = "prob",
         main = "UNCC Research Topics", 
         text.cex = 0.8)

There are a lot of static visualizations we can explore. We’ll use the plot.STM function.

This function provides four plot types, each selected by passing its name to the type parameter.

The four plots are:

  1. summary - plots topic proportions and names.

  2. labels - plots the top words for a specific topic.

  3. perspectives - compares two topics’ words.

  4. hist - a histogram of the expected topic proportions across documents for a topic (a quick sketch follows this list).
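
The hist type is the only one we won’t use later on; as a quick illustration, it can be called like this (topic 25 is just an arbitrary pick here):

plot(ctmFit, 
         type = "hist",  # histogram of this topic's estimated proportions across documents
         topics = 25)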

Let’s examine one of the topics to interpret its meaning, starting with topic 25 and the labels type.

plot(ctmFit, # model results
         type = "labels", # type of plot
         labeltype="prob", # label type for the words
         n = 30, # number of words to show
         topics = 25, # the topic we've selected
         text.cex = 1.2, # this increases the font by 20% (1.2 = 120%)
         width = 50) # this increases the width of the box

This is clearly an education topic. But if we look back at the summary, topic 26 also has related terms.

We can alternatively use a different weighting scheme to focus on the words that are most distinctive for each topic.

For this, we’ll use the frex labeltype. FREX stands for FRequency and EXclusivity, so it highlights words that are both frequently used within the topic and relatively exclusive to it.

plot(ctmFit, 
         type = "labels", 
         labeltype="frex",
         n = 30, 
         topics = 25, 
         text.cex = 1.2, 
         width = 50)

Or we can use the lift weighting, which divides each word’s frequency within the topic by its frequency in other documents, giving more weight to words that are rare elsewhere.

plot(ctmFit, 
         type = "labels", 
         labeltype="lift", 
         n = 30, 
         topics = 25, 
         text.cex = 1.2, 
         width = 50)

There isn’t a single “correct” weighting. Each offers a different perspective, and using them together helps you build a fuller interpretation of a topic.

topicNames <- labelTopics(ctmFit, n = 5)  # top 5 words per topic (prob, frex, lift, score)
topic <- data.frame(
  TopicNumber = 1:k,                                  # topic id
  TopicProportions = colMeans(ctmFit$theta))          # average topic proportion across documents

Visualization Example: Correlated Topic Model

Let’s create a network correlation plot. We’ll use a static network first.

library(igraph); library(visNetwork)

mod.out.corr <- topicCorr(ctmFit, cutoff = .01)
plot(mod.out.corr)

This is a start, but let’s create a better, interactive network using the visNetwork package.

To create that network, we’ll need to format the data for visNetwork by creating two data frames: nodes and edges.

# output links and simplify
links2 <- as.matrix(mod.out.corr$posadj)
net2 <- graph_from_adjacency_matrix(links2, mode = "undirected")
net2 <- igraph::simplify(net2) 

# create the links and nodes
links <- igraph::as_data_frame(net2, what="edges")
nodes <- igraph::as_data_frame(net2, what="vertices")

# set parameters for the network
nodes$shape <- "dot"  
nodes$title <- paste0("Topic ", topic$TopicNumber)
nodes$label <- apply(topicNames$prob, 1, function(x) paste0(x, collapse = " \n ")) # Node label
nodes$size <- (topic$TopicProportions / max(topic$TopicProportions)) * 30
nodes$font <- "18px"
nodes$id <- as.numeric(1:k)

visNetwork(nodes, links, width="100%",  height="800px", main="UNCC Research Topics") %>% 
  visOptions(highlightNearest = list(enabled = TRUE, algorithm = "hierarchical")) %>%
  visNodes(scaling = list(max = 60)) %>%
  visIgraphLayout(smooth = T) %>%
  visInteraction(navigationButtons = T)
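
If you want to share the interactive network outside of R (this isn’t part of the original tutorial), you can assign the pipeline above to an object and save it as a standalone HTML file with htmlwidgets::saveWidget(); the file name below is just an example:

#net <- visNetwork(nodes, links, width = "100%", height = "800px") %>% visIgraphLayout()
#htmlwidgets::saveWidget(net, file = "topic_network.html", selfcontained = TRUE)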

With the correlation network in mind, let’s use the perspectives type to examine two topics.

For example, let’s consider health care topics.

Let’s look at topic 30 (“patients”) and topic 11 (“health, care, older”).

plot(ctmFit, 
         type = "perspectives", 
         labeltype="prob",
         n = 30, 
         topics = c(30, 11), 
         text.cex = 0.8)

This shows the distinctive words versus the “shared” words.

Let’s now consider how the plot looks for two topics that are not similar (i.e., topics that do not share an edge in the correlation network).

We’ll choose topic 25 (“alcohol, drugs”) and topic 32 (“genetics”).

plot(ctmFit, 
         type = "perspectives", 
         labeltype="prob",
         n = 30, 
         topics = c(25, 32), 
         text.cex = 0.8)

What’s interesting is that the only shared word is “data”, which is shared by many topics; recall that it was the most frequent word in the corpus.

Semantic Coherence & Exclusivity

In short, there are two common ways of measuring topic “interpretability”: semantic coherence and exclusivity.

Semantic coherence measures how often a topic’s top words co-occur in the same documents. Larger (less negative) values are better and indicate a more internally consistent topic. Low values sometimes suggest the topic is actually a mixture of sub-topics.

Exclusivity measures how distinctive a topic’s top words are to that topic. Here, larger or smaller is not necessarily better or worse: a high value indicates a unique topic, while a low value indicates a broad one.

Let’s plot this using the topicQuality function.

topicQuality(ctmFit, out$documents)
##  [1] -132.39103 -156.57858  -93.41409 -122.58539 -115.06666  -98.92992
##  [7] -157.67238 -151.81079 -102.43084 -121.86327  -93.81976  -99.08010
## [13] -106.99197 -109.75569 -135.82653  -82.16096 -167.90385  -83.07698
## [19] -153.07141 -110.48911 -118.89850 -134.14551 -161.56335  -94.75840
## [25] -140.59263 -127.18362 -131.25581 -171.51092 -124.23067 -133.51247
## [31] -127.50723 -189.53865  -97.26029 -140.41097  -94.68884  -81.61411
## [37] -117.21629 -230.81613 -122.59603 -125.39035
##  [1] 9.821210 9.813169 9.805658 9.739507 9.922822 9.890600 9.918545
##  [8] 9.650631 9.781700 9.955408 9.886880 9.914400 9.907805 9.844354
## [15] 9.692662 9.939541 9.837876 9.886120 9.973505 9.873505 9.669783
## [22] 9.886677 9.780563 9.946365 9.903660 9.671637 9.865325 9.743770
## [29] 9.880882 9.890943 9.845279 9.787233 9.889740 9.730730 9.823780
## [36] 9.909686 9.801661 9.886913 9.828284 9.824010
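
topicQuality() plots these values, but if you’d rather work with the numbers directly (say, to sort topics by coherence), stm also provides semanticCoherence() and exclusivity(). A minimal sketch using the objects from above:

topicDiag <- data.frame(
  topic = 1:k,
  coherence = semanticCoherence(ctmFit, out$documents),
  exclusivity = exclusivity(ctmFit))

head(topicDiag[order(topicDiag$coherence), ])  # the least coherent topics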

Session Info

sessionInfo()
## R version 3.4.2 (2017-09-28)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.3 LTS
## 
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] visNetwork_2.0.1   igraph_1.1.2       stm_1.3.0         
##  [4] RColorBrewer_1.1-2 dplyr_0.7.4        purrr_0.2.4       
##  [7] readr_1.1.1        tidyr_0.7.2        tibble_1.3.4      
## [10] ggplot2_2.2.1.9000 tidyverse_1.1.1    quanteda_0.99     
## 
## loaded via a namespace (and not attached):
##  [1] wordcloud_2.5       slam_0.1-40         reshape2_1.4.2     
##  [4] haven_1.1.0         lattice_0.20-35     colorspace_1.3-2   
##  [7] htmltools_0.3.6     yaml_2.1.14         rlang_0.1.4        
## [10] foreign_0.8-69      glue_1.2.0.9000     modelr_0.1.1       
## [13] readxl_1.0.0        bindrcpp_0.2        matrixStats_0.52.2 
## [16] bindr_0.1           plyr_1.8.4          stringr_1.2.0      
## [19] munsell_0.4.3       gtable_0.2.0        cellranger_1.1.0   
## [22] rvest_0.3.2         htmlwidgets_0.9     psych_1.7.5        
## [25] evaluate_0.10.1     knitr_1.17          forcats_0.2.0      
## [28] parallel_3.4.2      broom_0.4.2         Rcpp_0.12.13       
## [31] scales_0.5.0.9000   backports_1.1.0     RcppParallel_4.3.20
## [34] jsonlite_1.5        fastmatch_1.1-0     mnormt_1.5-5       
## [37] hms_0.3             digest_0.6.12       stringi_1.1.5      
## [40] grid_3.4.2          rprojroot_1.2       tools_3.4.2        
## [43] magrittr_1.5        lazyeval_0.2.1      pkgconfig_2.0.1    
## [46] Matrix_1.2-11       data.table_1.10.4-3 xml2_1.1.1         
## [49] lubridate_1.7.1     assertthat_0.2.0    rmarkdown_1.8      
## [52] httr_1.3.1          R6_2.2.2            nlme_3.1-131       
## [55] compiler_3.4.2