LDA Topic Modeling

© 2014 Wouter van Atteveldt, license: [CC-BY-SA]

Latent Dirichlet Allocation (LDA) is a topic modeling algorithm that automatically clusters words that form a cohesive pattern of co-occurrence. LDA assumes a 'generative model', in which a text is generated by first selecting a mixture of topics, and then drawing each word from one of those topics. Thus, each document can contain multiple topics and each word can occur in multiple topics.
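To make this generative model concrete, the sketch below simulates a single document. It is purely illustrative: the vocabulary, the number of topics, and all parameter values are invented, and the rdirichlet helper is defined inline to avoid extra dependencies.

# illustrative sketch of the LDA generative process (all values invented)
rdirichlet = function(n, alpha) {   # simple Dirichlet sampler based on gamma draws
    x = matrix(rgamma(n * length(alpha), alpha), nrow=n, byrow=TRUE)
    x / rowSums(x)
}
vocabulary = c("premie", "schade", "service", "website", "reclame", "auto")
k = 2
phi = rdirichlet(k, rep(.1, length(vocabulary)))   # per-topic word distributions
theta = rdirichlet(1, rep(.5, k))                  # topic mixture for one document
z = sample(k, size=10, replace=TRUE, prob=theta)   # draw a topic for each word
sapply(z, function(topic) sample(vocabulary, 1, prob=phi[topic, ]))  # draw the words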

Creating a topic model

Topic models are constructed directly from a document-term matrix using the topicmodels package. As before, we use the create_matrix function from the RTextTools package to create this matrix from a set of customer reviews. Note that we need to remove empty rows and columns (e.g. empty reviews), as LDA cannot deal with documents that contain no words. The achmea.csv file can be downloaded from github.

library(RTextTools)
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## 
## The following object is masked from 'package:base':
## 
##     backsolve
library(slam)
d = read.csv("achmea.csv")
m = create_matrix(d$CONTENT, language="dutch", removeStopwords=T)
m = m[row_sums(m) > 0,col_sums(m) > 0]
dim(m)
## [1] 21229 29022

Now we can fit the topic model, say with k=10 topics and alpha=.5. (A smaller alpha makes the per-document topic distributions more concentrated, so each document is dominated by fewer topics.)

library(topicmodels)
fit = LDA(m, k=10, method="Gibbs", control=list(iter=500, alpha=.5))
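One quick way to see the effect of alpha is to inspect the per-document topic distributions returned by the posterior function, as in this sketch:

theta = posterior(fit)$topics     # documents x topics matrix of topic proportions
summary(apply(theta, 1, max))     # how dominant is each document's top topic?

Refitting with a lower alpha should shift this summary towards 1, i.e. towards documents dominated by a single topic.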

We can visually inspect the top words per topic using the terms function:

terms(fit, 10)
##       Topic 1           Topic 2               Topic 3  Topic 4          
##  [1,] "zorgverzekering" "centraalbeheer"      "0900"   "httpt"          
##  [2,] "fbto"            "fbtowebcare"         "fbto"   "centraalbeheer" 
##  [3,] "2014"            "evenapeldoornbellen" "mensen" "autoverzekering"
##  [4,] "echt"            "jullie"              "ver"    "gratis"         
##  [5,] "ditzo"           "httpt"               "echt"   "afsluiten"      
##  [6,] "premies"         "fbto"                "gaan"   "keuze"          
##  [7,] "zorgpremie"      "weer"                "weten"  "via"            
##  [8,] "vergelijken"     "bedankt"             "helpen" "vraagt"         
##  [9,] "jaar"            "lekker"              "vinden" "scherpe"        
## [10,] "zorgve"          "nieuwe"              "paar"   "autocheck"      
##       Topic 5       Topic 6             Topic 7     Topic 8     
##  [1,] "fbto"        "centraal"          "apeldoorn" "beheer"    
##  [2,] "wel"         "beheer"            "even"      "centraal"  
##  [3,] "verzekering" "achmea"            "bellen"    "the"       
##  [4,] "jullie"      "httpt"             "httpt"     "commercial"
##  [5,] "jaar"        "2014"              "volkert"   "car"       
##  [6,] "fbtowebcare" "midwintermarathon" "nieuwe"    "and"       
##  [7,] "eigen"       "week"              "reclame"   "achmea"    
##  [8,] "premie"      "uitslag"           "winter"    "banned"    
##  [9,] "verzekerd"   "schreef"           "echte"     "driving"   
## [10,] "per"         "online"            "youtube"   "self"      
##       Topic 9        Topic 10       
##  [1,] "httpt"        "goed"         
##  [2,] "auto"         "snelle"       
##  [3,] "fbto"         "goede"        
##  [4,] "nieuwe"       "snel"         
##  [5,] "zelfrijdende" "afhandeling"  
##  [6,] "beheer"       "schade"       
##  [7,] "centraal"     "via"          
##  [8,] "commercial"   "zeer"         
##  [9,] "via"          "verzekeringen"
## [10,] "achmea"       "service"
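Analogously, the topics function from the same package gives the single most likely topic per document, for example:

topics(fit)[1:10]    # most likely topic for the first 10 documents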

And let's make a word cloud of the first topic:

library(RColorBrewer)
library(wordcloud)
x = posterior(fit)$terms[1,]        # word probabilities for the first topic
x = sort(x, decreasing=T)[1:100]    # keep the top 100 words
x = x[!is.na(x)]                    # drop NAs in case the topic has fewer than 100 words
pal <- brewer.pal(6,"YlGnBu")
wordcloud(names(x), x, scale=c(6,.5), min.freq=1, max.words=Inf, random.order=FALSE, rot.per=.15, colors=pal)
## Warning: zorgverzekering could not be fit on page. It will not be plotted.

(plot: word cloud of the top 100 words in topic 1)
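The warning means that the largest word did not fit on the plotting device at the requested size range. Reducing the first element of the scale parameter (the size of the largest word) usually avoids this, e.g.:

wordcloud(names(x), x, scale=c(4,.5), min.freq=1, max.words=Inf, random.order=FALSE, rot.per=.15, colors=pal)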

As in the 'Corpus Analysis' howto, we can define a function that computes term statistics, which we can then use to filter the matrix down to the most informative words:

library(tm)
## Loading required package: NLP
term.statistics <- function(dtm) {
    dtm = dtm[row_sums(dtm) > 0, col_sums(dtm) > 0]   # get rid of empty rows/columns
    vocabulary = colnames(dtm)
    data.frame(term = vocabulary,
               characters = nchar(vocabulary),              # term length
               number = grepl("[0-9]", vocabulary),         # does the term contain a digit?
               nonalpha = grepl("\\W", vocabulary),         # does the term contain a non-word character?
               termfreq = col_sums(dtm),                    # total term frequency
               docfreq = col_sums(dtm > 0),                 # number of documents containing the term
               reldocfreq = col_sums(dtm > 0) / nDocs(dtm), # fraction of documents containing the term
               # mean relative term frequency weighted by inverse document frequency
               tfidf = tapply(dtm$v/row_sums(dtm)[dtm$i], dtm$j, mean) * log2(nDocs(dtm)/col_sums(dtm > 0)))
}

termstats = term.statistics(m)   # named 'termstats' to avoid shadowing the terms function
words = termstats$term[order(-termstats$tfidf)[1:10000]]  # keep the 10,000 terms with the highest tf-idf
m_filtered = m[, colnames(m) %in% words]
m_filtered = m_filtered[row_sums(m_filtered) > 0,col_sums(m_filtered) > 0]

fit = LDA(m_filtered, k=10, method="Gibbs", control=list(iter=500, alpha=.5))
terms(fit, 10)
##       Topic 1           Topic 2           Topic 3          Topic 4       
##  [1,] "nee"             "snelheid"        "rtalsjehetziet" "peterrdev"   
##  [2,] "utrecht"         "afwikkeling"     "caravan"        "cos6ypex2e7s"
##  [3,] "ziyech"          "bereikbaarheid"  "uitzetten"      "markhoekx"   
##  [4,] "nemo"            "duidelijkheid"   "tarieven"       "bedenken"    
##  [5,] "goedemorgen"     "vlotte"          "verwerking"     "hypotheken"  
##  [6,] "thanks"          "afhandelen"      "aanuit"         "eindhoven"   
##  [7,] "briljantreclame" "correcte"        "hond"           "flexibel"    
##  [8,] "nvt"             "eenvoudig"       "bellenäò"       "redelijke"   
##  [9,] "commercieel"     "vriendelijkheid" "betalingen"     "makkelijker" 
## [10,] "sotsji"          "nakomen"         "maat"           "nee"         
##       Topic 5               Topic 6          Topic 7          
##  [1,] "icoins"              "prima"          "peterrdev"      
##  [2,] "punten"              "redelijk"       "jorid"          
##  [3,] "klantvriendelijkste" "muts"           "uitbetaling"    
##  [4,] "beoordeling"         "helder"         "gezeur"         
##  [5,] "moment"              "desk"           "fijne"          
##  [6,] "negatieve"           "kut"            "guuskuijer"     
##  [7,] "vlot"                "beschikbaar"    "verzekeringsmap"
##  [8,] "cocnrn3f1sap"        "eerlijke"       "overzichtelijke"
##  [9,] "verbeterpunten"      "accountmanager" "woont"          
## [10,] "verstuurd"           "justronaldd"    "cba"            
##       Topic 8         Topic 9           Topic 10          
##  [1,] "uitzetten"     "moment"          "prijs"           
##  [2,] "afspraak"      "correcte"        "klantvriendelijk"
##  [3,] "duur"          "korte"           "kwaliteit"       
##  [4,] "prima"         "sorry"           "verhouding"      
##  [5,] "attent"        "nette"           "behulpzaam"      
##  [6,] "werknemer"     "onlinemarketeer" "bereikbaarheid"  
##  [7,] "koersen"       "terugkoppeling"  "betrouwbaar"     
##  [8,] "correcte"      "harte"           "gemakkelijk"     
##  [9,] "noemen"        "verzonden"       "heldere"         
## [10,] "flexibiliteit" "virupa"          "carolien"

Creating a topic model per sentiment category

We can also make a topic model of a subset of the data, for example of all the negative reviews:

neg = d$CONTENT[!is.na(d$SENTIMENT) & d$SENTIMENT == -1]
m_neg = create_matrix(neg, removeStopwords=T, language="dutch")
m_neg = m_neg[row_sums(m_neg) > 0,col_sums(m_neg) > 0]
fit = LDA(m_neg, k=10, method="Gibbs", control=list(iter=500, alpha=.5))
terms(fit, 10)
##       Topic 1       Topic 2      Topic 3   Topic 4          Topic 5      
##  [1,] "mail"        "module"     "eigen"   "communicatie"   "schade"     
##  [2,] "vraag"       "maanden"    "risico"  "goed"           "fbto"       
##  [3,] "antwoord"    "lang"       "betalen" "zeer"           "keer"       
##  [4,] "email"       "modules"    "erg"     "fbto"           "wel"        
##  [5,] "telefonisch" "wachttijd"  "bedrag"  "slecht"         "goed"       
##  [6,] "kreeg"       "wel"        "per"     "slechte"        "steeds"     
##  [7,] "contact"     "weer"       "wel"     "afspraken"      "vergoed"    
##  [8,] "ontvangen"   "jaar"       "euro"    "telefonische"   "zeker"      
##  [9,] "per"         "per"        "jaar"    "bereikbaarheid" "afhandeling"
## [10,] "reactie"     "vergoeding" "betaald" "personeel"      "gebeld"     
##       Topic 6           Topic 7    Topic 8           Topic 9        
##  [1,] "alleen"          "via"      "fbto"            "klant"        
##  [2,] "fbto"            "website"  "verzekering"     "klanten"      
##  [3,] "graag"           "wel"      "wij"             "verzekeringen"
##  [4,] "duidelijk"       "soms"     "reisverzekering" "jullie"       
##  [5,] "klant"           "beter"    "jaar"            "fbto"         
##  [6,] "zorgverzekering" "alle"     "moeten"          "nieuwe"       
##  [7,] "declaraties"     "site"     "wel"             "jaar"         
##  [8,] "declaratie"      "mensen"   "auto"            "korting"      
##  [9,] "kosten"          "moeten"   "gaan"            "jaren"        
## [10,] "overzicht"       "internet" "onze"            "jammer"       
##       Topic 10         
##  [1,] "premie"         
##  [2,] "prijs"          
##  [3,] "fbto"           
##  [4,] "verzekering"    
##  [5,] "erg"            
##  [6,] "duur"           
##  [7,] "verzekerd"      
##  [8,] "autoverzekering"
##  [9,] "betere"         
## [10,] "verzekeringen"

Extracting the topics per document

If you want to, for example, correlate topics with sentiment or add topics as features for machine learning, it is useful to extract how strongly each document is associated with each topic. The fit object contains the needed information in its wordassignments slot, which can be cast into a matrix of per-document topic counts:

library(reshape2)
# wordassignments holds, for each document (i) and term (j), the assigned topic (v)
assignments = data.frame(i=fit@wordassignments$i, j=fit@wordassignments$j, v=fit@wordassignments$v)
# count per document (rows) the number of terms assigned to each topic (columns)
docsums = acast(assignments, i ~ v, value.var='j', fun.aggregate=length)
dim(docsums)
## [1] 2228   10
head(docsums)
##    1 2 3 4  5  6 7 8 9 10
## 1  0 0 0 0  0  0 3 1 0  2
## 2  0 0 0 0  0  0 0 0 0  1
## 3  0 0 1 0 13  0 0 4 0  8
## 4  1 0 0 0  0  0 4 2 8  0
## 5 24 0 0 6  9  2 0 0 0  0
## 6  0 0 1 1  1 13 7 3 0  0
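Since documents differ in length, it is often useful to normalize these counts to proportions before correlating them with other variables. A minimal sketch, using the docsums matrix computed above:

props = docsums / rowSums(docsums)   # proportion of each document's words assigned to each topic
round(head(props), 2)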