© 2014 Wouter van Atteveldt, license: [CC-BY-SA]
Latent Dirichlet Allocation is a topic modeling algorithm that automatically clusters words that form a cohesive pattern of co-occurrence. LDA assumes a 'generative model', where a text is generated by selecting one or more topics, and then drawing words from each of those topics. Thus, each document has multiple topics and each word can occur in multiple topics.
Topic models are constructed directly from a term-document matrix using the topicmodels
package.
As before, we use the create_matrix
function from the RTextTools
package to create the term-document matrix from a set of customer reviews.
Note that we need to remove empty rows or columns (e.g. empty reviews).
The achmea.csv
file can be downloaded from github.
# Load RTextTools for create_matrix (attaches SparseM as a side effect)
library(RTextTools)
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
##
## The following object is masked from 'package:base':
##
## backsolve
# slam provides row_sums/col_sums for sparse (simple triplet) matrices
library(slam)
# Read the customer reviews; d$CONTENT holds the review text
d = read.csv("achmea.csv")
# Build the document-term matrix, removing Dutch stop words.
# Use TRUE rather than T (T can be reassigned) and drop the stray trailing comma.
m = create_matrix(d$CONTENT, language="dutch", removeStopwords=TRUE)
# Drop empty documents and terms: LDA cannot handle all-zero rows/columns
m = m[row_sums(m) > 0,col_sums(m) > 0]
dim(m)
## [1] 21229 29022
Now, we can fit the topic model, say with k=10 topics and alpha=.5. (A smaller alpha means that topics are more 'concentrated' in the documents)
# Fit an LDA topic model with k=10 topics via Gibbs sampling (500 iterations).
# alpha=.5 (below the default) concentrates each document on fewer topics.
library(topicmodels)
fit = LDA(m, k=10, method="Gibbs", control=list(iter=500, alpha=.5))
We can visually inspect the words per topics using the terms
function:
terms(fit, 10)
## Topic 1 Topic 2 Topic 3 Topic 4
## [1,] "zorgverzekering" "centraalbeheer" "0900" "httpt"
## [2,] "fbto" "fbtowebcare" "fbto" "centraalbeheer"
## [3,] "2014" "evenapeldoornbellen" "mensen" "autoverzekering"
## [4,] "echt" "jullie" "ver" "gratis"
## [5,] "ditzo" "httpt" "echt" "afsluiten"
## [6,] "premies" "fbto" "gaan" "keuze"
## [7,] "zorgpremie" "weer" "weten" "via"
## [8,] "vergelijken" "bedankt" "helpen" "vraagt"
## [9,] "jaar" "lekker" "vinden" "scherpe"
## [10,] "zorgve" "nieuwe" "paar" "autocheck"
## Topic 5 Topic 6 Topic 7 Topic 8
## [1,] "fbto" "centraal" "apeldoorn" "beheer"
## [2,] "wel" "beheer" "even" "centraal"
## [3,] "verzekering" "achmea" "bellen" "the"
## [4,] "jullie" "httpt" "httpt" "commercial"
## [5,] "jaar" "2014" "volkert" "car"
## [6,] "fbtowebcare" "midwintermarathon" "nieuwe" "and"
## [7,] "eigen" "week" "reclame" "achmea"
## [8,] "premie" "uitslag" "winter" "banned"
## [9,] "verzekerd" "schreef" "echte" "driving"
## [10,] "per" "online" "youtube" "self"
## Topic 9 Topic 10
## [1,] "httpt" "goed"
## [2,] "auto" "snelle"
## [3,] "fbto" "goede"
## [4,] "nieuwe" "snel"
## [5,] "zelfrijdende" "afhandeling"
## [6,] "beheer" "schade"
## [7,] "centraal" "via"
## [8,] "commercial" "zeer"
## [9,] "via" "verzekeringen"
## [10,] "achmea" "service"
And let's make a word cloud of the first topic:
library(RColorBrewer)
library(wordcloud)
# Term distribution (posterior word probabilities) for topic 1
x = posterior(fit)$terms[1,]
# Keep the 100 highest-probability terms; use TRUE, not T (T can be reassigned)
x = sort(x, decreasing=TRUE)[1:100]
# Indexing [1:100] yields NAs when fewer than 100 terms exist; drop them
x = x[!is.na(x)]
pal <- brewer.pal(6,"YlGnBu")
wordcloud(names(x), x, scale=c(6,.5), min.freq=1, max.words=Inf, random.order=FALSE, rot.per=.15, colors=pal)
## Warning: zorgverzekering could not be fit on page. It will not be plotted.
As in the 'Corpus Analysis' howto, we can define a function to compute the term statistics to filter on informative words:
library(tm)
## Loading required package: NLP
# Compute per-term statistics for a sparse (slam) document-term matrix.
# Returns a data.frame with one row per term:
#   term       - the term itself
#   characters - term length in characters
#   number     - does the term contain a digit?
#   nonalpha   - does the term contain a non-word character?
#   termfreq   - total occurrences across the corpus
#   docfreq    - number of documents containing the term
#   reldocfreq - docfreq as a fraction of all documents
#   tfidf      - mean relative within-document frequency * log2 inverse doc freq
term.statistics <- function(dtm) {
  dtm = dtm[row_sums(dtm) > 0, col_sums(dtm) > 0]  # get rid of empty rows/columns
  vocabulary = colnames(dtm)
  # Hoisted: the per-term document frequency was computed three times
  docfreq = col_sums(dtm > 0)
  data.frame(term = vocabulary,
             characters = nchar(vocabulary),
             number = grepl("[0-9]", vocabulary),
             nonalpha = grepl("\\W", vocabulary),
             termfreq = col_sums(dtm),
             docfreq = docfreq,
             reldocfreq = docfreq / nDocs(dtm),
             # dtm$v/row_sums(dtm)[dtm$i] walks the triplet representation:
             # relative frequency of each nonzero cell, averaged per term (dtm$j)
             tfidf = tapply(dtm$v/row_sums(dtm)[dtm$i], dtm$j, mean) * log2(nDocs(dtm)/docfreq))
}
# Compute term statistics and keep the 10,000 most informative (highest tf-idf)
# terms. Named 'termstats' rather than 'terms' so the data.frame does not
# shadow the terms() generic from topicmodels used below.
termstats = term.statistics(m)
words = termstats$term[order(-termstats$tfidf)[1:10000]]
m_filtered = m[, colnames(m) %in% words]
# Filtering can leave empty documents; drop all-zero rows/columns before refitting
m_filtered = m_filtered[row_sums(m_filtered) > 0, col_sums(m_filtered) > 0]
fit = LDA(m_filtered, k=10, method="Gibbs", control=list(iter=500, alpha=.5))
terms(fit, 10)
## Topic 1 Topic 2 Topic 3 Topic 4
## [1,] "nee" "snelheid" "rtalsjehetziet" "peterrdev"
## [2,] "utrecht" "afwikkeling" "caravan" "cos6ypex2e7s"
## [3,] "ziyech" "bereikbaarheid" "uitzetten" "markhoekx"
## [4,] "nemo" "duidelijkheid" "tarieven" "bedenken"
## [5,] "goedemorgen" "vlotte" "verwerking" "hypotheken"
## [6,] "thanks" "afhandelen" "aanuit" "eindhoven"
## [7,] "briljantreclame" "correcte" "hond" "flexibel"
## [8,] "nvt" "eenvoudig" "bellenäò" "redelijke"
## [9,] "commercieel" "vriendelijkheid" "betalingen" "makkelijker"
## [10,] "sotsji" "nakomen" "maat" "nee"
## Topic 5 Topic 6 Topic 7
## [1,] "icoins" "prima" "peterrdev"
## [2,] "punten" "redelijk" "jorid"
## [3,] "klantvriendelijkste" "muts" "uitbetaling"
## [4,] "beoordeling" "helder" "gezeur"
## [5,] "moment" "desk" "fijne"
## [6,] "negatieve" "kut" "guuskuijer"
## [7,] "vlot" "beschikbaar" "verzekeringsmap"
## [8,] "cocnrn3f1sap" "eerlijke" "overzichtelijke"
## [9,] "verbeterpunten" "accountmanager" "woont"
## [10,] "verstuurd" "justronaldd" "cba"
## Topic 8 Topic 9 Topic 10
## [1,] "uitzetten" "moment" "prijs"
## [2,] "afspraak" "correcte" "klantvriendelijk"
## [3,] "duur" "korte" "kwaliteit"
## [4,] "prima" "sorry" "verhouding"
## [5,] "attent" "nette" "behulpzaam"
## [6,] "werknemer" "onlinemarketeer" "bereikbaarheid"
## [7,] "koersen" "terugkoppeling" "betrouwbaar"
## [8,] "correcte" "harte" "gemakkelijk"
## [9,] "noemen" "verzonden" "heldere"
## [10,] "flexibiliteit" "virupa" "carolien"
We can also make a topic model of a subset of the data, for example of all the negative reviews:
# Select reviews explicitly labeled negative (SENTIMENT == -1);
# the is.na guard keeps NA comparisons from selecting rows
neg = d$CONTENT[!is.na(d$SENTIMENT) & d$SENTIMENT == -1]
# Use TRUE rather than T (T can be reassigned)
m_neg = create_matrix(neg, removeStopwords=TRUE, language="dutch")
m_neg = m_neg[row_sums(m_neg) > 0, col_sums(m_neg) > 0]
fit = LDA(m_neg, k=10, method="Gibbs", control=list(iter=500, alpha=.5))
terms(fit, 10)
## Topic 1 Topic 2 Topic 3 Topic 4 Topic 5
## [1,] "mail" "module" "eigen" "communicatie" "schade"
## [2,] "vraag" "maanden" "risico" "goed" "fbto"
## [3,] "antwoord" "lang" "betalen" "zeer" "keer"
## [4,] "email" "modules" "erg" "fbto" "wel"
## [5,] "telefonisch" "wachttijd" "bedrag" "slecht" "goed"
## [6,] "kreeg" "wel" "per" "slechte" "steeds"
## [7,] "contact" "weer" "wel" "afspraken" "vergoed"
## [8,] "ontvangen" "jaar" "euro" "telefonische" "zeker"
## [9,] "per" "per" "jaar" "bereikbaarheid" "afhandeling"
## [10,] "reactie" "vergoeding" "betaald" "personeel" "gebeld"
## Topic 6 Topic 7 Topic 8 Topic 9
## [1,] "alleen" "via" "fbto" "klant"
## [2,] "fbto" "website" "verzekering" "klanten"
## [3,] "graag" "wel" "wij" "verzekeringen"
## [4,] "duidelijk" "soms" "reisverzekering" "jullie"
## [5,] "klant" "beter" "jaar" "fbto"
## [6,] "zorgverzekering" "alle" "moeten" "nieuwe"
## [7,] "declaraties" "site" "wel" "jaar"
## [8,] "declaratie" "mensen" "auto" "korting"
## [9,] "kosten" "moeten" "gaan" "jaren"
## [10,] "overzicht" "internet" "onze" "jammer"
## Topic 10
## [1,] "premie"
## [2,] "prijs"
## [3,] "fbto"
## [4,] "verzekering"
## [5,] "erg"
## [6,] "duur"
## [7,] "verzekerd"
## [8,] "autoverzekering"
## [9,] "betere"
## [10,] "verzekeringen"
If you want to e.g. correlate topics with sentiment or add the topics as features to the machine learning, it is useful to extract which documents belong to which topic.
The fit
object contains the needed information, which can be cast into a matrix:
library(reshape2)
# fit@wordassignments is a sparse triplet structure:
# i = document index, j = term index, v = topic assigned to that token
assignments = data.frame(i=fit@wordassignments$i, j=fit@wordassignments$j, v=fit@wordassignments$v)
# Cast to a documents-by-topics matrix: cell = number of tokens in document i
# assigned to topic v (length counts the triplet entries per i/v pair)
docsums = acast(assignments, i ~ v, value.var='j', fun.aggregate=length)
dim(docsums)
## [1] 2228 10
head(docsums)
## 1 2 3 4 5 6 7 8 9 10
## 1 0 0 0 0 0 0 3 1 0 2
## 2 0 0 0 0 0 0 0 0 0 1
## 3 0 0 1 0 13 0 0 4 0 8
## 4 1 0 0 0 0 0 4 2 8 0
## 5 24 0 0 6 9 2 0 0 0 0
## 6 0 0 1 1 1 13 7 3 0 0