R for Text Analysis

Wouter van Atteveldt & Kasper Welbers
ICA 2016

Case: State of the Union

# Load the State of the Union example data set
# (presumably provides sotu.meta and sotu.tokens, which are used below).
data(sotu)
# Count the number of ids per speaker (the headline column) and render the
# resulting Speaker/Freq table.
kable(
  aggregate(
    cbind(Freq = sotu.meta$id),
    by = list(Speaker = sotu.meta$headline),
    FUN = length
  )
)
| Speaker        | Freq |
|:---------------|-----:|
| Barack Obama   |  554 |
| George W. Bush |  536 |
# Preview the first two rows of the token table.
kable(head(sotu.tokens, n = 2))
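| word | sentence | pos | lemma | offset | aid       | id | pos1 | freq |
|:-----|---------:|:----|:------|-------:|----------:|---:|:-----|-----:|
| It   |        1 | PRP | it    |      0 | 111541965 |  1 | O    |    1 |
| is   |        1 | VBZ | be    |      3 | 111541965 |  2 | V    |    1 |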

Contrast: Bush vs Obama

# Build a document-term matrix from the lemmas of tokens whose coarse POS tag
# (pos1) is "N", "A" or "M", keyed by the aid column.
dtm <- with(
  subset(sotu.tokens, pos1 %in% c("N", "A", "M")),
  dtm.create(aid, lemma)
)

# Ids of the rows whose headline is "Barack Obama"; these are the rows that
# are contrasted against the rest of the corpus.
obama <- with(sotu.meta, id[headline == "Barack Obama"])
cmp <- corpora.compare(dtm, select.rows = obama)

# Derive one colour per term:
#   h: log over-representation rescaled onto the range 1 -> .6666
#      (in HSV, hue 1 is red and hue 2/3 is blue)
#   s: square root of the chi value rescaled onto .25 -> 1
# The brightness grows together with the saturation.
h <- rescale(log(cmp$over), c(1, .6666))
s <- rescale(sqrt(cmp$chi), c(.25, 1))
cmp[["col"]] <- hsv(h = h, s = s, v = .33 + .67 * s)

kable(head(cmp))
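| term        | termfreq.x | termfreq.y | termfreq | relfreq.x | relfreq.y | over      | chi       | col     |
|:------------|-----------:|-----------:|---------:|----------:|----------:|----------:|----------:|:--------|
| ability     |          7 |          7 |       14 | 0.0006252 | 0.0005868 | 1.0242109 | 0.0140887 | #7D5F80 |
| able        |         16 |         12 |       28 | 0.0014291 | 0.0010060 | 1.2109372 | 0.8550324 | #815E8B |
| abuse       |          3 |          2 |        5 | 0.0002680 | 0.0001677 | 1.0858934 | 0.2687463 | #805F86 |
| access      |         12 |          9 |       21 | 0.0010718 | 0.0007545 | 1.1808800 | 0.6410800 | #815F89 |
| account     |          4 |         18 |       22 | 0.0003573 | 0.0015089 | 0.5409763 | 8.0594275 | #A5578C |
| accountable |          7 |          3 |       10 | 0.0006252 | 0.0002515 | 1.2986328 | 1.8663069 | #835D91 |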

Contrast: Bush vs Obama

# Sort the comparison table by descending total term frequency so that head()
# below keeps the most frequent terms.
cmp <- arrange(cmp, -termfreq)

# Plot the 130 most frequent terms: horizontal position is the log
# over-representation, word size follows the term frequency, and each word is
# drawn in the colour stored in the col column.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
with(
  head(cmp, 130),
  plotWords(
    x = log(over), words = term, wordfreq = termfreq,
    random.y = TRUE, col = col, scale = 5
  )
)

# Label both ends of the x axis (rotated 90 degrees) and name the axis.
text(-2, 0, "Bush", srt = 90, col = "red", cex = 2)
text(2, 0, "Obama", srt = 90, col = "blue", cex = 2)
title(xlab = "Overrepresentation")