Wouter van Atteveldt
Glasgow Text Analysis, 2016-11-17
10:30 - 12:00
13:30 - 15:00
15:15 - 17:00
docker run --name corenlp -dp 9000:9000 chilland/corenlp-docker
docker run --name nlpipe --link corenlp:corenlp -e "CORENLP_HOST=http://corenlp:9000" -dp 5001:5001 vanatteveldt/nlpipe
devtools::install_github("vanatteveldt/nlpiper")
library(nlpiper)
process("test_upper", "test")
[1] "TEST"
library(nlpiper)
text = "Donald trump was elected president of the United States"
process("corenlp_lemmatize", text, format="csv")
# choose one:
download.file("http://i.amcat.nl/tokens.rds", "tokens.rds")
download.file("http://i.amcat.nl/tokens_full.rds", "tokens.rds")
download.file("http://i.amcat.nl/tokens_sample.rds", "tokens.rds")
tokens = readRDS("tokens.rds")
Head(tokens)
id | sentence | offset | word | lemma | POS | POS1 | ner | |
---|---|---|---|---|---|---|---|---|
78757 | 162317736 | 1 | 0 | New | New | NNP | R | O |
78758 | 162317736 | 1 | 4 | Technologies | Technologies | NNP | R | O |
78759 | 162317736 | 1 | 17 | Give | give | VB | V | O |
78760 | 162317736 | 1 | 22 | Government | Government | NNP | R | O |
78761 | 162317736 | 1 | 33 | Ample | Ample | NNP | R | O |
78762 | 162317736 | 1 | 39 | Means | Means | NNP | R | O |
The corpustools package
devtools::install_github("kasperwelbers/corpus-tools")
library(quanteda)
dfm = dfm(c("a text", "and another text"))
dtm = convert(dfm, "tm")
class(dtm)
[1] "DocumentTermMatrix" "simple_triplet_matrix"
library(corpustools)
dfm = dtm.to.dfm(dtm)
class(dfm)
[1] "dfmSparse"
attr(,"package")
[1] "quanteda"
dtm = dtm.create(tokens$id, tokens$lemma)
dtm.names = with(subset(tokens, POS1=="R"),
dtm.create(id, lemma))
dtm.adj = with(subset(tokens, POS1=="G"),
dtm.create(id, lemma))
dtm.persons = with(subset(tokens, ner=="PERSON"),
dtm.create(id, lemma))
stats = term.statistics(dtm.persons)
stats = plyr::arrange(stats, -termfreq)
Head(stats)
term | characters | number | nonalpha | termfreq | docfreq | reldocfreq | tfidf |
---|---|---|---|---|---|---|---|
Trump | 5 | FALSE | FALSE | 5237 | 674 | 0.6884576 | 0.12922905 |
Clinton | 7 | FALSE | FALSE | 3357 | 582 | 0.5944842 | 0.12456663 |
Donald | 6 | FALSE | FALSE | 1087 | 643 | 0.6567926 | 0.04239862 |
Obama | 5 | FALSE | FALSE | 1082 | 318 | 0.3248212 | 0.17415394 |
Sanders | 7 | FALSE | FALSE | 1040 | 190 | 0.1940756 | 0.36697430 |
Hillary | 7 | FALSE | FALSE | 892 | 496 | 0.5066394 | 0.06503282 |
meta = readRDS("meta.rds")
nyt = meta$medium == "The New York Times"
nyt1 = meta$id[nyt & meta$date < "2016-08-01"]
nyt2 = meta$id[nyt & meta$date >= "2016-08-01"]
dtm1 = with(subset(tokens, id %in% nyt1 & POS1=="G"),
dtm.create(id, lemma))
dtm2 = with(subset(tokens, id %in% nyt2 & POS1=="G"),
dtm.create(id, lemma))
cmp = corpora.compare(dtm1, dtm2)
cmp = plyr::arrange(cmp, -chi)
Head(cmp)
term | termfreq.x | termfreq.y | termfreq | relfreq.x | relfreq.y | over | chi |
---|---|---|---|---|---|---|---|
presumptive | 63 | 0 | 63 | 0.0023287 | 0.0000606 | 38.4190954 | 37.69604 |
russian | 33 | 66 | 99 | 0.0012371 | 0.0040611 | 0.3046290 | 36.30182 |
hispanic | 27 | 57 | 84 | 0.0010188 | 0.0035156 | 0.2897992 | 33.34656 |
brazilian | 0 | 17 | 17 | 0.0000364 | 0.0010910 | 0.0333499 | 28.46516 |
racist | 0 | 16 | 16 | 0.0000364 | 0.0010304 | 0.0353117 | 26.79011 |
incumbent | 0 | 15 | 15 | 0.0000364 | 0.0009698 | 0.0375186 | 25.11513 |
dtm.wordcloud(dtm.persons, freq.fun = sqrt)
corpustools::plotWords
h = rescale(log(cmp$over), c(1, .6666))
s = rescale(sqrt(cmp$chi), c(.25,1))
cmp$col = hsv(h, s, .33 + .67*s)
cmp = arrange(cmp, -chi)
with(head(cmp, 75), plotWords(x=log(over), words=term, wordfreq=chi, random.y = T, col=col, scale=2))
terms = term.statistics(dtm.persons)
wordfreqs = dtm.to.df(dtm.persons)
wordfreqs = merge(meta, wordfreqs, by.x="id", by.y="doc")
dates = aggregate(wordfreqs["date"], by=wordfreqs["term"], FUN=mean)
terms = merge(terms, dates)
terms = plyr::arrange(terms, -docfreq)
with(head(terms, 50), plotWords(words=term, x=date, wordfreq = termfreq))
axis(1)
dtm = with(subset(tokens, POS1 %in% c("G", "N", "V")),
dtm.create(id, lemma, minfreq = 10))
dtm = dtm[, !colnames(dtm) %in% tm::stopwords()]
set.seed(1234)
m = lda.fit(dtm, K = 10, num.iterations = 50, alpha=0.5)
Head(terms(m, 10))
Topic 1 | Topic 2 | Topic 3 | Topic 4 | Topic 5 | Topic 6 | Topic 7 | Topic 8 | Topic 9 | Topic 10 |
---|---|---|---|---|---|---|---|---|---|
make | say | year | debate | country | tax | show | say | voter | say |
city | people | first | media | will | say | art | campaign | win | |
work | get | show | political | people | year | include | presidential | vote | official |
can | woman | play | question | american | will | exhibition | republican | state | case |
take | think | write | use | police | company | work | convention | candidate | report |
come | can | family | time | gun | job | artist | know | republican | year |
Visualizing Topic Models: LDAvis
install.packages(c("LDAvis", "servr"))
library(LDAvis)
dtm = dtm[row_sums(dtm) > 0,]
json = ldavis_json(m, dtm)
LDAvis::serVis(json)
topics = c("city", "woman", "play", "debate", "country", "tax", "art", "campaign", "votes", "email")
cm = cor(t(m@beta))
colnames(cm) = rownames(cm) = topics
diag(cm) = 0
heatmap(cm, symm = T)
compare.topics <- function(m, cmp_topics) {
docs = factor(m@wordassignments$i, labels=m@documents)
terms = factor(m@wordassignments$j, labels=m@terms)
assignments = data.frame(doc=docs, term=terms, freq=m@wordassignments$v)
terms = dcast(assignments, term ~ freq, value.var = "doc", fun.aggregate = length)
terms = terms[, c(1, cmp_topics+1)]
terms$freq = rowSums(terms[-1])
terms = terms[terms$freq > 0,]
terms$prop = terms[[2]] / terms$freq
terms$col = hsv(rescale(terms$prop, c(1, .6666)), .5, .5)
terms[order(-terms$freq), ]
}
terms = compare.topics(m, match(c("tax", "email"), topics))
with(head(terms, 100), plotWords(x=prop, wordfreq = freq, words = term, col=col, xaxt="none", random.y = T, scale=2))
tpd = topics.per.document(m)
colnames(tpd)[-1] = topics
tpd = merge(meta, tpd)
Head(tpd)
id | date | medium | year | month | week | city | woman | play | debate | country | tax | art | campaign | votes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
162317736 | 2016-02-01 | The New York Times | 2016-01-01 | 2016-02-01 | 2016-02-01 | 0.2146172 | 0.0916473 | 0.0313225 | 0.1125290 | 0.0336427 | 0.0452436 | 0.0290023 | 0.0243619 | 0.0243619 | 0.3932715 |
162317809 | 2016-01-25 | The New York Times | 2016-01-01 | 2016-01-01 | 2016-01-25 | 0.1926752 | 0.0843949 | 0.0684713 | 0.1066879 | 0.0302548 | 0.1385350 | 0.0270701 | 0.0175159 | 0.0143312 | 0.3200637 |
171932192 | 2016-07-04 | The New York Times | 2016-01-01 | 2016-07-01 | 2016-07-04 | 0.1936416 | 0.0028902 | 0.0838150 | 0.1011561 | 0.0722543 | 0.2341040 | 0.0202312 | 0.0028902 | 0.0260116 | 0.2630058 |
171932219 | 2016-07-03 | The New York Times | 2016-01-01 | 2016-07-01 | 2016-06-27 | 0.0538793 | 0.5107759 | 0.0969828 | 0.1099138 | 0.1185345 | 0.0323276 | 0.0237069 | 0.0280172 | 0.0193966 | 0.0064655 |
171932226 | 2016-07-03 | The New York Times | 2016-01-01 | 2016-07-01 | 2016-06-27 | 0.0454545 | 0.1363636 | 0.0454545 | 0.2272727 | 0.2272727 | 0.0454545 | 0.0454545 | 0.0454545 | 0.0454545 | 0.1363636 |
171932227 | 2016-07-03 | The New York Times | 2016-01-01 | 2016-07-01 | 2016-06-27 | 0.1089385 | 0.2597765 | 0.2877095 | 0.0418994 | 0.0083799 | 0.0027933 | 0.0586592 | 0.1480447 | 0.0363128 | 0.0474860 |