Wouter van Atteveldt, Nel Ruigrok, Kasper Welbers
Session 2: Corpus and Network Analysis
Session 1
Session 2
# Build a document-term matrix from two tiny example documents
library(RTextTools)
m <- create_matrix(c("I love data", "John loves data!"))
# Show the sparse dtm as a dense matrix for inspection
as.matrix(m)
Terms
Docs data john love loves
1 1 0 1 0
2 1 1 0 1
# Per-term corpus statistics: frequency, document frequency, tf-idf, ...
library(corpustools)
term_stats <- term.statistics(m)
head(term_stats)
 | term | characters | number | nonalpha | termfreq | docfreq | reldocfreq | tfidf |
---|---|---|---|---|---|---|---|---|
data | data | 4 | FALSE | FALSE | 2 | 2 | 1.0 | 0.0000000 |
john | john | 4 | FALSE | FALSE | 1 | 1 | 0.5 | 0.3333333 |
love | love | 4 | FALSE | FALSE | 1 | 1 | 0.5 | 0.5000000 |
loves | loves | 5 | FALSE | FALSE | 1 | 1 | 0.5 | 0.3333333 |
RTextTools uses NLP and openNLP (which require rJava — a pain to install)
data(sotu)
head(sotu.tokens)
word | sentence | pos | lemma | offset | aid | id | pos1 | freq |
---|---|---|---|---|---|---|---|---|
It | 1 | PRP | it | 0 | 111541965 | 1 | O | 1 |
is | 1 | VBZ | be | 3 | 111541965 | 2 | V | 1 |
our | 1 | PRP$ | we | 6 | 111541965 | 3 | O | 1 |
unfinished | 1 | JJ | unfinished | 10 | 111541965 | 4 | A | 1 |
task | 1 | NN | task | 21 | 111541965 | 5 | N | 1 |
to | 1 | TO | to | 26 | 111541965 | 6 | ? | 1 |
# Retrieve tokens from an AmCAT server; assumes an open connection `conn`
# and an article set id `set` — TODO confirm these are defined upstream
tokens = amcat.gettokens(conn, project=1, articleset=set)
# Same request, but with server-side CoreNLP lemmatization
tokens = amcat.gettokens(conn, project=1, articleset=set, module="corenlp_lemmatize")
# Build a dtm from proper-name tokens only (pos1 == "M") and inspect it
name_tokens <- subset(sotu.tokens, pos1 == "M")
dtm <- with(name_tokens, dtm.create(aid, lemma))
dtm.wordcloud(dtm)
# Rank terms by raw corpus frequency, most frequent first
stats <- term.statistics(dtm)
stats <- arrange(stats, desc(termfreq))
head(stats)
term | characters | number | nonalpha | termfreq | docfreq | reldocfreq | tfidf |
---|---|---|---|---|---|---|---|
America | 7 | FALSE | FALSE | 409 | 346 | 0.3940774 | 0.6883991 |
Americans | 9 | FALSE | FALSE | 179 | 158 | 0.1799544 | 1.4280099 |
Congress | 8 | FALSE | FALSE | 168 | 149 | 0.1697039 | 1.1398894 |
Iraq | 4 | FALSE | FALSE | 109 | 65 | 0.0740319 | 1.4157528 |
States | 6 | FALSE | FALSE | 99 | 89 | 0.1013667 | 0.9573274 |
United | 6 | FALSE | FALSE | 88 | 82 | 0.0933941 | 0.7817946 |
Compare speakers, media, periods, …
# Compare Obama's vocabulary against all other speakers
library(corpustools)
data(sotu)
# Article ids of Obama's speeches
obama_ids <- sotu.meta$id[sotu.meta$headline == "Barack Obama"]
content_pos <- c("N", "A", "M")  # nouns, adjectives, proper names
# dtm for Obama's speeches
obama_tokens <- subset(sotu.tokens, aid %in% obama_ids & pos1 %in% content_pos)
dtm_o <- with(obama_tokens, dtm.create(aid, lemma))
dtm.wordcloud(dtm_o)
# dtm for everyone else's speeches
other_tokens <- subset(sotu.tokens, !(aid %in% obama_ids) & pos1 %in% content_pos)
dtm_b <- with(other_tokens, dtm.create(aid, lemma))
# Chi-squared comparison of the two corpora, strongest differences first
cmp <- corpora.compare(dtm_o, dtm_b)
cmp <- arrange(cmp, desc(chi))
head(cmp)
term | termfreq.x | termfreq.y | termfreq | relfreq.x | relfreq.y | over | chi |
---|---|---|---|---|---|---|---|
job | 200 | 56 | 256 | 0.0195351 | 0.0051090 | 3.3614321 | 92.34135 |
terrorist | 13 | 103 | 116 | 0.0012698 | 0.0093970 | 0.2183120 | 64.24944 |
freedom | 8 | 79 | 87 | 0.0007814 | 0.0072074 | 0.2170491 | 53.48220 |
Iraq | 15 | 94 | 109 | 0.0014651 | 0.0085759 | 0.2574317 | 52.32461 |
terror | 0 | 55 | 55 | 0.0000000 | 0.0050178 | 0.1661740 | 51.50577 |
business | 109 | 31 | 140 | 0.0106466 | 0.0028282 | 3.0423131 | 49.32315 |
# Plot the 100 most distinctive terms: x = log odds ratio, size = chi2.
# Fix: use TRUE, not the reassignable shorthand T.
with(utils::head(cmp, 100),
     plotWords(x = log(over), words = term, wordfreq = chi, random.y = TRUE))
rownames(dtm_o)[1:3 ]
[1] "111541965" "111541995" "111542001"
# Align the metadata rows to the dtm's document order
meta <- sotu.meta[match(rownames(dtm_o), sotu.meta$id), ]
# Derive the year of each speech from its date
meta$year <- format(meta$date, "%Y")
head(meta)
id | medium | headline | date | year |
---|---|---|---|---|
111541965 | Speeches | Barack Obama | 2013-02-12 | 2013 |
111541995 | Speeches | Barack Obama | 2013-02-12 | 2013 |
111542001 | Speeches | Barack Obama | 2013-02-12 | 2013 |
111542006 | Speeches | Barack Obama | 2013-02-12 | 2013 |
111542013 | Speeches | Barack Obama | 2013-02-12 | 2013 |
111542018 | Speeches | Barack Obama | 2013-02-12 | 2013 |
# Compare each year's vocabulary against all other years combined.
# Fix: use TRUE instead of the reassignable shorthand T.
d <- corpora.compare.list(dtm_o, as.character(meta$year),
                          return.df = TRUE, .progress = "none")
# Strongest year-specific terms first
d <- arrange(d, desc(chi))
head(d)
corpus | term | termfreq.x | termfreq.y | termfreq | relfreq.x | relfreq.y | over | chi |
---|---|---|---|---|---|---|---|---|
2009 | plan | 21 | 18 | 39 | 0.0145429 | 0.0020469 | 5.101313 | 51.03811 |
2014 | Cory | 9 | 0 | 9 | 0.0049724 | 0.0000000 | 5.972376 | 41.94405 |
2013 | reduction | 7 | 0 | 7 | 0.0039909 | 0.0000000 | 4.990878 | 33.88177 |
2012 | unit | 7 | 0 | 7 | 0.0039326 | 0.0000000 | 4.932584 | 33.28456 |
2009 | recovery | 11 | 7 | 18 | 0.0076177 | 0.0007960 | 4.798297 | 32.88778 |
2009 | lending | 6 | 1 | 7 | 0.0041551 | 0.0001137 | 4.628769 | 29.64959 |
# Fit a 5-topic LDA model on Obama's speeches; fixed seed for reproducibility
library(corpustools)
set.seed(123)
m <- lda.fit(dtm_o, K = 5, alpha = 0.1)
# Top 10 terms per topic, formatted as a table
kable(terms(m, 10))
Topic 1 | Topic 2 | Topic 3 | Topic 4 | Topic 5 |
---|---|---|---|---|
people | America | job | job | year |
country | world | school | new | tax |
time | people | education | energy | more |
future | security | college | year | family |
government | new | child | business | deficit |
american | american | student | America | health |
America | country | kid | more | care |
Americans | war | America | company | Americans |
day | year | more | american | cost |
nation | troops | high | worker | Congress |
# Launch the interactive LDAvis topic-model browser
library(LDAvis)
vis_json <- ldavis_json(m, dtm_o)
serVis(vis_json)
# Plot mean perplexity per number of topics K (lower is better).
# Fix: removed the stray trailing comma in aes(x=k, y=p, ), which
# produces an "argument is empty" error when evaluated.
p <- readRDS("perplex.rds")
p <- aggregate(p["p"], p["k"], mean)
library(ggplot2)
ggplot(p, aes(x = k, y = p)) + geom_line() + geom_point()
# Topic proportions per document, joined to the document metadata
tpd <- merge(sotu.meta, topics.per.document(m))
head(tpd)
id | medium | headline | date | X1 | X2 | X3 | X4 | X5 |
---|---|---|---|---|---|---|---|---|
111541965 | Speeches | Barack Obama | 2013-02-12 | 0.8133333 | 0.0133333 | 0.0133333 | 0.1466667 | 0.0133333 |
111541995 | Speeches | Barack Obama | 2013-02-12 | 0.0933333 | 0.0044444 | 0.4488889 | 0.4488889 | 0.0044444 |
111542001 | Speeches | Barack Obama | 2013-02-12 | 0.0400000 | 0.0036364 | 0.6581818 | 0.0400000 | 0.2581818 |
111542006 | Speeches | Barack Obama | 2013-02-12 | 0.4628571 | 0.0057143 | 0.5200000 | 0.0057143 | 0.0057143 |
111542013 | Speeches | Barack Obama | 2013-02-12 | 0.2897959 | 0.3306122 | 0.2897959 | 0.0857143 | 0.0040816 |
111542018 | Speeches | Barack Obama | 2013-02-12 | 0.0046512 | 0.0976744 | 0.4232558 | 0.4697674 | 0.0046512 |
Session 1
Session 2
igraph
E(g)$label
, etc.
semnet
github.com/kasperwelbers/semnet
library(semnet)
# Co-occurrence network: terms are nodes, document co-occurrence counts are edges
g = coOccurenceNetwork(dtm)
# Co-occurrence within a moving token window instead of whole documents;
# assumes `location`, `term`, `context` vectors are defined — TODO confirm
g = windowedCoOccurenceNetwork(location, term, context)
# Keep only the most significant edges (backbone extraction, max 100 nodes)
g_backbone = getBackboneNetwork(g, alpha=0.01, max.vertices=100)
# Export via igraph's generic writer (`filename`/`format` are placeholders)
write.graph(g, filename, format)
# ...or export as GEXF for use in Gephi
library(rgexf)
gefx = igraph.to.gexf(g)
print(gefx, file="..")
Hand-outs:
What you have learned:
Go out and code!