Wouter van Atteveldt
Glasgow Text Analysis, 2016-11-17
“The man who leaked cell-phone coverage of Saddam Hussein's execution was arrested”
# Dictionary-based sentiment: count positive and negative lexicon terms
# per document and derive a sentiment score from the counts.
lex = list(pos=c("good", "nice", "great"), neg=c("bad","stupid", "crooked"))
library(slam)
# Row sums over the dtm columns whose term is in each word list.
npos = row_sums(dtm[, colnames(dtm) %in% lex$pos])
nneg = row_sums(dtm[, colnames(dtm) %in% lex$neg])
sent = data.frame(id=rownames(dtm), npos=npos, nneg=nneg)
# subj: total number of lexicon hits in the document.
sent$subj = sent$npos + sent$nneg
# sent: (pos - neg) / hits; documents without any hit get 0 instead of 0/0.
sent$sent = ifelse(sent$subj == 0, 0,
                   (sent$npos - sent$nneg) / sent$subj)
# merge() without `by` joins on the column names both data frames share.
sent = merge(meta, sent)
head(sent)
id | date | medium | year | month | week | npos | nneg | subj | sent |
---|---|---|---|---|---|---|---|---|---|
162317736 | 2016-02-01 | The New York Times | 2016-01-01 | 2016-02-01 | 2016-02-01 | 0 | 0 | 0 | 0 |
162317809 | 2016-01-25 | The New York Times | 2016-01-01 | 2016-01-01 | 2016-01-25 | 0 | 0 | 0 | 0 |
171932192 | 2016-07-04 | The New York Times | 2016-01-01 | 2016-07-01 | 2016-07-04 | 0 | 0 | 0 | 0 |
171932219 | 2016-07-03 | The New York Times | 2016-01-01 | 2016-07-01 | 2016-06-27 | 1 | 0 | 1 | 1 |
171932226 | 2016-07-03 | The New York Times | 2016-01-01 | 2016-07-01 | 2016-06-27 | 0 | 0 | 0 | 0 |
171932227 | 2016-07-03 | The New York Times | 2016-01-01 | 2016-07-01 | 2016-06-27 | 0 | 0 | 0 | 0 |
# Sum the sentiment and subjectivity scores per week and medium,
# then draw one line per medium over time.
library(ggplot2)
a = aggregate(sent[c("sent", "subj")],
              by = sent[c("week", "medium")],
              FUN = sum)
ggplot(a, aes(x = week, y = sent, colour = medium)) + geom_line()
Apply the lexicon directly to the token list:
# Tag each token: 1 when its lemma is in the positive word list, 0 otherwise.
tokens$sent = 0
tokens$sent[tokens$lemma %in% lex$pos] = 1
# Show the first tokens that were tagged as positive.
head(tokens[tokens$sent > 0, ])
| | id | sentence | offset | word | lemma | POS | POS1 | ner | sent |
---|---|---|---|---|---|---|---|---|---|
230726 | 171932219 | 50 | 3857 | good | good | JJ | G | O | 1 |
316450 | 171932322 | 2 | 193 | good | good | JJ | G | O | 1 |
431087 | 171932429 | 22 | 3577 | Good | good | JJ | G | O | 1 |
547654 | 171932557 | 17 | 1848 | good | good | JJ | G | O | 1 |
547812 | 171932557 | 25 | 2589 | great | great | JJ | G | O | 1 |
560047 | 171932568 | 13 | 2011 | good | good | JJ | G | O | 1 |
Use quanteda::applyDictionary:
# Dictionary lookup with quanteda: convert the dtm to a dfm and apply the
# lexicon so that the features become the dictionary categories.
# NOTE(review): later quanteda releases appear to expose this as
# dfm_lookup() -- confirm against the installed version before changing it.
library(quanteda)
library(corpustools)
dfm = applyDictionary(dtm.to.dfm(dtm), lex)
head(dfm)
Document-feature matrix of: 999 documents, 2 features (0% sparse).
(showing first 6 documents and first 2 features)
features
docs pos neg
162317736 0 0
162317809 0 0
171932192 0 0
171932219 1 0
171932226 0 0
171932227 0 0
# Build a dictionary from the stored lexicon plus two candidate concepts,
# then count dictionary hits per sentence.
lex = readRDS("lexicon.rds")
dict = list(
  pos = lex$word1[lex$priorpolarity == "positive"],
  neg = lex$word1[lex$priorpolarity == "negative"],
  trump = "Trump",
  clinton = c("Hillary", "Clinton"))
# One "document" per sentence: the key is <article id>_<sentence number>.
tokens$doc = paste(tokens$id, tokens$sentence, sep="_")
dtm = dtm.create(tokens$doc, tokens$lemma, minfreq = 10)
# For every dictionary entry, sum the dtm columns whose term is in its
# word list; the result has one row per sentence, one column per entry.
x = sapply(dict, function(words)
  row_sums(dtm[, colnames(dtm) %in% words]))
head(x)
| | pos | neg | trump | clinton |
---|---|---|---|---|
162317736_1 | 2 | 2 | 0 | 0 |
162317736_2 | 2 | 4 | 0 | 0 |
162317736_3 | 0 | 1 | 0 | 0 |
162317736_4 | 1 | 2 | 0 | 0 |
162317736_5 | 3 | 1 | 0 | 0 |
162317736_6 | 0 | 4 | 0 | 0 |
# Sentiment near candidate mentions: label tokens with their dictionary
# concept, find concept pairs that co-occur within a word window, and
# average the polarity of the sentiment terms found near each candidate.

# Install semnet from GitHub only when it is not available yet, so that
# re-running the script does not reinstall the package every time.
if (!requireNamespace("semnet", quietly = TRUE)) {
  devtools::install_github("kasperwelbers/semnet")
}
library(semnet)
tokens$concept = NA
# Later dictionary entries overwrite earlier ones for lemmas in both lists.
for (concept_name in names(dict)) {
  tokens$concept[tokens$lemma %in% dict[[concept_name]]] = concept_name
}
hits = windowedCoOccurenceNetwork(location=tokens$offset,
                                  term=tokens$concept, context=tokens$id,
                                  window.size=40, output.per.context = TRUE)
# Keep only candidate -> sentiment pairs.
hits = subset(hits, x %in% c("clinton", "trump")
              & y %in% c("pos", "neg"))
hits$sent = ifelse(hits$y == "pos", 1, -1)
# Mean polarity per candidate; droplevels() removes the unused concept levels.
tapply(hits$sent, droplevels(hits$x), mean)
clinton trump
0.10921228 0.02649982