require(quanteda)
## Loading required package: quanteda
## quanteda version 0.99.22
## Using 7 of 8 threads for parallel computing
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
load("../data/guardianSample.RData")
You can also download the Guardian corpus and save it on your computer.
This corpus contains 6,000 Guardian news articles from 2012 to 2016.
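If you do not have the RData file locally, a comparable 6,000-article Guardian corpus is distributed through the quanteda.corpora package; a minimal sketch, assuming that package has been installed from GitHub:
# A sketch: fetch the Guardian corpus with quanteda.corpora (assumes the package
# is installed, e.g. devtools::install_github('quanteda/quanteda.corpora'))
require(quanteda.corpora)
guardianSample <- download('data_corpus_guardian')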
ndoc(guardianSample)
## [1] 6000
range(docvars(guardianSample, 'date'))
## [1] "2012-01-02" "2016-12-31"
Create new document variables and tokenize texts:
docvars(guardianSample, 'month') <- format(docvars(guardianSample, 'date'), '%Y-%m')
docvars(guardianSample, 'year') <- format(docvars(guardianSample, 'date'), '%Y')
toks <- tokens(guardianSample)
toks <- tokens_select(toks, stopwords('english'), select = 'remove', padding = TRUE)
toks <- tokens_select(toks, '^[\\p{P}\\p{S}]$', select = 'remove', valuetype = 'regex', padding = TRUE)
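Setting padding = TRUE keeps the position of every removed token as an empty string, so distances between the remaining tokens are preserved for the window-based analyses below; a quick check:
head(toks[[1]], 10)   # removed tokens appear as ""
sum(toks[[1]] == '')  # number of padded positions in the first document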
Through collocation analysis, we can identify multi-word expressions that are very frequent in newspaper articles. One of the most common types of multi-word expression is proper names, which can be identified simply by capitalization in English texts.
cap_toks <- tokens_select(toks, '^[A-Z]', valuetype = 'regex', case_insensitive = FALSE, padding = TRUE)
cap_col <- textstat_collocations(cap_toks, min_count = 5)
head(cap_col, 20)
## collocation count length lambda z
## 1 david cameron 860 2 8.312459 150.05618
## 2 donald trump 774 2 8.483164 124.99786
## 3 george osborne 362 2 8.803971 109.38982
## 4 hillary clinton 525 2 9.247096 104.30271
## 5 new york 1016 2 10.604024 101.76535
## 6 islamic state 330 2 9.958311 99.70884
## 7 white house 478 2 10.067493 97.81578
## 8 european union 348 2 8.394972 96.47133
## 9 jeremy corbyn 244 2 8.885666 92.41831
## 10 boris johnson 245 2 9.820288 86.13744
## 11 bernie sanders 394 2 10.057545 85.95219
## 12 guardian australia 237 2 6.481932 85.73424
## 13 northern ireland 204 2 10.025226 84.51172
## 14 home office 216 2 9.838011 80.06444
## 15 ed miliband 173 2 10.008367 79.61398
## 16 south africa 172 2 7.724806 79.21350
## 17 barack obama 343 2 9.915702 79.18010
## 18 ted cruz 417 2 10.912503 79.00897
## 19 black friday 190 2 8.615139 78.21714
## 20 south carolina 271 2 9.560901 78.07656
The result of collocation analysis is not only interesting but also useful: you can use it to compound tokens. Compounding makes tokens less ambiguous and significantly improves the quality of downstream statistical analysis. Here we compound only strongly associated multi-word expressions (z > 3) by subsetting cap_col$collocation.
comp_toks <- tokens_compound(toks, phrase(cap_col$collocation[cap_col$z > 3]))
toks[['text7005']][370:450] # before compounding
## [1] "" "" ""
## [4] "need" "" ""
## [7] "incriminate" "" ""
## [10] "anyone" "else" ""
## [13] "Scotland" "Yard's" "investigation"
## [16] "" "" ""
## [19] "rejected" "conspiracy" "theories"
## [22] "" "" "old"
## [25] "boss" "" "Rupert"
## [28] "Murdoch" "" ""
## [31] "deals" "" "people"
## [34] "like" "" "last"
## [37] "boss" "" "David"
## [40] "Cameron" "" ""
## [43] "" "" "George"
## [46] "Osborne" "" ""
## [49] "keen" "" "recruit"
## [52] "" "" "well-connected"
## [55] "Murdoch" "apparatchik" ""
## [58] "despite" "" "resignation"
## [61] "" "" "News"
## [64] "" "" "World"
## [67] "" "" "royal"
## [70] "hacking" "case" ""
## [73] "Coulson" "straight-batted" ""
## [76] "" "Jay's" "innuendo"
## [79] "" "" ""
comp_toks[['text7005']][370:450] # after compounding
## [1] "incriminate" "" ""
## [4] "anyone" "else" ""
## [7] "Scotland_Yard's" "investigation" ""
## [10] "" "" "rejected"
## [13] "conspiracy" "theories" ""
## [16] "" "old" "boss"
## [19] "" "Rupert_Murdoch" ""
## [22] "" "deals" ""
## [25] "people" "like" ""
## [28] "last" "boss" ""
## [31] "David_Cameron" "" ""
## [34] "" "" "George_Osborne"
## [37] "" "" "keen"
## [40] "" "recruit" ""
## [43] "" "well-connected" "Murdoch"
## [46] "apparatchik" "" "despite"
## [49] "" "resignation" ""
## [52] "" "News" ""
## [55] "" "World" ""
## [58] "" "royal" "hacking"
## [61] "case" "" "Coulson"
## [64] "straight-batted" "" ""
## [67] "Jay's" "innuendo" ""
## [70] "" "" ""
## [73] "stitch-up" "" "win"
## [76] "" "Sun's" "support"
## [79] "" "" "later"
Keyness is a statistical measure, originally implemented in WordSmith, for identifying words that appear distinctively often in target documents. The statistic is essentially a signed chi-squared: words that are more frequent than expected in the target documents are given a positive sign.
news_dfm <- dfm(comp_toks)
key <- textstat_keyness(news_dfm, docvars(news_dfm, 'year') == 2016)
head(key, 20)
## chi2 p n_target n_reference
## trump 2478.5407 0 2140 287
## 2016 1976.5118 0 1762 261
## clinton 1293.7808 0 1128 156
## eu 1229.1453 0 2206 983
## brexit 1141.6002 0 767 18
## sanders 1063.5417 0 820 69
## gmt 861.5140 0 2105 1193
## cruz 810.0924 0 568 24
## donald_trump 793.1306 0 680 89
## block-time 680.7539 0 3240 2591
## campaign 635.7943 0 1808 1127
## february 572.3454 0 901 353
## published-time 513.8171 0 2446 1956
## hillary_clinton 509.7642 0 454 67
## related 504.1816 0 1996 1476
## ted_cruz 474.9513 0 337 16
## bernie_sanders 467.0847 0 334 17
## republican 427.4190 0 792 363
## trump's 421.7344 0 416 80
## delegates 318.3553 0 307 56
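For intuition, here is a minimal sketch of the signed chi-squared for a single word, using the 'trump' counts from the table above and hypothetical total token counts per group (textstat_keyness() computes this for every feature at once; R's chisq.test() applies a continuity correction to 2 x 2 tables by default):
word_target <- 2140; word_ref <- 287     # counts of 'trump' from the table above
total_target <- 1.5e6; total_ref <- 4e6  # hypothetical total tokens per group
tbl <- matrix(c(word_target, total_target - word_target,
                word_ref, total_ref - word_ref), nrow = 2)
chi2 <- unname(chisq.test(tbl)$statistic)
sign(word_target / total_target - word_ref / total_ref) * chi2  # positive: over-represented in target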
textplot_keyness(key)
We can also find words associated with target words using the window argument of tokens_select().
brexit_toks <- tokens_keep(comp_toks, 'brexit', window = 10) # equivalent to tokens_select(selection = 'keep')
not_brexit_toks <- tokens_remove(comp_toks, 'brexit', window = 10) # equivalent to tokens_select(selection = 'remove')
print(brexit_toks[['text173244']])
## [1] "said" "" "Airports_Commission"
## [4] "" "disagreed" ""
## [7] "Khan" "" "adding"
## [10] "" "Brexit" "loaded"
## [13] "" "dice" ""
## [16] "" "favour" ""
## [19] "" "west_London" "location"
## [22] "easily" "" "quickly"
## [25] "" "" ""
## [28] "option" "" ""
## [31] "" "Brexit" "makes"
## [34] "" "commission's" "conclusion"
## [37] "" "" ""
## [40] "Heathrow" "expansion" ""
brexit_dfm <- dfm(brexit_toks)
not_brexit_dfm <- dfm(not_brexit_toks)
key_brexit <- textstat_keyness(rbind(brexit_dfm, not_brexit_dfm), seq_len(ndoc(brexit_dfm)))
key_brexit <- key_brexit[key_brexit$n_target > 10,]
head(key_brexit, 20)
## chi2 p n_target n_reference
## brexit 257051.30090 0 785 0
## vote 1971.29282 0 126 2237
## uncertainty 731.85707 0 29 311
## sterling 604.19202 0 17 124
## pound 514.16366 0 22 252
## eu 388.90310 0 71 3118
## referendum 367.28871 0 38 1018
## gove 309.85435 0 11 99
## campaigners 304.95293 0 19 312
## leave 299.60603 0 42 1460
## related 281.73956 0 65 3407
## boris_johnson 207.75130 0 13 209
## fears 192.74866 0 18 428
## hard 158.07415 0 26 1015
## uk 153.27897 0 61 4638
## impact 151.90838 0 25 975
## risks 141.84426 0 16 446
## economic 120.77618 0 30 1611
## campaign 101.69741 0 39 2896
## economy 96.81687 0 29 1829
textplot_keyness(key_brexit[-1,])
Targeted frequency analysis might look complex, but it can be done in three lines.
trump_dfm <- dfm(tokens_keep(comp_toks, c('donald_trump', 'trump'), window = 10))
not_trump_dfm <- dfm(tokens_remove(comp_toks, c('donald_trump', 'trump'), window = 10))
key_trump <- textstat_keyness(rbind(trump_dfm, not_trump_dfm), seq_len(ndoc(trump_dfm)))
head(key_trump[key_trump$n_target > 10,], 20)
## chi2 p n_target n_reference
## trump 200401.8183 0 2427 0
## donald_trump 63477.6946 0 769 0
## donald_j 4292.2553 0 54 1
## cruz 3606.9352 0 166 426
## ted_cruz 3522.7601 0 126 227
## republican 3485.0460 0 232 923
## @realdonaldtrump 2665.1277 0 52 29
## nominee 1255.2483 0 60 158
## frontrunner 1090.6559 0 48 113
## presidential 736.1504 0 98 781
## campaign 709.2115 0 192 2743
## rally 675.2740 0 70 441
## hillary_clinton 639.3000 0 69 452
## rubio 608.6393 0 62 384
## delegates 585.9146 0 55 308
## clinton 540.9624 0 106 1178
## 1,237 519.6446 0 19 33
## candidate 485.5896 0 87 898
## voters 467.4642 0 97 1129
## staten_island 452.7590 0 16 26
textplot_keyness(key_trump[c(-1, -2, -3),])
Dictionary-based analysis is one of the most popular quantitative text analysis methods. You can define the keys and values of a dictionary using dictionary() (it also imports the Wordstat, LIWC, Yoshikoder, Lexicoder and YAML formats through the file argument).
dict <- dictionary(list(positive = c('good', 'nice', 'excellent'),
negative = c('bad', 'nasty', 'poor')))
print(dict)
## Dictionary object with 2 key entries.
## - [positive]:
## - good, nice, excellent
## - [negative]:
## - bad, nasty, poor
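To import one of the external formats mentioned above, pass a path to the file argument instead; a minimal sketch (the file path here is hypothetical):
# A sketch: import a LIWC-format dictionary from disk (hypothetical path)
liwc_dict <- dictionary(file = 'dictionaries/LIWC2007.dic', format = 'LIWC')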
You can use tokens_lookup() or dfm_lookup() to count dictionary values. quanteda contains the Lexicoder Sentiment Dictionary created by Young and Soroka, so you can perform sentiment analysis of English texts right away.
lengths(data_dictionary_LSD2015)
## negative positive neg_positive neg_negative
## 2858 1709 1721 2860
lsd_toks <- tokens_lookup(toks, data_dictionary_LSD2015)
head(lsd_toks, 2)
## tokens from 2 documents.
## text136751 :
## [1] "positive" "positive"
##
## text118588 :
## [1] "positive" "positive" "negative" "negative" "negative" "positive"
## [7] "negative" "positive" "positive" "negative" "positive" "positive"
## [13] "positive" "positive" "positive" "negative" "negative" "negative"
## [19] "positive" "negative" "positive" "negative" "positive" "positive"
## [25] "positive" "positive" "negative" "positive" "positive" "positive"
## [31] "positive" "positive" "positive" "positive" "positive" "positive"
dfm_lsd <- dfm(lsd_toks)
head(dfm_lsd, 2)
## Document-feature matrix of: 2 documents, 2 features (25% sparse).
## 2 x 2 sparse Matrix of class "dfm"
## features
## docs negative positive
## text136751 0 2
## text118588 11 25
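dfm_lookup() produces equivalent counts when you already have a dfm, although unlike tokens_lookup() it can only match single-token patterns; a minimal sketch:
# A sketch: apply the first two keys of the dictionary directly to a dfm
dfm_lookup(dfm(toks), data_dictionary_LSD2015[1:2])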
We can aggregate frequency counts using the groups argument of dfm().
lsd_year_dfm <- dfm(lsd_toks, groups = docvars(lsd_toks, 'year'))
head(lsd_year_dfm)
## Document-feature matrix of: 5 documents, 2 features (0% sparse).
## 5 x 2 sparse Matrix of class "dfm"
## features
## docs negative positive
## 2016 65774 62265
## 2015 56097 55392
## 2014 21393 20480
## 2013 13646 12093
## 2012 12354 11672
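Because the number of articles differs across years, you may want within-year proportions rather than raw counts; a minimal sketch:
# A sketch: convert the yearly counts to within-year proportions
lsd_year_prop <- as.matrix(lsd_year_dfm) / rowSums(lsd_year_dfm)
round(lsd_year_prop, 3)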
You can use tokens_select() with the window argument to perform more targeted sentiment analysis.
lsd_brexit_dfm <- dfm(brexit_toks, dictionary = data_dictionary_LSD2015[1:2],
groups = docvars(brexit_toks, 'month'))
lsd_brexit_dfm <- tail(lsd_brexit_dfm[order(docnames(lsd_brexit_dfm))], 24) # last 24 months only
matplot(lsd_brexit_dfm[,c(1:2)], type = 'l', xaxt = 'n', lty = 1,
main = 'Sentiment on Brexit', ylab = 'Frequency')
grid()
axis(1, 1:24, docnames(lsd_brexit_dfm))
legend('topleft', col = 1:2, legend = c('Negative', 'Positive'), lty = 1)
plot(lsd_brexit_dfm[,2] - lsd_brexit_dfm[,1], type = 'b', main = 'Sentiment on Brexit (difference)',
ylab = 'Frequency', xlab = '', xaxt = 'n')
axis(1, 1:24, docnames(lsd_brexit_dfm))
grid()
abline(h = 0, lty = 2)
lsd_trump_dfm <- dfm(tokens_keep(comp_toks, c('donald_trump', 'trump'), window = 10),
dictionary = data_dictionary_LSD2015[1:2], groups = docvars(comp_toks, 'month'))
lsd_trump_dfm <- tail(lsd_trump_dfm[order(docnames(lsd_trump_dfm))], 24) # last 24 months only
matplot(lsd_trump_dfm, type = 'l', xaxt = 'n', lty = 1,
main = 'Sentiment on Trump', ylab = 'Frequency')
grid()
axis(1, 1:24, docnames(lsd_trump_dfm))
legend('topleft', col = 1:2, legend = c('Negative', 'Positive'), lty = 1)
plot(lsd_trump_dfm[,2] - lsd_trump_dfm[,1], type = 'b', main = 'Sentiment on Trump (difference)',
ylab = 'Frequency', xlab = '', xaxt = 'n')
axis(1, 1:24, docnames(lsd_trump_dfm))
grid()
abline(h = 0, lty = 2)