require(quanteda)
## Loading required package: quanteda
## quanteda version 0.99.22
## Using 7 of 8 threads for parallel computing
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
load("../data/guardianSample.RData")
You can also download the Guardian corpus and save it on your computer.
This corpus contains 6,000 Guardian news articles from 2012 to 2016.
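If you do not have the RData file locally, a comparable 6,000-article Guardian corpus is distributed through the quanteda.corpora package; a minimal sketch, assuming that package has been installed from GitHub:
# A sketch: fetch the Guardian corpus with quanteda.corpora (assumes the package
# is installed, e.g. devtools::install_github('quanteda/quanteda.corpora'))
require(quanteda.corpora)
guardianSample <- download('data_corpus_guardian')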
ndoc(guardianSample)
## [1] 6000
range(docvars(guardianSample, 'date'))
## [1] "2012-01-02" "2016-12-31"
Create new document variables and tokenize texts:
docvars(guardianSample, 'month') <- format(docvars(guardianSample, 'date'), '%Y-%m')
docvars(guardianSample, 'year') <- format(docvars(guardianSample, 'date'), '%Y')
toks <- tokens(guardianSample)
toks <- tokens_select(toks, stopwords('english'), select = 'remove', padding = TRUE)
toks <- tokens_select(toks, '^[\\p{P}\\p{S}]$', select = 'remove', valuetype = 'regex', padding = TRUE)
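Setting padding = TRUE keeps the position of every removed token as an empty string, so distances between the remaining tokens are preserved for the window-based analyses below; a quick check:
head(toks[[1]], 10)   # removed tokens appear as ""
sum(toks[[1]] == '')  # number of padded positions in the first document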
Through collocation analysis, we can identify multi-word expressions that are very frequent in newspaper articles. One of the most common types of multi-word expression is proper names, which can be identified simply by capitalization in English texts.
cap_toks <- tokens_select(toks, '^[A-Z]', valuetype = 'regex', case_insensitive = FALSE, padding = TRUE)
cap_col <- textstat_collocations(cap_toks, min_count = 5)
head(cap_col, 20)
## collocation count length lambda z
## 1 david cameron 860 2 8.312459 150.05618
## 2 donald trump 774 2 8.483164 124.99786
## 3 george osborne 362 2 8.803971 109.38982
## 4 hillary clinton 525 2 9.247096 104.30271
## 5 new york 1016 2 10.604024 101.76535
## 6 islamic state 330 2 9.958311 99.70884
## 7 white house 478 2 10.067493 97.81578
## 8 european union 348 2 8.394972 96.47133
## 9 jeremy corbyn 244 2 8.885666 92.41831
## 10 boris johnson 245 2 9.820288 86.13744
## 11 bernie sanders 394 2 10.057545 85.95219
## 12 guardian australia 237 2 6.481932 85.73424
## 13 northern ireland 204 2 10.025226 84.51172
## 14 home office 216 2 9.838011 80.06444
## 15 ed miliband 173 2 10.008367 79.61398
## 16 south africa 172 2 7.724806 79.21350
## 17 barack obama 343 2 9.915702 79.18010
## 18 ted cruz 417 2 10.912503 79.00897
## 19 black friday 190 2 8.615139 78.21714
## 20 south carolina 271 2 9.560901 78.07656
The result of collocation analysis is not only interesting but also useful: you can use it to compound tokens. Compounding makes tokens less ambiguous and significantly improves the quality of downstream statistical analysis. Here we compound only strongly associated multi-word expressions (z > 3) by subsetting cap_col$collocation.
comp_toks <- tokens_compound(toks, phrase(cap_col$collocation[cap_col$z > 3]))
toks[['text7005']][370:450] # before compounding
## [1] "" "" ""
## [4] "need" "" ""
## [7] "incriminate" "" ""
## [10] "anyone" "else" ""
## [13] "Scotland" "Yard's" "investigation"
## [16] "" "" ""
## [19] "rejected" "conspiracy" "theories"
## [22] "" "" "old"
## [25] "boss" "" "Rupert"
## [28] "Murdoch" "" ""
## [31] "deals" "" "people"
## [34] "like" "" "last"
## [37] "boss" "" "David"
## [40] "Cameron" "" ""
## [43] "" "" "George"
## [46] "Osborne" "" ""
## [49] "keen" "" "recruit"
## [52] "" "" "well-connected"
## [55] "Murdoch" "apparatchik" ""
## [58] "despite" "" "resignation"
## [61] "" "" "News"
## [64] "" "" "World"
## [67] "" "" "royal"
## [70] "hacking" "case" ""
## [73] "Coulson" "straight-batted" ""
## [76] "" "Jay's" "innuendo"
## [79] "" "" ""
comp_toks[['text7005']][370:450] # after compounding
## [1] "incriminate" "" ""
## [4] "anyone" "else" ""
## [7] "Scotland_Yard's" "investigation" ""
## [10] "" "" "rejected"
## [13] "conspiracy" "theories" ""
## [16] "" "old" "boss"
## [19] "" "Rupert_Murdoch" ""
## [22] "" "deals" ""
## [25] "people" "like" ""
## [28] "last" "boss" ""
## [31] "David_Cameron" "" ""
## [34] "" "" "George_Osborne"
## [37] "" "" "keen"
## [40] "" "recruit" ""
## [43] "" "well-connected" "Murdoch"
## [46] "apparatchik" "" "despite"
## [49] "" "resignation" ""
## [52] "" "News" ""
## [55] "" "World" ""
## [58] "" "royal" "hacking"
## [61] "case" "" "Coulson"
## [64] "straight-batted" "" ""
## [67] "Jay's" "innuendo" ""
## [70] "" "" ""
## [73] "stitch-up" "" "win"
## [76] "" "Sun's" "support"
## [79] "" "" "later"
Keyness is a statistical measure, originally implemented in WordSmith, for identifying words that appear distinctively often in target documents. The statistic is essentially a signed chi-squared: words that are more frequent than expected in the target documents are given a positive sign.
news_dfm <- dfm(comp_toks)
key <- textstat_keyness(news_dfm, docvars(news_dfm, 'year') == 2016)
head(key, 20)
## chi2 p n_target n_reference
## trump 2478.5407 0 2140 287
## 2016 1976.5118 0 1762 261
## clinton 1293.7808 0 1128 156
## eu 1229.1453 0 2206 983
## brexit 1141.6002 0 767 18
## sanders 1063.5417 0 820 69
## gmt 861.5140 0 2105 1193
## cruz 810.0924 0 568 24
## donald_trump 793.1306 0 680 89
## block-time 680.7539 0 3240 2591
## campaign 635.7943 0 1808 1127
## february 572.3454 0 901 353
## published-time 513.8171 0 2446 1956
## hillary_clinton 509.7642 0 454 67
## related 504.1816 0 1996 1476
## ted_cruz 474.9513 0 337 16
## bernie_sanders 467.0847 0 334 17
## republican 427.4190 0 792 363
## trump's 421.7344 0 416 80
## delegates 318.3553 0 307 56
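For intuition, here is a minimal sketch of the signed chi-squared for a single word, using the 'trump' counts from the table above and hypothetical total token counts per group (textstat_keyness() computes this for every feature at once; R's chisq.test() applies a continuity correction to 2 x 2 tables by default):
word_target <- 2140; word_ref <- 287     # counts of 'trump' from the table above
total_target <- 1.5e6; total_ref <- 4e6  # hypothetical total tokens per group
tbl <- matrix(c(word_target, total_target - word_target,
                word_ref, total_ref - word_ref), nrow = 2)
chi2 <- unname(chisq.test(tbl)$statistic)
sign(word_target / total_target - word_ref / total_ref) * chi2  # positive: over-represented in target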
textplot_keyness(key)
We can also find words associated with target words using the window argument of tokens_select().
brexit_toks <- tokens_keep(comp_toks, 'brexit', window = 10) # equivalent to tokens_select(selection = 'keep')
not_brexit_toks <- tokens_remove(comp_toks, 'brexit', window = 10) # equivalent to tokens_select(selection = 'remove')
print(brexit_toks[['text173244']])
## [1] "said" "" "Airports_Commission"
## [4] "" "disagreed" ""
## [7] "Khan" "" "adding"
## [10] "" "Brexit" "loaded"
## [13] "" "dice" ""
## [16] "" "favour" ""
## [19] "" "west_London" "location"
## [22] "easily" "" "quickly"
## [25] "" "" ""
## [28] "option" "" ""
## [31] "" "Brexit" "makes"
## [34] "" "commission's" "conclusion"
## [37] "" "" ""
## [40] "Heathrow" "expansion" ""
brexit_dfm <- dfm(brexit_toks)
not_brexit_dfm <- dfm(not_brexit_toks)
key_brexit <- textstat_keyness(rbind(brexit_dfm, not_brexit_dfm), seq_len(ndoc(brexit_dfm)))
key_brexit <- key_brexit[key_brexit$n_target > 10,]
head(key_brexit, 20)
## chi2 p n_target n_reference
## brexit 257051.30090 0 785 0
## vote 1971.29282 0 126 2237
## uncertainty 731.85707 0 29 311
## sterling 604.19202 0 17 124
## pound 514.16366 0 22 252
## eu 388.90310 0 71 3118
## referendum 367.28871 0 38 1018
## gove 309.85435 0 11 99
## campaigners 304.95293 0 19 312
## leave 299.60603 0 42 1460
## related 281.73956 0 65 3407
## boris_johnson 207.75130 0 13 209
## fears 192.74866 0 18 428
## hard 158.07415 0 26 1015
## uk 153.27897 0 61 4638
## impact 151.90838 0 25 975
## risks 141.84426 0 16 446
## economic 120.77618 0 30 1611
## campaign 101.69741 0 39 2896
## economy 96.81687 0 29 1829
textplot_keyness(key_brexit[-1,])
Targeted frequency analysis might look complex, but it can be done in three lines.
trump_dfm <- dfm(tokens_keep(comp_toks, c('donald_trump', 'trump'), window = 10))
not_trump_dfm <- dfm(tokens_remove(comp_toks, c('donald_trump', 'trump'), window = 10))
key_trump <- textstat_keyness(rbind(trump_dfm, not_trump_dfm), seq_len(ndoc(trump_dfm)))
head(key_trump[key_trump$n_target > 10,], 20)
## chi2 p n_target n_reference
## trump 200401.8183 0 2427 0
## donald_trump 63477.6946 0 769 0
## donald_j 4292.2553 0 54 1
## cruz 3606.9352 0 166 426
## ted_cruz 3522.7601 0 126 227
## republican 3485.0460 0 232 923
## @realdonaldtrump 2665.1277 0 52 29
## nominee 1255.2483 0 60 158
## frontrunner 1090.6559 0 48 113
## presidential 736.1504 0 98 781
## campaign 709.2115 0 192 2743
## rally 675.2740 0 70 441
## hillary_clinton 639.3000 0 69 452
## rubio 608.6393 0 62 384
## delegates 585.9146 0 55 308
## clinton 540.9624 0 106 1178
## 1,237 519.6446 0 19 33
## candidate 485.5896 0 87 898
## voters 467.4642 0 97 1129
## staten_island 452.7590 0 16 26
textplot_keyness(key_trump[c(-1, -2, -3),])
Dictionary-based analysis is one of the most popular quantitative text analysis methods. You can define the keys and values of a dictionary using dictionary() (it also imports the Wordstat, LIWC, Yoshikoder, Lexicoder and YAML formats through the file argument).
dict <- dictionary(list(positive = c('good', 'nice', 'excellent'),
negative = c('bad', 'nasty', 'poor')))
print(dict)
## Dictionary object with 2 key entries.
## - [positive]:
## - good, nice, excellent
## - [negative]:
## - bad, nasty, poor
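To import one of the external formats mentioned above, pass a path to the file argument instead; a minimal sketch (the file path here is hypothetical):
# A sketch: import a LIWC-format dictionary from disk (hypothetical path)
liwc_dict <- dictionary(file = 'dictionaries/LIWC2007.dic', format = 'LIWC')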
You can use tokens_lookup() or dfm_lookup() to count dictionary values. quanteda contains the Lexicoder Sentiment Dictionary created by Young and Soroka, so you can perform sentiment analysis of English texts right away.
lengths(data_dictionary_LSD2015)
## negative positive neg_positive neg_negative
## 2858 1709 1721 2860
lsd_toks <- tokens_lookup(toks, data_dictionary_LSD2015)
head(lsd_toks, 2)
## tokens from 2 documents.
## text136751 :
## [1] "positive" "positive"
##
## text118588 :
## [1] "positive" "positive" "negative" "negative" "negative" "positive"
## [7] "negative" "positive" "positive" "negative" "positive" "positive"
## [13] "positive" "positive" "positive" "negative" "negative" "negative"
## [19] "positive" "negative" "positive" "negative" "positive" "positive"
## [25] "positive" "positive" "negative" "positive" "positive" "positive"
## [31] "positive" "positive" "positive" "positive" "positive" "positive"
dfm_lsd <- dfm(lsd_toks)
head(dfm_lsd, 2)
## Document-feature matrix of: 2 documents, 2 features (25% sparse).
## 2 x 2 sparse Matrix of class "dfm"
## features
## docs negative positive
## text136751 0 2
## text118588 11 25
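dfm_lookup() produces equivalent counts when you already have a dfm, although unlike tokens_lookup() it can only match single-token patterns; a minimal sketch:
# A sketch: apply the first two keys of the dictionary directly to a dfm
dfm_lookup(dfm(toks), data_dictionary_LSD2015[1:2])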
We can aggregate frequency counts using the groups argument of dfm().
lsd_year_dfm <- dfm(lsd_toks, groups = docvars(lsd_toks, 'year'))
head(lsd_year_dfm)
## Document-feature matrix of: 5 documents, 2 features (0% sparse).
## 5 x 2 sparse Matrix of class "dfm"
## features
## docs negative positive
## 2016 65774 62265
## 2015 56097 55392
## 2014 21393 20480
## 2013 13646 12093
## 2012 12354 11672
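Because the number of articles differs across years, you may want within-year proportions rather than raw counts; a minimal sketch:
# A sketch: convert the yearly counts to within-year proportions
lsd_year_prop <- as.matrix(lsd_year_dfm) / rowSums(lsd_year_dfm)
round(lsd_year_prop, 3)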
You can use tokens_select() with the window argument to perform more targeted sentiment analysis.
lsd_brexit_dfm <- dfm(brexit_toks, dictionary = data_dictionary_LSD2015[1:2],
groups = docvars(brexit_toks, 'month'))
lsd_brexit_dfm <- tail(lsd_brexit_dfm[order(docnames(lsd_brexit_dfm))], 24) # last 24 months only
matplot(lsd_brexit_dfm[,c(1:2)], type = 'l', xaxt = 'n', lty = 1,
main = 'Sentiment on Brexit', ylab = 'Frequency')
grid()
axis(1, 1:24, docnames(lsd_brexit_dfm))
legend('topleft', col = 1:2, legend = c('Negative', 'Positive'), lty = 1)
plot(lsd_brexit_dfm[,2] - lsd_brexit_dfm[,1], type = 'b', main = 'Sentiment on Brexit (difference)',
ylab = 'Frequency', xlab = '', xaxt = 'n')
axis(1, 1:24, docnames(lsd_brexit_dfm))
grid()
abline(h = 0, lty = 2)
lsd_trump_dfm <- dfm(tokens_keep(comp_toks, c('donald_trump', 'trump'), window = 10),
dictionary = data_dictionary_LSD2015[1:2], groups = docvars(comp_toks, 'month'))
lsd_trump_dfm <- tail(lsd_trump_dfm[order(docnames(lsd_trump_dfm))], 24) # last 24 months only
matplot(lsd_trump_dfm, type = 'l', xaxt = 'n', lty = 1,
main = 'Sentiment on Trump', ylab = 'Frequency')
grid()
axis(1, 1:24, docnames(lsd_trump_dfm))
legend('topleft', col = 1:2, legend = c('Negative', 'Positive'), lty = 1)
plot(lsd_trump_dfm[,2] - lsd_trump_dfm[,1], type = 'b', main = 'Sentiment on Trump (difference)',
ylab = 'Frequency', xlab = '', xaxt = 'n')
axis(1, 1:24, docnames(lsd_trump_dfm))
grid()
abline(h = 0, lty = 2)