First, let’s load the data.
library(tidyverse)
file <- "../data/pres_tweets.csv"
#file <- "~/Dropbox (UNC Charlotte)/summer-2017-social-media-workshop/data/pres_tweets.csv"
tweets <- read_csv(file)
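A quick look at the fields we will use below (displayName, estTime, and body); glimpse() comes with dplyr, which tidyverse loads:
# Peek at just the columns used in this section
glimpse(select(tweets, displayName, estTime, body))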
This dataset includes about 19,000 tweets by the four major 2016 presidential candidates. It excludes retweets (i.e., it keeps only records where verb == “posts”).
table(tweets$displayName)
## 
##  Bernie Sanders Donald J. Trump Hillary Clinton        Ted Cruz 
##            4844            6402            4236            3621
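The same per-candidate counts in tidyverse style, if you prefer (a sketch):
tweets %>% count(displayName)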
Let’s use quanteda to analyze the tweet text (body).
library(quanteda)
corpus <- corpus(tweets$body,
                 docvars = data.frame(candidate = tweets$displayName,
                                      postedTime = tweets$estTime))
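To sanity-check the corpus, quanteda’s summary() reports token counts alongside the attached docvars; for the first few documents (a sketch):
# Token/type counts plus docvars for the first five tweets
summary(corpus, n = 5)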
This code counts, for each tweet, how many of its words fall into each Moral Foundations category.
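Before running the real dictionary, here is a minimal sketch of how quanteda’s dictionary lookup behaves. The category names and word patterns below are made up for illustration; they are not from the Moral Foundations Dictionary:
# Toy dictionary: glob patterns like "protect*" match word prefixes
toyDict <- dictionary(list(virtue = c("care", "protect*"),
                           vice   = c("harm", "hurt*")))
# Each matching word adds one count to its category
dfm(corpus(c("we must protect and care for all",
             "they did real harm")),
    dictionary = toyDict)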
# Moral Foundations Dictionary (LIWC format)
mfFile <- "../data/dictionaries/moral-foundations-dictionary.dic"
#mfFile <- "~/Dropbox (UNC Charlotte)/summer-2017-social-media-workshop/data/dictionaries/moral-foundations-dictionary.dic"
mfdict <- dictionary(file = mfFile, format = "LIWC")
mfDfm <- dfm(corpus,
             dictionary = mfdict)
as.data.frame(mfDfm[1:10, ])  # counts for the first ten tweets
##        01 HarmVirtue 02 HarmVice 03 FairnessVirtue 04 FairnessVice 05 IngroupVirtue 06 IngroupVice
## text1              0           0                 0               0                0              0
## text2              0           0                 0               0                1              0
## text3              0           0                 0               0                0              0
## text4              0           0                 2               0                0              0
## text5              0           0                 0               0                0              0
## text6              1           1                 0               0                0              0
## text7              0           0                 0               0                1              0
## text8              0           0                 0               0                0              0
## text9              0           0                 0               0                0              0
## text10             0           0                 0               0                0              0
##        07 AuthorityVirtue 08 AuthorityVice 09 PurityVirtue 10 PurityVice 11 MoralityGeneral
## text1                   0                0               0             0                  0
## text2                   0                0               0             0                  0
## text3                   0                0               0             0                  0
## text4                   0                0               0             0                  0
## text5                   0                0               0             0                  0
## text6                   0                0               0             0                  0
## text7                   0                0               0             0                  0
## text8                   0                0               0             0                  0
## text9                   0                0               0             0                  0
## text10                  1                1               0             0                  0
What’s a problem with this?
Sparsity! Most tweets match only one or two dictionary features, if any.
This is typically a big problem for dictionary-based methods, especially with short texts like tweets.
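To quantify the sparsity, a quick sketch (rowSums() works here because a dfm is a sparse matrix underneath):
# Proportion of tweets that match no dictionary entry at all
mean(rowSums(mfDfm) == 0)
One fix is to aggregate: group the tweets by candidate before counting.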
mfDfm <- dfm(corpus,
             groups = "candidate",
             dictionary = mfdict)
# transpose
as.data.frame(t(mfDfm))
##                    Bernie Sanders Donald J. Trump Hillary Clinton Ted Cruz
## 01 HarmVirtue                 478             179             389      263
## 02 HarmVice                   400             232             488      207
## 03 FairnessVirtue             260             105             377       52
## 04 FairnessVice                52             125              71        5
## 05 IngroupVirtue              928             280             551      258
## 06 IngroupVice                101             184             153      150
## 07 AuthorityVirtue            397             402             259      268
## 08 AuthorityVice               38             152              16       59
## 09 PurityVirtue                61              30              27        8
## 10 PurityVice                  73              36              28        1
## 11 MoralityGeneral            150             298             121       73
So this gives us how many times each candidate used a word from each dimension.
However, we need to make these counts relative to each candidate’s total (we could even normalize by the number of words in the tweets) to fairly compare candidates who tweeted a lot (e.g., Donald Trump) with those who tweeted sparingly (e.g., Ted Cruz).
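quanteda’s dfm_weight() with "relFreq" (used next) does exactly this row normalization. A sketch of the equivalent computation in base R:
counts <- as.matrix(mfDfm)             # candidates x categories
rowSums(counts)                        # total dictionary hits per candidate
relByHand <- counts / rowSums(counts)  # divide each row by its own total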
mfRelDfm <- dfm_weight(mfDfm, "relFreq")
# transpose
as.data.frame(t(mfRelDfm))
##                    Bernie Sanders Donald J. Trump Hillary Clinton     Ted Cruz
## 01 HarmVirtue          0.16269571      0.08848245     0.156854839 0.1956845238
## 02 HarmVice            0.13614704      0.11468117     0.196774194 0.1540178571
## 03 FairnessVirtue      0.08849558      0.05190311     0.152016129 0.0386904762
## 04 FairnessVice        0.01769912      0.06178942     0.028629032 0.0037202381
## 05 IngroupVirtue       0.31586113      0.13840830     0.222177419 0.1919642857
## 06 IngroupVice         0.03437713      0.09095403     0.061693548 0.1116071429
## 07 AuthorityVirtue     0.13512594      0.19871478     0.104435484 0.1994047619
## 08 AuthorityVice       0.01293397      0.07513594     0.006451613 0.0438988095
## 09 PurityVirtue        0.02076242      0.01482946     0.010887097 0.0059523810
## 10 PurityVice          0.02484683      0.01779535     0.011290323 0.0007440476
## 11 MoralityGeneral     0.05105514      0.14730598     0.048790323 0.0543154762
mfDf <- as.data.frame(mfRelDfm)
#install.packages("radarchart")
library(radarchart)
# drop the numeric prefix (e.g., "01 ") from the category names
labels <- substr(colnames(mfDf), 4, 40)
scores <- list(
  "Bernie Sanders"  = as.numeric(mfDf[1, ]),
  "Donald J. Trump" = as.numeric(mfDf[2, ]),
  "Hillary Clinton" = as.numeric(mfDf[3, ]),
  "Ted Cruz"        = as.numeric(mfDf[4, ])
)
chartJSRadar(scores = scores,
             labs = labels,
             maxScale = 0.3)
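chartJSRadar() returns an htmlwidget, so the interactive chart can also be saved as a standalone HTML file (a sketch; the output file name is just an example):
library(htmlwidgets)
radar <- chartJSRadar(scores = scores, labs = labels, maxScale = 0.3)
saveWidget(radar, "mf_radar.html")  # example file name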
Now let’s try a second dictionary: WordStat’s sentiment dictionary.
wordStatFile <- "../data/dictionaries/WordStatSentiments.CAT"
#wordStatFile <- "~/Dropbox (UNC Charlotte)/summer-2017-social-media-workshop/data/dictionaries/WordStatSentiments.CAT"
wordStat <- dictionary(file = wordStatFile,
                       format = "wordstat")
Let’s explore the dictionary using listviewer, a handy HTMLWidget tool.
#install.packages("listviewer")
library(listviewer)
listviewer::jsonedit_gadget(wordStat)
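For a quick non-interactive peek at the same structure, ordinary list tools work as well (a sketch):
# Top-level categories in the WordStat dictionary
names(wordStat)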
# use a name other than "dfm" so we don't shadow quanteda's dfm() function
wsDfm <- dfm(corpus,
             dictionary = wordStat,
             groups = "candidate")
topfeatures(wsDfm)
## TO_BE_IGNORED.POSITIVE WORDS TO_BE_IGNORED.NEGATIVE WORDS 
##                        37136                        22525 
##      TO_BE_IGNORED.NEGATIONS 
##                         5474
dfmRel <- dfm_weight(wsDfm, "relFreq")
negative <- as.vector(dfmRel[, "TO_BE_IGNORED.NEGATIVE WORDS"])
names(negative) <- docnames(dfmRel)
dotchart(sort(negative),
         xlab = "WordStat \"Negative\" terms used as a proportion of all terms",
         pch = 19, xlim = c(0, 0.4))
positive <- as.vector(dfmRel[, "TO_BE_IGNORED.POSITIVE WORDS"])
names(positive) <- docnames(dfmRel)
dotchart(sort(positive),
         xlab = "WordStat \"Positive\" terms used as a proportion of all terms",
         pch = 19, xlim = c(0, 0.7))
negations <- as.vector(dfmRel[, "TO_BE_IGNORED.NEGATIONS"])
names(negations) <- docnames(dfmRel)
dotchart(sort(negations),
         xlab = "WordStat \"Negation\" terms used as a proportion of all terms",
         pch = 19, xlim = c(0, 0.15))
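Rather than reading three separate dot charts, you can also compare all three proportions at once (a sketch):
# All three WordStat proportions side by side, one row per candidate
round(as.matrix(dfmRel), 3)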