First, let’s load the data.
library(tidyverse)
file <- "../data/pres_tweets.csv"
#file <- "~/Dropbox (UNC Charlotte)/summer-2017-social-media-workshop/data/pres_tweets.csv"
tweets <- read_csv(file)
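A quick look at the fields we will use below (displayName, estTime, and body); glimpse() comes with dplyr, which tidyverse loads:
# Peek at just the columns used in this section
glimpse(select(tweets, displayName, estTime, body))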
This dataset includes about 19,000 tweets by the four major 2016 presidential candidates. It excludes retweets (i.e., it keeps only records where verb == “posts”).
table(tweets$displayName)
## 
##  Bernie Sanders Donald J. Trump Hillary Clinton        Ted Cruz 
##            4844            6402            4236            3621
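The same per-candidate counts in tidyverse style, if you prefer (a sketch):
tweets %>% count(displayName)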
Let’s use quanteda to analyze the tweet text (body).
library(quanteda)
corpus <- corpus(tweets$body,
                 docvars = data.frame(candidate = tweets$displayName,
                                      postedTime = tweets$estTime))
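To sanity-check the corpus, quanteda’s summary() reports token counts alongside the attached docvars; for the first few documents (a sketch):
# Token/type counts plus docvars for the first five tweets
summary(corpus, n = 5)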
This code counts, for each tweet, how many of its words fall into each Moral Foundations category.
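Before running the real dictionary, here is a minimal sketch of how quanteda’s dictionary lookup behaves. The category names and word patterns below are made up for illustration; they are not from the Moral Foundations Dictionary:
# Toy dictionary: glob patterns like "protect*" match word prefixes
toyDict <- dictionary(list(virtue = c("care", "protect*"),
                           vice   = c("harm", "hurt*")))
# Each matching word adds one count to its category
dfm(corpus(c("we must protect and care for all",
             "they did real harm")),
    dictionary = toyDict)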
# Moral Foundations Dictionary (LIWC format)
mfFile <- "../data/dictionaries/moral-foundations-dictionary.dic"
#mfFile <- "~/Dropbox (UNC Charlotte)/summer-2017-social-media-workshop/data/dictionaries/moral-foundations-dictionary.dic"
mfdict <- dictionary(file = mfFile, format = "LIWC")
mfDfm <- dfm(corpus,
             dictionary = mfdict)
as.data.frame(mfDfm[1:10, ])  # counts for the first ten tweets
##        01 HarmVirtue 02 HarmVice 03 FairnessVirtue 04 FairnessVice 05 IngroupVirtue 06 IngroupVice
## text1              0           0                 0               0                0              0
## text2              0           0                 0               0                1              0
## text3              0           0                 0               0                0              0
## text4              0           0                 2               0                0              0
## text5              0           0                 0               0                0              0
## text6              1           1                 0               0                0              0
## text7              0           0                 0               0                1              0
## text8              0           0                 0               0                0              0
## text9              0           0                 0               0                0              0
## text10             0           0                 0               0                0              0
##        07 AuthorityVirtue 08 AuthorityVice 09 PurityVirtue 10 PurityVice 11 MoralityGeneral
## text1                   0                0               0             0                  0
## text2                   0                0               0             0                  0
## text3                   0                0               0             0                  0
## text4                   0                0               0             0                  0
## text5                   0                0               0             0                  0
## text6                   0                0               0             0                  0
## text7                   0                0               0             0                  0
## text8                   0                0               0             0                  0
## text9                   0                0               0             0                  0
## text10                  1                1               0             0                  0
What’s a problem with this?
Sparsity! Most tweets match only one or two dictionary features, if any.
This is typically a big problem for dictionary-based methods, especially with short texts like tweets.
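To quantify the sparsity, a quick sketch (rowSums() works here because a dfm is a sparse matrix underneath):
# Proportion of tweets that match no dictionary entry at all
mean(rowSums(mfDfm) == 0)
One fix is to aggregate: group the tweets by candidate before counting.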
mfDfm <- dfm(corpus,
             groups = "candidate",
             dictionary = mfdict)
# transpose
as.data.frame(t(mfDfm))
##                    Bernie Sanders Donald J. Trump Hillary Clinton Ted Cruz
## 01 HarmVirtue                 478             179             389      263
## 02 HarmVice                   400             232             488      207
## 03 FairnessVirtue             260             105             377       52
## 04 FairnessVice                52             125              71        5
## 05 IngroupVirtue              928             280             551      258
## 06 IngroupVice                101             184             153      150
## 07 AuthorityVirtue            397             402             259      268
## 08 AuthorityVice               38             152              16       59
## 09 PurityVirtue                61              30              27        8
## 10 PurityVice                  73              36              28        1
## 11 MoralityGeneral            150             298             121       73
So this gives us how many times each candidate used a word from each dimension.
However, we need to make these counts relative to each candidate’s total (we could even normalize by the number of words in the tweets) to fairly compare candidates who tweeted a lot (e.g., Donald Trump) with those who tweeted sparingly (e.g., Ted Cruz).
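quanteda’s dfm_weight() with "relFreq" (used next) does exactly this row normalization. A sketch of the equivalent computation in base R:
counts <- as.matrix(mfDfm)             # candidates x categories
rowSums(counts)                        # total dictionary hits per candidate
relByHand <- counts / rowSums(counts)  # divide each row by its own total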
mfRelDfm <- dfm_weight(mfDfm, "relFreq")
# transpose
as.data.frame(t(mfRelDfm))
##                    Bernie Sanders Donald J. Trump Hillary Clinton     Ted Cruz
## 01 HarmVirtue          0.16269571      0.08848245     0.156854839 0.1956845238
## 02 HarmVice            0.13614704      0.11468117     0.196774194 0.1540178571
## 03 FairnessVirtue      0.08849558      0.05190311     0.152016129 0.0386904762
## 04 FairnessVice        0.01769912      0.06178942     0.028629032 0.0037202381
## 05 IngroupVirtue       0.31586113      0.13840830     0.222177419 0.1919642857
## 06 IngroupVice         0.03437713      0.09095403     0.061693548 0.1116071429
## 07 AuthorityVirtue     0.13512594      0.19871478     0.104435484 0.1994047619
## 08 AuthorityVice       0.01293397      0.07513594     0.006451613 0.0438988095
## 09 PurityVirtue        0.02076242      0.01482946     0.010887097 0.0059523810
## 10 PurityVice          0.02484683      0.01779535     0.011290323 0.0007440476
## 11 MoralityGeneral     0.05105514      0.14730598     0.048790323 0.0543154762
mfDf <- as.data.frame(mfRelDfm)
#install.packages("radarchart")
library(radarchart)
# drop the numeric prefix (e.g., "01 ") from the category names
labels <- substr(colnames(mfDf), 4, 40)
scores <- list(
  "Bernie Sanders"  = as.numeric(mfDf[1, ]),
  "Donald J. Trump" = as.numeric(mfDf[2, ]),
  "Hillary Clinton" = as.numeric(mfDf[3, ]),
  "Ted Cruz"        = as.numeric(mfDf[4, ])
)
chartJSRadar(scores = scores,
             labs = labels,
             maxScale = 0.3)
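chartJSRadar() returns an htmlwidget, so the interactive chart can also be saved as a standalone HTML file (a sketch; the output file name is just an example):
library(htmlwidgets)
radar <- chartJSRadar(scores = scores, labs = labels, maxScale = 0.3)
saveWidget(radar, "mf_radar.html")  # example file name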
Now let’s try a second dictionary: WordStat’s sentiment dictionary.
wordStatFile <- "../data/dictionaries/WordStatSentiments.CAT"
#wordStatFile <- "~/Dropbox (UNC Charlotte)/summer-2017-social-media-workshop/data/dictionaries/WordStatSentiments.CAT"
wordStat <- dictionary(file = wordStatFile,
                       format = "wordstat")
Let’s explore the dictionary using listviewer, a handy HTMLWidget tool.
#install.packages("listviewer")
library(listviewer)
listviewer::jsonedit_gadget(wordStat)
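For a quick non-interactive peek at the same structure, ordinary list tools work as well (a sketch):
# Top-level categories in the WordStat dictionary
names(wordStat)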
# use a name other than "dfm" so we don't shadow quanteda's dfm() function
wsDfm <- dfm(corpus,
             dictionary = wordStat,
             groups = "candidate")
topfeatures(wsDfm)
## TO_BE_IGNORED.POSITIVE WORDS TO_BE_IGNORED.NEGATIVE WORDS 
##                        37136                        22525 
##      TO_BE_IGNORED.NEGATIONS 
##                         5474
dfmRel <- dfm_weight(wsDfm, "relFreq")
negative <- as.vector(dfmRel[, "TO_BE_IGNORED.NEGATIVE WORDS"])
names(negative) <- docnames(dfmRel)
dotchart(sort(negative),
         xlab = "WordStat \"Negative\" terms used as a proportion of all terms",
         pch = 19, xlim = c(0, 0.4))
positive <- as.vector(dfmRel[, "TO_BE_IGNORED.POSITIVE WORDS"])
names(positive) <- docnames(dfmRel)
dotchart(sort(positive),
         xlab = "WordStat \"Positive\" terms used as a proportion of all terms",
         pch = 19, xlim = c(0, 0.7))
negations <- as.vector(dfmRel[, "TO_BE_IGNORED.NEGATIONS"])
names(negations) <- docnames(dfmRel)
dotchart(sort(negations),
         xlab = "WordStat \"Negation\" terms used as a proportion of all terms",
         pch = 19, xlim = c(0, 0.15))
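Rather than reading three separate dot charts, you can also compare all three proportions at once (a sketch):
# All three WordStat proportions side by side, one row per candidate
round(as.matrix(dfmRel), 3)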