In this section we will explore some text analysis and analysis of metadata from a corpus of tweets retrieved from the Twitter API. The tweets are a small sample from a collection of tweets relating to the European Parliament elections of 2014.
Load the data frame containing the sample tweets:
# Attach quanteda for corpus construction and document-feature matrices.
# library() errors immediately if the package is missing, whereas require()
# only returns FALSE — library() is the right call for a hard dependency.
library(quanteda)
## Loading required package: quanteda
## quanteda version 0.9.8.5
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:base':
##
## sample
# Load the pre-built sample of EP-election tweets into the workspace
# (creates the `tweetSample` data frame; see str() output below for its
# 35 columns of tweet text, user metadata, and candidate-match fields).
load("tweetSample.RData")
# Inspect the structure: column names, types, and example values.
str(tweetSample)
## 'data.frame': 10000 obs. of 35 variables:
## $ created_at : chr "2014-05-28 15:53:33+00:00" "2014-05-30 08:32:13+00:00" "2014-05-29 19:22:15+00:00" "2014-05-03 20:23:43+00:00" ...
## $ geo_latitude : num NA NA NA NA NA NA NA NA NA NA ...
## $ geo_longitude : num NA NA NA NA NA NA NA NA NA NA ...
## $ hashtags : chr "['Pomeriggio5', 'Canale5']" NA NA NA ...
## $ id : num 4.72e+17 4.72e+17 4.72e+17 4.63e+17 4.71e+17 ...
## $ lang : Factor w/ 43 levels "Arabic","Basque",..: 20 35 35 15 30 12 33 9 35 35 ...
## $ text : chr "Oggi pomeriggio, a partire dalle 18.00, interverrò a #Pomeriggio5 su #Canale5 http://t.co/aqB64fH4et ST" ".@pacomarhuenda llamando El Coletas a @Pablo_Iglesias_... precisamente, si hay alguien que tiene que callarse sobre peinados, e"| __truncated__ "Las declaraciones de Felipe Gonzalez hoy hablan por sí solas http://t.co/0LJo6zAXdc" "@KOPRITHS @GAPATZHS @MariaSpyraki και εκεί που λες εχουν πιάσει πάτο, θα καταφέρουν να σε διαψεύσουν." ...
## $ type : Factor w/ 3 levels "reply","retweet",..: 2 3 2 2 3 2 2 2 2 2 ...
## $ user_followers_count: int 769 303 470 470 3662 470 67 124 1359 181 ...
## $ user_friends_count : int 557 789 419 647 793 910 36 90 793 258 ...
## $ user_geo_enabled : Factor w/ 2 levels "False","True": 1 1 2 1 2 1 2 1 1 2 ...
## $ user_id : num 8.40e+07 2.75e+08 4.61e+08 2.43e+09 1.62e+08 ...
## $ user_id_str : num 8.40e+07 2.75e+08 4.61e+08 2.43e+09 1.62e+08 ...
## $ user_lang : Factor w/ 40 levels "Arabic","Basque",..: 10 34 34 16 4 13 21 10 4 34 ...
## $ user_listed_count : int 6 13 1 1 133 4 0 3 31 7 ...
## $ user_location : chr NA "Sanfer of Henares" "La Puebla ciry" NA ...
## $ user_name : chr "Francesco Filini" "Carlos Marina" "Gabi Armario Cívico" "ΤΗΛΕΠΛΑΣΙΕ" ...
## $ user_screen_name : chr "FrancescoFilini" "marina_carlos" "erpartecama" "THLEPLASHIE" ...
## $ user_statuses_count : int 1880 7051 6776 666 19006 30239 1563 601 37237 2313 ...
## $ user_time_zone : chr "Amsterdam" "Madrid" "Athens" NA ...
## $ user_url : chr "http://rapportoaureo.wordpress.com" "http://carlosmarina.com" "http://www.cazuelaalamorisca.com" NA ...
## $ user_created_at : chr "Wed, 21 Oct 2009 08:59:58 +0000" "2011-03-30 13:07:21+00:00" "Tue, 10 Jan 2012 23:23:18 +0000" "Mon, 07 Apr 2014 10:59:39 +0000" ...
## $ user_geo_enabled.1 : Factor w/ 2 levels "False","True": 1 1 2 1 2 1 2 1 1 2 ...
## $ user_screen_nameL : chr "francescofilini" "marina_carlos" "erpartecama" "thleplashie" ...
## $ Party : chr NA NA NA NA ...
## $ Party.Code : num NA NA NA NA NA NA NA NA NA NA ...
## $ Sitting_2009 : Factor w/ 2 levels "Non-incumbent",..: NA NA NA NA NA NA NA NA NA NA ...
## $ Sitting_2014 : Factor w/ 2 levels "Non-incumbent",..: NA NA NA NA NA NA NA NA NA NA ...
## $ Name : chr NA NA NA NA ...
## $ Twitter : chr NA NA NA NA ...
## $ Facebook : chr NA NA NA NA ...
## $ gender : Factor w/ 2 levels "Female","Male": NA NA NA NA NA NA NA NA NA NA ...
## $ Country : Factor w/ 27 levels "Austria","Belgium",..: NA NA NA NA NA NA NA NA NA NA ...
## $ hasTwitter : Factor w/ 2 levels "No","Yes": NA NA NA NA NA NA NA NA NA NA ...
## $ candidate : Factor w/ 2 levels "candidate","non-candidate": NA NA NA NA NA NA NA NA NA NA ...
# Attach lubridate for date parsing (yday() below).
# library() errors if the package is missing, unlike require().
library(lubridate)
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
# Attach dplyr for the mutate/filter/count pipeline below.
# library() errors if the package is missing, unlike require().
library(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Derive a day-of-year and a calendar date for each tweet from its
# "created_at" timestamp (tweets are all from 2014, per the data above).
tweetSample <- mutate(tweetSample, day = yday(created_at))
tweetSample <- mutate(tweetSample, dayDate = as.Date(day - 1, origin = "2014-01-01"))

# Helper: tweets whose text matches `pattern` (case-insensitive),
# tagged with the candidate label `label` in a new `kand` column.
# Replaces three copy-pasted filter/mutate blocks.
candidate_mentions <- function(pattern, label) {
  filter(tweetSample, grepl(pattern, text, ignore.case = TRUE)) %>%
    mutate(kand = label)
}

juncker <- candidate_mentions("juncker", "Juncker")
schulz <- candidate_mentions("schulz", "Schulz")
verhof <- candidate_mentions("verhofstadt", "Verhofstadt")

# Stack the three per-candidate subsets for plotting by `kand`.
spitzAll <- bind_rows(juncker, schulz, verhof)
Once the data is in the correct format, we can use ggplot to display the candidate mentions on a single plot:
# Attach ggplot2 for plotting.
# library() errors if the package is missing, unlike require().
library(ggplot2)
## Loading required package: ggplot2
# Attach scales for the comma axis-label formatter used below.
# library() errors if the package is missing, unlike require().
library(scales)
## Loading required package: scales
# Count candidate-name mentions per day, converting day-of-year back to a
# calendar date for the x-axis.
plotDf <- count(spitzAll, kand, day=day) %>% mutate(day=as.Date(day-1, origin = "2014-01-01"))
# Line plot of daily mention counts, one colour per candidate. The dashed
# vertical lines mark 15 and 25 May 2014 — the latter is election day;
# NOTE(review): presumably the former marks the televised debate, confirm.
ggplot(data=plotDf, aes(x=day, y=n, colour=kand)) +
geom_line(size=1) +
scale_y_continuous(labels = comma) + geom_vline(xintercept=as.numeric(as.Date("2014-05-15")), linetype=4) +
geom_vline(xintercept=as.numeric(as.Date("2014-05-25")), linetype=4) +
theme(axis.text=element_text(size=12),
axis.title=element_text(size=14,face="bold"))
We can use the keptFeatures argument to dfm() to analyse only the hashtags in each candidate’s tweets.
# Top hashtags for tweets that mention Juncker.
# Keep the author's screen name as a document variable on the corpus.
dv <- data.frame(user = juncker$user_screen_name)
jCorp <- corpus(juncker$text, docvars = dv)
# Document-feature matrix: one row per tweet, one column per token.
jd <- dfm(jCorp)
## Creating a dfm from a corpus ...
##
## ... lowercasing
##
## ... tokenizing
##
## ... indexing documents: 31 documents
##
## ... indexing features:
## 364 feature types
##
## ... created a 31 x 364 sparse dfm
## ... complete.
## Elapsed time: 0.041 seconds.
# Keep only features beginning with "#", i.e. hashtags.
jd <- selectFeatures(jd, "^#.*", "keep", valuetype = "regex")
## kept 20 features, from 1 supplied (regex) feature types
# equivalent: jd <- selectFeatures(jd, "#*", "keep", valuetype = "glob")
# List every remaining (hashtag) feature with its frequency, most
# frequent first.
topfeatures(jd, nfeature(jd))
## #withjuncker #telleurope #afd #tvduell #nowschulz
## 5 3 2 2 1
## #rt #riga #teammartens #caraacaratve #ppe
## 1 1 1 1 1
## #votapp #votacanete #publicviewing #tvduell's #euhaus
## 1 1 1 1 1
## #europawahl #berlin #linke #merkel #gabriel
## 1 1 1 1 1
Wordscores:
# Load the amicus curiae briefs corpus from the quantedaData package.
data(amicusCorpus, package = "quantedaData")
# Reference scores: recode the two-level "trainclass" factor (numeric 1/2)
# to -1/+1; documents with NA trainclass are the virgin texts to be scored
# (see the summary below: sP*/sR* get -1/+1, sA* are NA).
refs <- docvars(amicusCorpus, "trainclass")
refs <- (as.numeric(refs) - 1.5)*2
amicusDfm <- dfm(amicusCorpus, verbose = FALSE)
# Fit a wordscores model on the reference documents.
wm <- textmodel(amicusDfm, y = refs, model = "wordscores")
summary(wm)
## Call:
## textmodel_wordscores(data = x, scores = y)
##
## Reference Document Statistics:
## (ref scores and feature count statistics)
##
## Score Total Min Max Mean Median
## sP1.txt -1 10691 0 768 0.57027791 0
## sP2.txt -1 12228 0 848 0.65226436 0
## sR1.txt 1 13062 0 1040 0.69675148 0
## sR2.txt 1 10952 0 662 0.58420014 0
## sAP01.txt NA 6175 0 335 0.32938604 0
## sAP02.txt NA 6282 0 380 0.33509361 0
## sAP03.txt NA 7772 0 484 0.41457300 0
## sAP04.txt NA 4751 0 304 0.25342722 0
## sAP05.txt NA 7013 0 409 0.37408652 0
## sAP06.txt NA 4888 0 253 0.26073505 0
## sAP07.txt NA 4881 0 360 0.26036166 0
## sAP08.txt NA 1756 0 121 0.09366832 0
## sAP09.txt NA 4304 0 226 0.22958340 0
## sAP10.txt NA 4101 0 233 0.21875500 0
## sAP11.txt NA 5830 0 368 0.31098309 0
## sAP12.txt NA 6916 0 309 0.36891236 0
## sAP13.txt NA 5425 0 375 0.28937963 0
## sAP14.txt NA 5601 0 286 0.29876780 0
## sAP15.txt NA 6194 0 402 0.33039953 0
## sAP16.txt NA 3592 0 241 0.19160399 0
## sAP17.txt NA 4996 0 303 0.26649597 0
## sAP18.txt NA 4156 0 236 0.22168880 0
## sAP19.txt NA 3354 0 201 0.17890863 0
## sAR01.txt NA 1995 0 105 0.10641703 0
## sAR02.txt NA 7225 0 431 0.38539500 0
## sAR03.txt NA 7351 0 501 0.39211607 0
## sAR04.txt NA 7741 0 479 0.41291940 0
## sAR05.txt NA 7002 0 533 0.37349976 0
## sAR06.txt NA 8245 0 442 0.43980370 0
## sAR07.txt NA 8601 0 451 0.45879341 0
## sAR08.txt NA 6427 0 262 0.34282819 0
## sAR09.txt NA 5217 0 310 0.27828453 0
## sAR10.txt NA 7896 0 432 0.42118739 0
## sAR11.txt NA 8372 0 461 0.44657812 0
## sAR12.txt NA 4024 0 197 0.21464768 0
## sAR13.txt NA 7917 0 409 0.42230757 0
## sAR14.txt NA 5896 0 326 0.31450365 0
## sAR15.txt NA 8200 0 532 0.43740332 0
## sAR16.txt NA 6577 0 402 0.35082947 0
## sAR17.txt NA 7174 0 377 0.38267456 0
## sAR18.txt NA 7414 0 532 0.39547661 0
## sAR19.txt NA 5015 0 317 0.26750947 0
## sAR20.txt NA 7969 0 416 0.42508135 0
## sAR21.txt NA 4602 0 276 0.24547928 0
## sAR22.txt NA 7152 0 483 0.38150104 0
## sAR23.txt NA 6594 0 381 0.35173628 0
## sAR24.txt NA 7827 0 400 0.41750680 0
## sAR25.txt NA 6746 0 361 0.35984424 0
## sAR26.txt NA 7540 0 378 0.40219768 0
## sAR27.txt NA 7853 0 394 0.41889369 0
## sAR28.txt NA 5546 0 335 0.29583400 0
## sAR29.txt NA 7770 0 566 0.41446631 0
## sAR30.txt NA 6114 0 471 0.32613218 0
## sAR31.txt NA 6763 0 325 0.36075105 0
## sAR32.txt NA 6802 0 380 0.36283139 0
## sAR33.txt NA 8213 0 414 0.43809676 0
## sAR34.txt NA 6394 0 359 0.34106790 0
## sAR35.txt NA 6826 0 301 0.36411159 0
## sAR36.txt NA 2148 0 121 0.11457833 0
## sAR37.txt NA 6379 0 414 0.34026778 0
## sAR38.txt NA 6320 0 261 0.33712061 0
## sAR39.txt NA 6975 0 453 0.37205953 0
## sAR40.txt NA 7996 0 490 0.42652158 0
## sAR41.txt NA 6911 0 423 0.36864565 0
## sAR42.txt NA 2233 0 175 0.11911239 0
## sAR43.txt NA 6790 0 274 0.36219128 0
## sAR44.txt NA 9148 0 555 0.48797141 0
## sAR45.txt NA 7267 0 327 0.38763535 0
## sAR46.txt NA 8224 0 472 0.43868352 0
## sAR47.txt NA 1885 0 97 0.10054942 0
## sAR48.txt NA 4276 0 264 0.22808983 0
## sAR49.txt NA 7090 0 410 0.37819384 0
## sAR50.txt NA 6587 0 356 0.35136288 0
## sAR51.txt NA 3272 0 230 0.17453459 0
## sAR52.txt NA 7507 0 440 0.40043740 0
## sAR53.txt NA 4176 0 293 0.22275564 0
## sAR54.txt NA 7838 0 424 0.41809356 0
## sAR55.txt NA 218 0 17 0.01162853 0
## sAR56.txt NA 7410 0 351 0.39526324 0
## sAR58.txt NA 3687 0 312 0.19667147 0
## sAR59.txt NA 5189 0 338 0.27679095 0
## sAR60.txt NA 6144 0 269 0.32773244 0
## sAR61.txt NA 5286 0 418 0.28196511 0
## sAR62.txt NA 3734 0 173 0.19917854 0
## sAR63.txt NA 4487 0 284 0.23934496 0
## sAR64.txt NA 8478 0 494 0.45223236 0
## sAR65.txt NA 8543 0 498 0.45569958 0
## sAR66.txt NA 6438 0 354 0.34341495 0
## sAR67.txt NA 4766 0 308 0.25422734 0
## sAR68.txt NA 6045 0 368 0.32245159 0
## sAR71.txt NA 7344 0 445 0.39174268 0
## sAR72.txt NA 5809 0 294 0.30986291 0
## sAR73.txt NA 6732 0 384 0.35909746 0
## sAR74.txt NA 2773 0 162 0.14791700 0
## sAR75.txt NA 5664 0 395 0.30212834 0
## sAR76.txt NA 4748 0 211 0.25326719 0
## sAR77.txt NA 4356 0 307 0.23235718 0
## sAR78.txt NA 2450 0 152 0.13068758 0
## sAR79.txt NA 4270 0 207 0.22776978 0
## sAR80.txt NA 5683 0 446 0.30314184 0
## sAR81.txt NA 2240 0 128 0.11948578 0
## sAR83.txt NA 4836 0 348 0.25796127 0
# Score every document (reference and virgin) with the fitted model;
# only features seen in the reference texts can be scored.
preds <- predict(wm, newdata = amicusDfm)
## 5193 of 18747 features (27.7%) can be scored
summary(preds)
## Predicted textmodel of type: wordscores
##
## textscore LBG se ci lo ci hi
## sP1.txt -0.2353 0.0037 -0.2426 -0.2280
## sP2.txt -0.2153 0.0035 -0.2222 -0.2083
## sR1.txt 0.2626 0.0039 0.2549 0.2702
## sR2.txt 0.1880 0.0042 0.1798 0.1963
## sAP01.txt -0.0705 0.0058 -0.0818 -0.0592
## sAP02.txt 0.0073 0.0058 -0.0041 0.0187
## sAP03.txt 0.0034 0.0053 -0.0071 0.0138
## sAP04.txt -0.1008 0.0062 -0.1130 -0.0887
## sAP05.txt 0.0086 0.0057 -0.0025 0.0198
## sAP06.txt -0.0741 0.0064 -0.0867 -0.0615
## sAP07.txt -0.0093 0.0062 -0.0214 0.0027
## sAP08.txt -0.0053 0.0100 -0.0249 0.0142
## sAP09.txt 0.0616 0.0075 0.0469 0.0763
## sAP10.txt 0.0499 0.0079 0.0345 0.0654
## sAP11.txt -0.0129 0.0064 -0.0254 -0.0004
## sAP12.txt 0.0008 0.0062 -0.0113 0.0129
## sAP13.txt -0.0932 0.0058 -0.1045 -0.0819
## sAP14.txt -0.0457 0.0062 -0.0579 -0.0336
## sAP15.txt -0.0025 0.0059 -0.0141 0.0091
## sAP16.txt -0.0941 0.0072 -0.1083 -0.0799
## sAP17.txt 0.0786 0.0068 0.0653 0.0919
## sAP18.txt 0.0353 0.0081 0.0193 0.0512
## sAP19.txt -0.0810 0.0076 -0.0959 -0.0661
## sAR01.txt 0.0983 0.0125 0.0738 0.1228
## sAR02.txt -0.0186 0.0054 -0.0292 -0.0080
## sAR03.txt 0.1097 0.0054 0.0990 0.1203
## sAR04.txt 0.1215 0.0057 0.1104 0.1326
## sAR05.txt 0.0250 0.0057 0.0138 0.0362
## sAR06.txt 0.1239 0.0056 0.1131 0.1348
## sAR07.txt 0.0768 0.0050 0.0670 0.0867
## sAR08.txt 0.1107 0.0061 0.0988 0.1227
## sAR09.txt 0.0848 0.0067 0.0716 0.0980
## sAR10.txt 0.1105 0.0055 0.0997 0.1214
## sAR11.txt 0.0340 0.0049 0.0244 0.0436
## sAR12.txt 0.0840 0.0077 0.0690 0.0991
## sAR13.txt 0.1136 0.0058 0.1021 0.1250
## sAR14.txt 0.1039 0.0064 0.0914 0.1164
## sAR15.txt 0.1283 0.0053 0.1179 0.1388
## sAR16.txt 0.0799 0.0060 0.0682 0.0917
## sAR17.txt 0.0325 0.0054 0.0218 0.0431
## sAR18.txt 0.1149 0.0060 0.1031 0.1266
## sAR19.txt 0.0104 0.0062 -0.0017 0.0225
## sAR20.txt 0.0468 0.0055 0.0361 0.0576
## sAR21.txt 0.0392 0.0069 0.0257 0.0526
## sAR22.txt 0.1198 0.0058 0.1084 0.1311
## sAR23.txt 0.1258 0.0061 0.1138 0.1379
## sAR24.txt 0.1759 0.0053 0.1655 0.1862
## sAR25.txt 0.1383 0.0061 0.1264 0.1503
## sAR26.txt 0.1486 0.0057 0.1374 0.1597
## sAR27.txt 0.1476 0.0056 0.1366 0.1586
## sAR28.txt 0.0954 0.0062 0.0831 0.1076
## sAR29.txt 0.0773 0.0053 0.0669 0.0878
## sAR30.txt 0.1221 0.0064 0.1096 0.1346
## sAR31.txt 0.1262 0.0067 0.1131 0.1393
## sAR32.txt 0.0451 0.0063 0.0328 0.0575
## sAR33.txt 0.0324 0.0051 0.0225 0.0423
## sAR34.txt 0.1164 0.0060 0.1046 0.1282
## sAR35.txt 0.1138 0.0063 0.1015 0.1260
## sAR36.txt 0.1553 0.0102 0.1353 0.1753
## sAR37.txt -0.0149 0.0057 -0.0260 -0.0038
## sAR38.txt 0.1328 0.0065 0.1201 0.1454
## sAR39.txt 0.1094 0.0057 0.0981 0.1206
## sAR40.txt 0.0929 0.0054 0.0822 0.1035
## sAR41.txt 0.0425 0.0056 0.0316 0.0535
## sAR42.txt 0.0331 0.0090 0.0154 0.0508
## sAR43.txt 0.1828 0.0062 0.1706 0.1950
## sAR44.txt 0.1536 0.0052 0.1434 0.1638
## sAR45.txt 0.0627 0.0058 0.0513 0.0741
## sAR46.txt 0.1252 0.0055 0.1145 0.1359
## sAR47.txt 0.0612 0.0102 0.0411 0.0812
## sAR48.txt 0.1447 0.0073 0.1304 0.1591
## sAR49.txt 0.0950 0.0059 0.0834 0.1065
## sAR50.txt 0.0785 0.0058 0.0671 0.0900
## sAR51.txt 0.1559 0.0086 0.1391 0.1727
## sAR52.txt 0.1102 0.0058 0.0988 0.1215
## sAR53.txt 0.0245 0.0070 0.0107 0.0382
## sAR54.txt 0.1114 0.0057 0.1001 0.1226
## sAR55.txt 0.0943 0.0297 0.0360 0.1526
## sAR56.txt 0.0641 0.0057 0.0529 0.0753
## sAR58.txt 0.0484 0.0074 0.0339 0.0629
## sAR59.txt 0.0874 0.0067 0.0742 0.1005
## sAR60.txt 0.1181 0.0064 0.1056 0.1305
## sAR61.txt 0.0617 0.0067 0.0486 0.0748
## sAR62.txt 0.0936 0.0077 0.0785 0.1086
## sAR63.txt 0.1016 0.0073 0.0873 0.1158
## sAR64.txt 0.0902 0.0052 0.0801 0.1003
## sAR65.txt 0.1053 0.0050 0.0955 0.1152
## sAR66.txt 0.0430 0.0058 0.0317 0.0544
## sAR67.txt 0.1154 0.0070 0.1017 0.1290
## sAR68.txt 0.0521 0.0059 0.0406 0.0636
## sAR71.txt 0.0727 0.0054 0.0620 0.0833
## sAR72.txt 0.1356 0.0065 0.1228 0.1483
## sAR73.txt 0.1098 0.0059 0.0983 0.1212
## sAR74.txt 0.0069 0.0081 -0.0090 0.0228
## sAR75.txt 0.1129 0.0065 0.1000 0.1257
## sAR76.txt 0.1134 0.0072 0.0993 0.1276
## sAR77.txt 0.1027 0.0069 0.0890 0.1163
## sAR78.txt 0.0993 0.0093 0.0811 0.1175
## sAR79.txt -0.0353 0.0067 -0.0485 -0.0221
## sAR80.txt 0.1316 0.0066 0.1186 0.1446
## sAR81.txt 0.0923 0.0101 0.0725 0.1122
## sAR83.txt 0.0317 0.0062 0.0196 0.0438
# Boxplot of raw predicted text scores grouped by the held-out "testclass"
# docvar, to check that predicted scores separate the known classes.
plot(preds@textscores$textscore_raw ~ docvars(amicusCorpus, "testclass"),
horizontal = TRUE, xlab = "Predicted document score",
ylab = "Test class", las = 1)
Correspondence analysis:
# Correspondence analysis of the 2010 Irish budget-debate speeches.
ieDfm <- dfm(ie2010Corpus, verbose = FALSE)
ieCA <- textmodel(ieDfm, model = "ca")
# The ca package supplies the plot method for CA objects.
# library() errors if the package is missing, unlike require().
library(ca)
plot(ieCA, what = c("all", "none"))
Poisson scaling:
# One-dimensional Poisson scaling ("wordfish") of the same debate dfm;
# estimated document positions (theta) are printed below.
ieWF <- textmodel(ieDfm, model = "wordfish")
summary(ieWF)
## Call:
## textmodel_wordfish(data = x)
##
## Estimated document positions:
## theta SE lower
## 2010_BUDGET_01_Brian_Lenihan_FF -1.77842508 0.02069143 -1.81898027
## 2010_BUDGET_02_Richard_Bruton_FG 0.58436869 0.02939256 0.52675927
## 2010_BUDGET_03_Joan_Burton_LAB 1.14761695 0.01572313 1.11679962
## 2010_BUDGET_04_Arthur_Morgan_SF 0.09400958 0.02935247 0.03647874
## 2010_BUDGET_05_Brian_Cowen_FF -1.79211539 0.02367182 -1.83851217
## 2010_BUDGET_06_Enda_Kenny_FG 0.78894787 0.02604077 0.73790796
## 2010_BUDGET_07_Kieran_ODonnell_FG 0.49306437 0.04357844 0.40765063
## 2010_BUDGET_08_Eamon_Gilmore_LAB 0.58812988 0.03034615 0.52865142
## 2010_BUDGET_09_Michael_Higgins_LAB 0.97901464 0.04107866 0.89850047
## 2010_BUDGET_10_Ruairi_Quinn_LAB 0.92084329 0.04233912 0.83785862
## 2010_BUDGET_11_John_Gormley_Green -1.12261547 0.07547495 -1.27054638
## 2010_BUDGET_12_Eamon_Ryan_Green -0.21004677 0.06449005 -0.33644727
## 2010_BUDGET_13_Ciaran_Cuffe_Green -0.79133534 0.07306007 -0.93453308
## 2010_BUDGET_14_Caoimhghin_OCaolain_SF 0.09854280 0.03908478 0.02193662
## upper
## 2010_BUDGET_01_Brian_Lenihan_FF -1.73786988
## 2010_BUDGET_02_Richard_Bruton_FG 0.64197811
## 2010_BUDGET_03_Joan_Burton_LAB 1.17843427
## 2010_BUDGET_04_Arthur_Morgan_SF 0.15154043
## 2010_BUDGET_05_Brian_Cowen_FF -1.74571862
## 2010_BUDGET_06_Enda_Kenny_FG 0.83998778
## 2010_BUDGET_07_Kieran_ODonnell_FG 0.57847811
## 2010_BUDGET_08_Eamon_Gilmore_LAB 0.64760833
## 2010_BUDGET_09_Michael_Higgins_LAB 1.05952881
## 2010_BUDGET_10_Ruairi_Quinn_LAB 1.00382796
## 2010_BUDGET_11_John_Gormley_Green -0.97468457
## 2010_BUDGET_12_Eamon_Ryan_Green -0.08364627
## 2010_BUDGET_13_Ciaran_Cuffe_Green -0.64813761
## 2010_BUDGET_14_Caoimhghin_OCaolain_SF 0.17514898
# Dot chart of estimated positions (theta), one point per speech,
# labelled with speaker name and party from the corpus docvars.
dotchart(ieWF@theta,
labels = paste(docvars(ie2010Corpus, "name"), docvars(ie2010Corpus, "party")))
Topic models:
# Attach topicmodels for LDA estimation.
# library() errors if the package is missing, unlike require().
library(topicmodels)
## Loading required package: topicmodels
# Post-1950 inaugural addresses, stemmed, with English stopwords plus a
# few very frequent but topic-uninformative terms removed.
mycorpus <- subset(inaugCorpus, Year>1950)
quantdfm <- dfm(mycorpus, verbose=FALSE, stem=TRUE,
ignoredFeatures=c(stopwords('english'),'will','us','nation', 'can','peopl*','americ*'))
##
# Convert to the topicmodels input format and fit a 20-topic LDA
# with a sparse Dirichlet prior (alpha = 0.1).
ldadfm <- convert(quantdfm, to="topicmodels")
lda <- LDA(ldadfm, control = list(alpha = 0.1), k=20)
# Show the ten highest-probability terms for each topic.
terms(lda, 10)
## Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6
## [1,] "new" "world" "freedom" "new" "freedom" "citizen"
## [2,] "generat" "peac" "world" "must" "liberti" "stori"
## [3,] "work" "let" "day" "strength" "ideal" "countri"
## [4,] "everi" "make" "one" "togeth" "everi" "time"
## [5,] "less" "man" "liberti" "world" "citizen" "live"
## [6,] "world" "know" "free" "human" "human" "call"
## [7,] "must" "earth" "unit" "spirit" "hope" "nation"
## [8,] "prosper" "need" "countri" "time" "time" "must"
## [9,] "common" "now" "seen" "nation" "state" "everi"
## [10,] "spirit" "voic" "move" "just" "must" "courag"
## Topic 7 Topic 8 Topic 9 Topic 10 Topic 11 Topic 12 Topic 13
## [1,] "new" "peac" "world" "world" "must" "chang" "govern"
## [2,] "let" "faith" "must" "may" "time" "man" "world"
## [3,] "world" "know" "chang" "nation" "make" "must" "one"
## [4,] "centuri" "man" "today" "seek" "citizen" "union" "freedom"
## [5,] "challeng" "life" "new" "peac" "requir" "world" "time"
## [6,] "power" "come" "let" "freedom" "countri" "great" "must"
## [7,] "live" "shall" "renew" "must" "know" "old" "now"
## [8,] "enough" "live" "work" "hope" "freedom" "believ" "histori"
## [9,] "educ" "free" "time" "help" "equal" "land" "new"
## [10,] "strong" "make" "challeng" "honor" "everi" "everi" "human"
## Topic 14 Topic 15 Topic 16 Topic 17 Topic 18 Topic 19 Topic 20
## [1,] "new" "new" "govern" "free" "word" "let" "let"
## [2,] "centuri" "work" "must" "world" "today" "power" "peac"
## [3,] "time" "world" "believ" "upon" "crisi" "world" "world"
## [4,] "world" "great" "world" "freedom" "earth" "side" "respons"
## [5,] "promis" "thing" "time" "must" "let" "new" "new"
## [6,] "everi" "friend" "one" "faith" "time" "pledg" "great"
## [7,] "must" "good" "freedom" "shall" "moment" "ask" "govern"
## [8,] "land" "day" "work" "strength" "women" "nation" "home"
## [9,] "one" "free" "go" "nation" "nation" "citizen" "year"
## [10,] "govern" "time" "day" "hope" "new" "shall" "make"
# Attach stm for structural topic models.
# library() errors if the package is missing, unlike require().
library(stm)
## Loading required package: stm
## stm v1.1.3 (2016-01-14) successfully loaded. See ?stm for help.
# Movie-review corpus (2,000 documents with a Sentiment docvar, per the
# summary below) from the quantedaData package.
data(moviesCorpus, package = "quantedaData")
# Show summary statistics for the first five documents.
summary(moviesCorpus, 5)
## Corpus consisting of 2000 documents, showing 5 documents.
## Warning in nsentence.character(object, ...): nsentence() does not correctly
## count sentences in all lower-cased text
## Text Types Tokens Sentences Sentiment id1 id2
## neg_cv000_29416 354 847 9 neg cv000 29416
## neg_cv001_19502 156 278 1 neg cv001 19502
## neg_cv002_17424 279 559 3 neg cv002 17424
## neg_cv003_12683 325 594 2 neg cv003 12683
## neg_cv004_12641 385 872 2 neg cv004 12641
##
## Source: /Users/kbenoit/Dropbox/QUANTESS/quantedaData_kenlocal_gh/* on x86_64 by kbenoit
## Created: Sat Nov 15 18:43:25 2014
## Notes:
# dfm of the reviews, dropping the SMART stopword list and stemming.
movies_dfm <- dfm(moviesCorpus, ignoredFeatures = c(stopwords("SMART")), stem = TRUE)
## Creating a dfm from a corpus ...
##
## ... lowercasing
##
## ... tokenizing
##
## ... indexing documents: 2,000 documents
##
## ... indexing features:
## 47,691 feature types
##
## ...
## removed 546 features, from 570 supplied (glob) feature types
## ... stemming features (English)
## ... complete.
# Convert the dfm to stm's native format, carrying the review sentiment
# along as a document-level covariate.
movies_dfm_stm <- convert(movies_dfm, to = "stm",
docvars = data.frame(sentiment = docvars(moviesCorpus, "Sentiment")))
# Fit a 20-topic structural topic model (no topic-covariate interactions).
stm_fittedmodel <- stm(movies_dfm_stm$documents, movies_dfm_stm$vocab, K = 20,
interactions = FALSE, verbose = FALSE)
# Summarise each topic by highest-probability, FREX, lift, and score words.
labelTopics(stm_fittedmodel)
## Topic 1 Top Words:
## Highest Prob: film, movi, black, scene, play, time, cop
## FREX: reev, sweetback, arnold, keanu, taran, cauldron, satan
## Lift: adult-helfgott, anney, azazel, baadasssss, bolton, bubbi, clairvoy
## Score: ___, ____, _____, _does_, _hard_war, _huge_, _in
## Topic 2 Top Words:
## Highest Prob: film, star, movi, war, charact, effect, trek
## FREX: jedi, anakin, obi-wan, insurrect, skywalk, qui-gon, phantom
## Lift: anakin, patlabor, ___, ____, _anything_, _come_, _experience_
## Score: ___, ____, _america's_funniest_home_videos_, _american_psycho_, _anything_, _blade, _come_
## Topic 3 Top Words:
## Highest Prob: film, horror, movi, scream, charact, killer, origin
## FREX: nbsp, slasher, blair, williamson, prinz, scream, horror
## Lift: _bad_, _before_, _dawson's_creek_, _disturbing_behavior_, _fisherman, _halloween, _i_know
## Score: _air_force_one_, _american_psycho_, _and_, _andre_, _babe_, _bad_, _before_
## Topic 4 Top Words:
## Highest Prob: movi, bad, good, time, thing, guy, plot
## FREX: stupid, bad, worst, movi, bunch, guess, idiot
## Lift: _armageddon_, dade, handphon, ketchum, lipnicki, littl, quak
## Score: ___, ____, _____, ______, _air_force_one_, _all_, _am_
## Topic 5 Top Words:
## Highest Prob: film, action, movi, chan, jacki, scene, play
## FREX: damm, fu, kung, chan, hong, kong, li
## Lift: _amadeus_, _angel, _are_, _basquiat_, _blade, _casablanca_, _death
## Score: _amadeus_, _angel, _are_, _basquiat_, _blade, _boom_, _can_
## Topic 6 Top Words:
## Highest Prob: movi, play, film, harri, charact, make, dog
## FREX: shrek, leila, memphi, donkey, farquaad, nomi, palma
## Lift: _beloved_, _onegin_, avi, baxter, becki, bluer, boogieman
## Score: _____, _air_force_one_, _am_, _american_beauty_, _arrrgh_, _beloved_, _breakfast_
## Topic 7 Top Words:
## Highest Prob: film, vampir, movi, play, time, charact, girl
## FREX: vampir, lebowski, kiki, spice, coen, bat, blade
## Lift: adjani, batologist, bettani, bunton, chisolm, dar, dobi
## Score: _____, ______, _american_psycho_, _and_, _blade_, _can't_, _dead_
## Topic 8 Top Words:
## Highest Prob: film, charact, make, stori, good, scene, perform
## FREX: actor, great, flaw, charact, happen, film, perform
## Lift: adandon, anabell, awoken, bassing, betwixt, blanco, bloc
## Score: ___, ____, _____, ______, ____________________________________________, __________________________________________________________, _21_jump_street_
## Topic 9 Top Words:
## Highest Prob: film, world, time, scienc, effect, movi, space
## FREX: dvd, scienc, disc, contact, elli, space, horizon
## Lift: supergirl, 19-month, 50-minut, 800-mile, 85but, absolout, acronym
## Score: _____, ______, ____________________________________________, __________________________________________________________, _air_force_one_, _all_, _amadeus_
## Topic 10 Top Words:
## Highest Prob: love, stori, life, shakespear, movi, charact, play
## FREX: hedwig, shakespear, bulworth, paltrow, sixth, sonni, lumumba
## Lift: jovovich, moulin, ule, __________________________________________________________, _entertainment_weekly_, _polish_wedding_, _schindler
## Score: ___, ____, _____, ______, __________________________________________________________, _american_beauty_, _breakfast_
## Topic 11 Top Words:
## Highest Prob: film, love, charact, life, comedi, movi, friend
## FREX: flynt, wed, julia, luci, jimmi, romant, kate
## Lift: _huge_, _hustler_, _their_, _to, _wayyyyy_, _would_, 10-day
## Score: _____, ______, _air_force_one_, _am_, _america's_funniest_home_videos_, _american_beauty_, _and_
## Topic 12 Top Words:
## Highest Prob: film, movi, funni, comedi, joke, laugh, make
## FREX: mike, cusack, jay, deuc, bob, allen, finn
## Lift: _saturday_night_live_, 92t, arija, bareiki, bluntman, breckin, busboy
## Score: _____, ______, _21_jump_street_, _48_hrs, _a_night_at_the_roxbury_, _am_, _america's_funniest_home_videos_
## Topic 13 Top Words:
## Highest Prob: film, thing, world, scene, killer, movi, make
## FREX: existenz, wcw, wrestl, bacon, gattaca, gadget, vincent
## Lift: 48th, ac, braddock, bret, cuesta, dawkin, dietz
## Score: _____, ______, _amadeus_, _american_psycho_, _and_, _blade, _can_
## Topic 14 Top Words:
## Highest Prob: anim, disney, film, war, movi, voic, stori
## FREX: tarzan, disney, mulan, warrior, toy, gorilla, anim
## Lift: dalai, tarzan, tibetan, verlain, _lone, _octob, _people_
## Score: _____, __________________________________________________________, _amadeus_, _american_beauty_, _and_, _armageddon_, _blade
## Topic 15 Top Words:
## Highest Prob: film, ship, godzilla, effect, titan, movi, deep
## FREX: comet, godzilla, titan, leder, dicaprio, armageddon, volcano
## Lift: arab-american, argonauticus, astrophysicist, attatch, beiderman, biederman, brisco
## Score: ______, __________________________________________________________, _and_, _andre_, _armageddon_, _babe_, _brazil_
## Topic 16 Top Words:
## Highest Prob: film, truman, movi, show, carrey, murphi, scene
## FREX: krippendorf, dora, guido, benigni, truman, carrey, burbank
## Lift: _cliffhanger_, _daylight_, 2th, alfredo, almod, aloft, anthropolog
## Score: _48_hrs, _and_, _babe_, _boom_, _brazil_, _can_, _cliffhang
## Topic 17 Top Words:
## Highest Prob: film, school, movi, play, music, stori, kid
## FREX: patch, pauli, anni, egoyan, max, maggi, teacher
## Lift: _rushmore_, abb, all-night, alvarado, annabella, atreid, ayla
## Score: ___, ____, _____, ______, ____________________________________________, __________________________________________________________, _all_
## Topic 18 Top Words:
## Highest Prob: film, movi, comedi, play, make, ape, work
## FREX: brook, derek, stiller, ape, skinhead, perri, melvin
## Lift: aldi, carrera, cassavet, cathrin, chabat, clori, delia
## Score: _____, ______, _a, _america's_funniest_home_videos_, _american_beauty_, _and_, _anim
## Topic 19 Top Words:
## Highest Prob: film, charact, jacki, movi, make, play, scene
## FREX: ordel, brenner, lambeau, maximus, rudi, marti, shandl
## Lift: _somewhere_, _very_, 20-foot, 216-digit, 26min, 3-hour, 44-year
## Score: _____, ______, __________________________________________________________, _american_beauty_, _breakfast_, _breakfast_of_champions_, _fear_and_loathing_in_las_vegas_
## Topic 20 Top Words:
## Highest Prob: film, alien, movi, batman, scene, action, make
## FREX: ripley, batman, psychlo, alien, jackal, speci, virus
## Lift: _four_, _his_, _looks_, _mafia_, 10th, 1600s, 18-foot-high
## Score: _armageddon_, _everybody_, _four_, _genius_, _hard_war, _his_, _in