Here we will step through the basic elements of preparing a text for analysis: tokenization, conversion to lower case, removing or selecting features, defining equivalency classes for features (including the use of dictionaries), and stemming.

1. Tokenization

Tokenization in quanteda is very conservative: by default, it only removes separator characters.

require(quanteda, quietly = TRUE, warn.conflicts = FALSE)
## quanteda version 0.9.8.5
txt <- c(text1="This is $10 in 999 different ways,\n up and down; left and right!",
         text2="@kenbenoit working: on #quanteda 2day\t4ever, http://textasdata.com?page=123.")
tokenize(txt)
## tokenizedText object from 2 documents.
## text1 :
##  [1] "This"      "is"        "$"         "10"        "in"       
##  [6] "999"       "different" "ways"      ","         "up"       
## [11] "and"       "down"      ";"         "left"      "and"      
## [16] "right"     "!"        
## 
## text2 :
##  [1] "@kenbenoit"     "working"        ":"              "on"            
##  [5] "#quanteda"      "2day"           "4ever"          ","             
##  [9] "http"           ":"              "/"              "/"             
## [13] "textasdata.com" "?"              "page"           "="             
## [17] "123"            "."
tokenize(txt, verbose=TRUE)
## Starting tokenization...
## ...preserving Twitter characters (#, @)
## ...total elapsed: 0.00100000000000033 seconds.
## ...tokenizing texts
## 
##   ...removing separators.
## ...total elapsed:  0.00100000000000033 seconds.
## ...replacing Twitter characters (#, @)
## ...total elapsed: 0 seconds.
## ...replacing names
## ...total elapsed:  0 seconds.
## Finished tokenizing and cleaning 2 texts.
## tokenizedText object from 2 documents.
## text1 :
##  [1] "This"      "is"        "$"         "10"        "in"       
##  [6] "999"       "different" "ways"      ","         "up"       
## [11] "and"       "down"      ";"         "left"      "and"      
## [16] "right"     "!"        
## 
## text2 :
##  [1] "@kenbenoit"     "working"        ":"              "on"            
##  [5] "#quanteda"      "2day"           "4ever"          ","             
##  [9] "http"           ":"              "/"              "/"             
## [13] "textasdata.com" "?"              "page"           "="             
## [17] "123"            "."
tokenize(txt, removeNumbers=TRUE, removePunct=TRUE)
## tokenizedText object from 2 documents.
## text1 :
##  [1] "This"      "is"        "in"        "different" "ways"     
##  [6] "up"        "and"       "down"      "left"      "and"      
## [11] "right"    
## 
## text2 :
## [1] "@kenbenoit"     "working"        "on"             "#quanteda"     
## [5] "2day"           "4ever"          "http"           "textasdata.com"
## [9] "page"
tokenize(txt, removeNumbers=FALSE, removePunct=TRUE)
## tokenizedText object from 2 documents.
## text1 :
##  [1] "This"      "is"        "10"        "in"        "999"      
##  [6] "different" "ways"      "up"        "and"       "down"     
## [11] "left"      "and"       "right"    
## 
## text2 :
##  [1] "@kenbenoit"     "working"        "on"             "#quanteda"     
##  [5] "2day"           "4ever"          "http"           "textasdata.com"
##  [9] "page"           "123"
tokenize(txt, removeNumbers=TRUE, removePunct=FALSE)
## tokenizedText object from 2 documents.
## text1 :
##  [1] "This"      "is"        "$"         "in"        "different"
##  [6] "ways"      ","         "up"        "and"       "down"     
## [11] ";"         "left"      "and"       "right"     "!"        
## 
## text2 :
##  [1] "@kenbenoit"     "working"        ":"              "on"            
##  [5] "#quanteda"      "2day"           "4ever"          ","             
##  [9] "http"           ":"              "/"              "/"             
## [13] "textasdata.com" "?"              "page"           "="             
## [17] "."
tokenize(txt, removeNumbers=FALSE, removePunct=FALSE)
## tokenizedText object from 2 documents.
## text1 :
##  [1] "This"      "is"        "$"         "10"        "in"       
##  [6] "999"       "different" "ways"      ","         "up"       
## [11] "and"       "down"      ";"         "left"      "and"      
## [16] "right"     "!"        
## 
## text2 :
##  [1] "@kenbenoit"     "working"        ":"              "on"            
##  [5] "#quanteda"      "2day"           "4ever"          ","             
##  [9] "http"           ":"              "/"              "/"             
## [13] "textasdata.com" "?"              "page"           "="             
## [17] "123"            "."
tokenize(txt, removeNumbers=FALSE, removePunct=FALSE, removeSeparators=FALSE)
## tokenizedText object from 2 documents.
## text1 :
##  [1] "This"      " "         "is"        " "         "$"        
##  [6] "10"        " "         "in"        " "         "999"      
## [11] " "         "different" " "         "ways"      ","        
## [16] "\n"        " "         "up"        " "         "and"      
## [21] " "         "down"      ";"         " "         "left"     
## [26] " "         "and"       " "         "right"     "!"        
## 
## text2 :
##  [1] "@kenbenoit"     " "              "working"        ":"             
##  [5] " "              "on"             " "              "#quanteda"     
##  [9] " "              "2day"           "\t"             "4ever"         
## [13] ","              " "              "http"           ":"             
## [17] "/"              "/"              "textasdata.com" "?"             
## [21] "page"           "="              "123"            "."

There are several options for the what argument:

# sentence level
tokenize(c("Kurt Vongeut said; only assholes use semi-colons.",
           "Today is Thursday in Canberra:  It is yesterday in London.",
           "Today is Thursday in Canberra:  \nIt is yesterday in London.",
           "To be?  Or\not to be?"),
          what = "sentence")
## tokenizedText object from 4 documents.
## Component 1 :
## [1] "Kurt Vongeut said; only assholes use semi-colons."
## 
## Component 2 :
## [1] "Today is Thursday in Canberra:  It is yesterday in London."
## 
## Component 3 :
## [1] "Today is Thursday in Canberra:   It is yesterday in London."
## 
## Component 4 :
## [1] "To be?"       "Or ot to be?"
tokenize(inaugTexts[2], what = "sentence")
## tokenizedText object from 1 document.
## 1793-Washington :
## [1] "Fellow citizens, I am again called upon by the voice of my country to execute the functions of its Chief Magistrate."                                                                                                                                                                                                                                       
## [2] "When the occasion proper for it shall arrive, I shall endeavor to express the high sense I entertain of this distinguished honor, and of the confidence which has been reposed in me by the people of united America."                                                                                                                                      
## [3] "Previous to the execution of any official act of the President the Constitution requires an oath of office."                                                                                                                                                                                                                                                
## [4] "This oath I am now about to take, and in your presence: That if it shall be found during my administration of the Government I have in any instance violated willingly or knowingly the injunctions thereof, I may (besides incurring constitutional punishment) be subject to the upbraidings of all who are now witnesses of the present solemn ceremony."
# character level
tokenize("My big fat text package.", what="character")
## tokenizedText object from 1 document.
## Component 1 :
##  [1] "M" "y" "b" "i" "g" "f" "a" "t" "t" "e" "x" "t" "p" "a" "c" "k" "a"
## [18] "g" "e" "."
tokenize("My big fat text package.", what="character", removeSeparators=FALSE)
## tokenizedText object from 1 document.
## Component 1 :
##  [1] "M" "y" " " "b" "i" "g" " " "f" "a" "t" " " "t" "e" "x" "t" " " "p"
## [18] "a" "c" "k" "a" "g" "e" "."

Two other options for really fast and simple tokenization are "fastestword" and "fasterword", if performance is a key issue. These are less intelligent than the default "word" method, which is based on stringi's word-boundary detection.
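For instance, a quick sketch using the txt examples from above (output omitted; both of these split on whitespace rather than detecting word boundaries):

# faster, simpler alternatives to the default boundary detection
tokenize(txt, what = "fasterword")
tokenize(txt, what = "fastestword")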

2. Conversion to lower case

This is a tricky one in our workflow, since it is a form of equivalency declaration, rather than a tokenization step. It turns out that it is more efficient to perform at the pre-tokenization stage.

As a result, the method toLower() is defined for many classes of quanteda objects.

methods(toLower)
## [1] toLower.character*      toLower.corpus*         toLower.NULL*          
## [4] toLower.tokenizedTexts*
## see '?methods' for accessing help and source code

We include options designed to preserve acronyms.

test1 <- c(text1 = "England and France are members of NATO and UNESCO",
           text2 = "NASA sent a rocket into space.")
toLower(test1)
##                                               text1 
## "england and france are members of nato and unesco" 
##                                               text2 
##                    "nasa sent a rocket into space."
toLower(test1, keepAcronyms = TRUE)
##                                               text1 
## "england and france are members of NATO and UNESCO" 
##                                               text2 
##                    "NASA sent a rocket into space."
test2 <- tokenize(test1, removePunct = TRUE)
toLower(test2)
## tokenizedText object from 2 documents.
## text1 :
## [1] "england" "and"     "france"  "are"     "members" "of"      "nato"   
## [8] "and"     "unesco" 
## 
## text2 :
## [1] "nasa"   "sent"   "a"      "rocket" "into"   "space"
toLower(test2, keepAcronyms = TRUE)
## tokenizedText object from 2 documents.
## text1 :
## [1] "england" "and"     "france"  "are"     "members" "of"      "NATO"   
## [8] "and"     "UNESCO" 
## 
## text2 :
## [1] "NASA"   "sent"   "a"      "rocket" "into"   "space"

toLower is based on stringi, and is therefore nicely Unicode compliant.

# Russian
cat(iconv(encodedTexts[8], "windows-1251", "UTF-8"))
## "8-битные" oncodings являются частью прошлого. 0€. Дефис-ели. Тильда ~ тире - и ± § 50.
cat(toLower(iconv(encodedTexts[8], "windows-1251", "UTF-8")))
## "8-битные" oncodings являются частью прошлого. 0€. дефис-ели. тильда ~ тире - и ± § 50.
head(toLower(stopwords("russian")), 20)
##  [1] "и"   "в"   "во"  "не"  "что" "он"  "на"  "я"   "с"   "со"  "как"
## [12] "а"   "то"  "все" "она" "так" "его" "но"  "да"  "ты"
# Arabic
cat(iconv(encodedTexts[6], "ISO-8859-6", "UTF-8"))
## ترميزات 8 بت هي موديل قديم. 0 . الواصلة أكل. تيلدا ~ م اندفاعة - و  50 .
cat(toLower(iconv(encodedTexts[6], "ISO-8859-6", "UTF-8")))
## ترميزات 8 بت هي موديل قديم. 0 . الواصلة أكل. تيلدا ~ م اندفاعة - و  50 .
head(toLower(stopwords("arabic")), 20)
##  [1] "فى"    "في"    "كل"    "لم"    "لن"    "له"    "من"    "هو"   
##  [9] "هي"    "قوة"   "كما"   "لها"   "منذ"   "وقد"   "ولا"   "نفسه" 
## [17] "لقاء"  "مقابل" "هناك"  "وقال"

Note: dfm, the Swiss army knife, converts to lower case by default, but this can be turned off using the toLower = FALSE argument.
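For example, a minimal sketch using the test1 texts from above (output omitted):

# preserve the original case when building the dfm
dfm(test1, toLower = FALSE, verbose = FALSE)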

3. Removing and selecting features

This can be done when creating a dfm:

# with English stopwords and stemming
dfmsInaug2 <- dfm(subset(inaugCorpus, Year > 1980),
                  ignoredFeatures = stopwords("english"), stem = TRUE)
## Creating a dfm from a corpus ...
## 
##    ... lowercasing
## 
##    ... tokenizing
## 
##    ... indexing documents: 9 documents
## 
##    ... indexing features:
## 3,114 feature types
## 
## ...
## removed 128 features, from 174 supplied (glob) feature types
## ... stemming features (English), trimmed 781 feature variants
##    ... created a 9 x 2205 sparse dfm
##    ... complete. 
## Elapsed time: 0.255 seconds.

Or it can be done after creating a dfm:

myDfm <- dfm(c("My Christmas was ruined by your opposition tax plan.",
               "Does the United_States or Sweden have more progressive taxation?"),
             toLower = FALSE, verbose = FALSE)
selectFeatures(myDfm, c("s$", ".y"), "keep", valuetype = "regex")
## kept 6 features, from 2 supplied (regex) feature types
## Document-feature matrix of: 2 documents, 6 features.
## 2 x 6 sparse Matrix of class "dfmSparse"
##        features
## docs    My Christmas was by Does United_States
##   text1  1         1   1  1    0             0
##   text2  0         0   0  0    1             1
selectFeatures(myDfm, c("s$", ".y"), "remove", valuetype = "regex")
## removed 6 features, from 2 supplied (regex) feature types
## Document-feature matrix of: 2 documents, 12 features.
## 2 x 12 sparse Matrix of class "dfmSparse"
##        features
## docs    ruined your opposition tax plan the or Sweden have more
##   text1      1    1          1   1    1   0  0      0    0    0
##   text2      0    0          0   0    0   1  1      1    1    1
##        features
## docs    progressive taxation
##   text1           0        0
##   text2           1        1
selectFeatures(myDfm, stopwords("english"), "keep", valuetype = "fixed")
## kept 9 features, from 174 supplied (fixed) feature types
## Document-feature matrix of: 2 documents, 9 features.
## 2 x 9 sparse Matrix of class "dfmSparse"
##        features
## docs    My was by your Does the or have more
##   text1  1   1  1    1    0   0  0    0    0
##   text2  0   0  0    0    1   1  1    1    1
selectFeatures(myDfm, stopwords("english"), "remove", valuetype = "fixed")
## removed 9 features, from 174 supplied (fixed) feature types
## Document-feature matrix of: 2 documents, 9 features.
## 2 x 9 sparse Matrix of class "dfmSparse"
##        features
## docs    Christmas ruined opposition tax plan United_States Sweden
##   text1         1      1          1   1    1             0      0
##   text2         0      0          0   0    0             1      1
##        features
## docs    progressive taxation
##   text1           0        0
##   text2           1        1

More examples:

# removing stopwords
testText <- "The quick brown fox named Seamus jumps over the lazy dog also named Seamus, with
             the newspaper from a boy named Seamus, in his mouth."
testCorpus <- corpus(testText)
# note: "also" is not in the default stopwords("english")
features(dfm(testCorpus, ignoredFeatures = stopwords("english")))
## Creating a dfm from a corpus ...
## 
##    ... lowercasing
## 
##    ... tokenizing
## 
##    ... indexing documents: 1 document
## 
##    ... indexing features:
## 19 feature types
## 
## ...
## removed 7 features, from 174 supplied (glob) feature types
##    ... created a 1 x 12 sparse dfm
##    ... complete. 
## Elapsed time: 0.027 seconds.
##  [1] "quick"     "brown"     "fox"       "named"     "seamus"   
##  [6] "jumps"     "lazy"      "dog"       "also"      "newspaper"
## [11] "boy"       "mouth"
# for ngrams
features(dfm(testCorpus, ngrams = 2, ignoredFeatures = stopwords("english")))
## Creating a dfm from a corpus ...
## 
##    ... lowercasing
## 
##    ... tokenizing
## 
##    ... indexing documents: 1 document
## 
##    ... indexing features:
## 22 feature types
## 
## ...
## removed 13 features, from 174 supplied (glob) feature types
##    ... created a 1 x 9 sparse dfm
##    ... complete. 
## Elapsed time: 0.028 seconds.
## [1] "quick_brown"  "brown_fox"    "fox_named"    "named_seamus"
## [5] "seamus_jumps" "lazy_dog"     "dog_also"     "also_named"  
## [9] "boy_named"
features(dfm(testCorpus, ngrams = 1:2, ignoredFeatures = stopwords("english")))
## Creating a dfm from a corpus ...
## 
##    ... lowercasing
## 
##    ... tokenizing
## 
##    ... indexing documents: 1 document
## 
##    ... indexing features:
## 41 feature types
## 
## ...
## removed 20 features, from 174 supplied (glob) feature types
##    ... created a 1 x 21 sparse dfm
##    ... complete. 
## Elapsed time: 0.028 seconds.
##  [1] "quick"        "brown"        "fox"          "named"       
##  [5] "seamus"       "jumps"        "lazy"         "dog"         
##  [9] "also"         "newspaper"    "boy"          "mouth"       
## [13] "quick_brown"  "brown_fox"    "fox_named"    "named_seamus"
## [17] "seamus_jumps" "lazy_dog"     "dog_also"     "also_named"  
## [21] "boy_named"
## removing stopwords before constructing ngrams
tokensAll <- tokenize(toLower(testText), removePunct = TRUE)
tokensNoStopwords <- removeFeatures(tokensAll, stopwords("english"))
tokensNgramsNoStopwords <- ngrams(tokensNoStopwords, 2)
features(dfm(tokensNgramsNoStopwords, ngrams = 1:2))
## Warning in dfm.tokenizedTexts(tokensNgramsNoStopwords, ngrams = 1:2):
## Argument ngrams not used.
## 
##    ... indexing documents: 1 document
## 
##    ... indexing features:
## 13 feature types
## 
##    ... created a 1 x 13 sparse dfm
##    ... complete. 
## Elapsed time: 0.007 seconds.
##  [1] "quick_brown"      "brown_fox"        "fox_named"       
##  [4] "named_seamus"     "seamus_jumps"     "jumps_lazy"      
##  [7] "lazy_dog"         "dog_also"         "also_named"      
## [10] "seamus_newspaper" "newspaper_boy"    "boy_named"       
## [13] "seamus_mouth"
# keep only certain words
dfm(testCorpus, keptFeatures = "*s", verbose = FALSE)  # keep only words ending in "s"
## Document-feature matrix of: 1 document, 3 features.
## 1 x 3 sparse Matrix of class "dfmSparse"
##        features
## docs    seamus jumps his
##   text1      3     1   1
dfm(testCorpus, keptFeatures = "s$", valuetype = "regex", verbose = FALSE)
## Document-feature matrix of: 1 document, 3 features.
## 1 x 3 sparse Matrix of class "dfmSparse"
##        features
## docs    seamus jumps his
##   text1      3     1   1
# testing Twitter functions
testTweets <- c("My homie @justinbieber #justinbieber shopping in #LA yesterday #beliebers",
                "2all the ha8ers including my bro #justinbieber #emabiggestfansjustinbieber",
                "Justin Bieber #justinbieber #belieber #fetusjustin #EMABiggestFansJustinBieber")
dfm(testTweets, keptFeatures = "#*", removeTwitter = FALSE)  # keep only hashtags
## 
##    ... lowercasing
## 
##    ... tokenizing
## 
##    ... indexing documents: 3 documents
## 
##    ... indexing features:
## 19 feature types
## 
## ...
## kept 6 features, from 1 supplied (glob) feature types
##    ... created a 3 x 6 sparse dfm
##    ... complete. 
## Elapsed time: 0.014 seconds.
## Document-feature matrix of: 3 documents, 6 features.
## 3 x 6 sparse Matrix of class "dfmSparse"
##        features
## docs    #justinbieber #la #beliebers #emabiggestfansjustinbieber #belieber
##   text1             1   1          1                           0         0
##   text2             1   0          0                           1         0
##   text3             1   0          0                           1         1
##        features
## docs    #fetusjustin
##   text1            0
##   text2            0
##   text3            1
dfm(testTweets, keptFeatures = "^#.*$", valuetype = "regex", removeTwitter = FALSE)
## 
##    ... lowercasing
## 
##    ... tokenizing
## 
##    ... indexing documents: 3 documents
## 
##    ... indexing features:
## 19 feature types
## 
## ...
## kept 6 features, from 1 supplied (regex) feature types
##    ... created a 3 x 6 sparse dfm
##    ... complete. 
## Elapsed time: 0.014 seconds.
## Document-feature matrix of: 3 documents, 6 features.
## 3 x 6 sparse Matrix of class "dfmSparse"
##        features
## docs    #justinbieber #la #beliebers #emabiggestfansjustinbieber #belieber
##   text1             1   1          1                           0         0
##   text2             1   0          0                           1         0
##   text3             1   0          0                           1         1
##        features
## docs    #fetusjustin
##   text1            0
##   text2            0
##   text3            1

One very nice feature, recently added, is the ability to create a new dfm with the same feature set as an existing one. This is very useful if, for instance, we train a model on one dfm and need to predict on counts from another, since the two feature sets must then be equivalent.

# selecting on a dfm
textVec1 <- c("This is text one.", "This, the second text.", "Here: the third text.")
textVec2 <- c("Here are new words.", "New words in this text.")
features(dfm1 <- dfm(textVec1))
## 
##    ... lowercasing
## 
##    ... tokenizing
## 
##    ... indexing documents: 3 documents
## 
##    ... indexing features:
## 8 feature types
## 
##    ... created a 3 x 8 sparse dfm
##    ... complete. 
## Elapsed time: 0.011 seconds.
## [1] "this"   "is"     "text"   "one"    "the"    "second" "here"   "third"
features(dfm2a <- dfm(textVec2))
## 
##    ... lowercasing
## 
##    ... tokenizing
## 
##    ... indexing documents: 2 documents
## 
##    ... indexing features:
## 7 feature types
## 
##    ... created a 2 x 7 sparse dfm
##    ... complete. 
## Elapsed time: 0.009 seconds.
## [1] "here"  "are"   "new"   "words" "in"    "this"  "text"
(dfm2b <- selectFeatures(dfm2a, dfm1))
## found 3 features from 8 supplied types in a dfm,
##  padding 0s for another 5
## Document-feature matrix of: 2 documents, 8 features.
## 2 x 8 sparse Matrix of class "dfmSparse"
##       this is text one the second here third
## text1    0  0    0   0   0      0    1     0
## text2    1  0    1   0   0      0    0     0
identical(features(dfm1), features(dfm2b))
## [1] TRUE

4. Applying equivalency classes: dictionaries, thesauruses

Dictionary creation is done through the dictionary() function, which makes a dictionary object from a named list of character vectors.
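A minimal sketch of building one inline (the keys and patterns here are invented for illustration):

# each named list element becomes a dictionary key; the values are match patterns
posneg <- dictionary(list(positive = c("good", "great", "fortunate*"),
                          negative = c("bad", "awful", "unfortunate*")))

Dictionaries can also be imported from external formats, such as WordStat and LIWC: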

# import the Laver-Garry dictionary from http://bit.ly/1FH2nvf
lgdict <- dictionary(file = "http://www.kenbenoit.net/courses/essex2014qta/LaverGarry.cat",
                     format = "wordstat")
dfm(inaugTexts, dictionary=lgdict)
## 
##    ... lowercasing
## 
##    ... tokenizing
## 
##    ... indexing documents: 57 documents
## 
##    ... indexing features:
## 9,215 feature types
## 
## ...
## applying a dictionary consisting of 20 keys
##    ... created a 57 x 20 sparse dfm
##    ... complete. 
## Elapsed time: 0.855 seconds.
## Document-feature matrix of: 57 documents, 20 features.
# import a LIWC formatted dictionary
liwcdict <- dictionary(file = "~/Dropbox/QUANTESS/dictionaries/LIWC/LIWC2015_English_Flat.dic",
                       format = "LIWC")
dfm(inaugTexts, dictionary = liwcdict)
## 
##    ... lowercasing
## 
##    ... tokenizing
## 
##    ... indexing documents: 57 documents
## 
##    ... indexing features:
## 9,215 feature types
## 
## ...
## applying a dictionary consisting of 73 keys
##    ... created a 57 x 73 sparse dfm
##    ... complete. 
## Elapsed time: 30.6 seconds.
## Document-feature matrix of: 57 documents, 73 features.

We apply dictionaries to a dfm using the applyDictionary() function. Through the valuetype argument, we can match patterns of one of three types: "glob", "regex", or "fixed".

myDict <- dictionary(list(christmas = c("Christmas", "Santa", "holiday"),
                          opposition = c("Opposition", "reject", "notincorpus"),
                          taxglob = "tax*",
                          taxregex = "tax.+$",
                          country = c("United_States", "Sweden")))
myDfm <- dfm(c("My Christmas was ruined by your opposition tax plan.",
               "Does the United_States or Sweden have more progressive taxation?"),
             ignoredFeatures = stopwords("english"), verbose = FALSE)
myDfm
## Document-feature matrix of: 2 documents, 9 features.
## 2 x 9 sparse Matrix of class "dfmSparse"
##        features
## docs    christmas ruined opposition tax plan united_states sweden
##   text1         1      1          1   1    1             0      0
##   text2         0      0          0   0    0             1      1
##        features
## docs    progressive taxation
##   text1           0        0
##   text2           1        1
# glob format
applyDictionary(myDfm, myDict, valuetype = "glob")
## applying a dictionary consisting of 5 keys
## Document-feature matrix of: 2 documents, 5 features.
## 2 x 5 sparse Matrix of class "dfmSparse"
##        features
## docs    christmas opposition taxglob taxregex country
##   text1         1          1       1        0       0
##   text2         0          0       1        0       2
applyDictionary(myDfm, myDict, valuetype = "glob", case_insensitive = FALSE)
## applying a dictionary consisting of 5 keys
## Document-feature matrix of: 2 documents, 5 features.
## 2 x 5 sparse Matrix of class "dfmSparse"
##        features
## docs    christmas opposition taxglob taxregex country
##   text1         0          0       1        0       0
##   text2         0          0       1        0       0
# regex v. glob format: note that "united_states" is a regex match for "tax*"
applyDictionary(myDfm, myDict, valuetype = "glob")
## applying a dictionary consisting of 5 keys
## Document-feature matrix of: 2 documents, 5 features.
## 2 x 5 sparse Matrix of class "dfmSparse"
##        features
## docs    christmas opposition taxglob taxregex country
##   text1         1          1       1        0       0
##   text2         0          0       1        0       2
applyDictionary(myDfm, myDict, valuetype = "regex", case_insensitive = TRUE)
## applying a dictionary consisting of 5 keys
## Document-feature matrix of: 2 documents, 5 features.
## 2 x 5 sparse Matrix of class "dfmSparse"
##        features
## docs    christmas opposition taxglob taxregex country
##   text1         1          1       1        0       0
##   text2         0          0       2        1       2
# fixed format: no pattern matching
applyDictionary(myDfm, myDict, valuetype = "fixed")
## applying a dictionary consisting of 5 keys
## Document-feature matrix of: 2 documents, 5 features.
## 2 x 5 sparse Matrix of class "dfmSparse"
##        features
## docs    christmas opposition taxglob taxregex country
##   text1         1          1       0        0       0
##   text2         0          0       0        0       2
applyDictionary(myDfm, myDict, valuetype = "fixed", case_insensitive = FALSE)
## applying a dictionary consisting of 5 keys
## Document-feature matrix of: 2 documents, 5 features.
## 2 x 5 sparse Matrix of class "dfmSparse"
##        features
## docs    christmas opposition taxglob taxregex country
##   text1         0          0       0        0       0
##   text2         0          0       0        0       0

It is also possible to pass a dictionary to dfm() at the time of its creation.

# dfm with dictionaries
mycorpus <- subset(inaugCorpus, Year>1900)
mydict <- dictionary(list(christmas=c("Christmas", "Santa", "holiday"),
                          opposition=c("Opposition", "reject", "notincorpus"),
                          taxing="taxing",
                          taxation="taxation",
                          taxregex="tax*",
                          country="united states"))
dictDfm <- dfm(mycorpus, dictionary=mydict)
## Creating a dfm from a corpus ...
## 
##    ... lowercasing
## 
##    ... tokenizing
## 
##    ... indexing documents: 29 documents
## 
##    ... indexing features:
## 6,276 feature types
## 
## ...
## applying a dictionary consisting of 6 keys
##    ... created a 29 x 6 sparse dfm
##    ... complete. 
## Elapsed time: 0.181 seconds.
head(dictDfm)
## Document-feature matrix of: 29 documents, 6 features.
## (showing first 6 documents and first 6 features)
##                 features
## docs             christmas opposition taxing taxation taxregex country
##   1901-McKinley          0          2      0        1        1       0
##   1905-Roosevelt         0          0      0        0        0       0
##   1909-Taft              0          1      0        4        6       0
##   1913-Wilson            0          0      0        1        1       0
##   1917-Wilson            0          0      0        0        0       0
##   1921-Harding           0          0      0        1        2       0

Finally, there is a related “thesaurus” feature, which collapses matched words into dictionary keys (displayed in upper case) but, unlike a dictionary, keeps all non-matching features.

mytexts <- c("British English tokenises differently, with more colour.",
             "American English tokenizes color as one word.")
mydict <- dictionary(list(color = "colo*r", tokenize = "tokeni?e*"))
dfm(mytexts, thesaurus = mydict)
## 
##    ... lowercasing
## 
##    ... tokenizing
## 
##    ... indexing documents: 2 documents
## 
##    ... indexing features:
## 13 feature types
## 
## ...
## applying a dictionary consisting of 2 keys
##    ... created a 2 x 11 sparse dfm
##    ... complete. 
## Elapsed time: 0.023 seconds.
## Document-feature matrix of: 2 documents, 11 features.
## 2 x 11 sparse Matrix of class "dfmSparse"
##       british english differently with more american as one word COLOR
## text1       1       1           1    1    1        0  0   0    0     1
## text2       0       1           0    0    0        1  1   1    1     1
##       TOKENIZE
## text1        1
## text2        1

5. Stemming

Stemming relies on the SnowballC package’s implementation of the Porter stemmer, and is available for the following languages:

SnowballC::getStemLanguages()
##  [1] "danish"     "dutch"      "english"    "finnish"    "french"    
##  [6] "german"     "hungarian"  "italian"    "norwegian"  "porter"    
## [11] "portuguese" "romanian"   "russian"    "spanish"    "swedish"   
## [16] "turkish"

It’s not perfect:

wordstem(c("win", "winning", "wins", "won", "winner"))
## [1] "win"    "win"    "win"    "won"    "winner"

but it’s fast.
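As a rough illustration (a sketch only; timings will vary by machine and version):

# stem every token from the inaugural speeches in a single call
toks <- unlist(tokenize(toLower(inaugTexts), removePunct = TRUE))
system.time(wordstem(toks))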

Objects to be stemmed must already be tokenized into words, but can be of many different quanteda classes:

methods(wordstem)
## [1] wordstem.character*      wordstem.dfm*           
## [3] wordstem.tokenizedTexts*
## see '?methods' for accessing help and source code
wordstem(tokenize("This is a winning package, of many packages."))
## tokenizedText object from 1 document.
## Component 1 :
##  [1] "Thi"    "i"      "a"      "win"    "packag" ","      "of"    
##  [8] "mani"   "packag" "."
head(wordstem(dfm(inaugTexts[1:2], verbose = FALSE)))
## Document-feature matrix of: 2 documents, 573 features.
## (showing first 2 documents and first 6 features)
##                  features
## docs              fellow-citizen of the senat and hous
##   1789-Washington              1 71 116     1  48    2
##   1793-Washington              0 11  13     0   2    0

# same as 
head(dfm(inaugTexts[1:2], stem = TRUE, verbose = FALSE))
## Document-feature matrix of: 2 documents, 572 features.
## (showing first 2 documents and first 6 features)
##                  features
## docs              fellow-citizen of the senat and hous
##   1789-Washington              1 71 116     1  48    2
##   1793-Washington              0 11  13     0   2    0

6. dfm() and its many options

It operates on character vectors, corpus objects, and tokenizedText objects:

## S3 method for class 'character'
dfm(x, verbose = TRUE, toLower = TRUE,
  removeNumbers = TRUE, removePunct = TRUE, removeSeparators = TRUE,
  removeTwitter = FALSE, stem = FALSE, ignoredFeatures = NULL,
  keptFeatures = NULL, language = "english", thesaurus = NULL,
  dictionary = NULL, valuetype = c("glob", "regex", "fixed"), ...)
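A sketch combining several of these options on the testTweets examples from earlier (output omitted):

# keep @handles and #hashtags intact, drop numbers and other punctuation, and stem
dfm(testTweets, removeTwitter = FALSE, removeNumbers = TRUE,
    removePunct = TRUE, stem = TRUE, verbose = FALSE)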

7. Descriptive statistics

quanteda has a number of descriptive statistics available for reporting on texts. The simplest of these is through the summary() method:

require(quanteda)
txt <- c(sent1 = "This is an example of the summary method for character objects.",
         sent2 = "The cat in the hat swung the bat.")
summary(txt)
##    Text Types Tokens Sentences
## 1 sent1    12     12         1
## 2 sent2     8      9         1

This also works for corpus objects:

summary(corpus(ukimmigTexts, notes = "Created as a demo."))
## Corpus consisting of 9 documents.
## 
##          Text Types Tokens Sentences
##           BNP  1126   3330        88
##     Coalition   144    268         4
##  Conservative   252    503        15
##        Greens   325    687        21
##        Labour   296    703        29
##        LibDem   257    499        14
##            PC    80    118         5
##           SNP    90    136         4
##          UKIP   346    739        27
## 
## Source:  /Users/kbenoit/Dropbox (Personal)/GitHub/ITAUR-Short/2_descriptive/* on x86_64 by kbenoit
## Created: Thu Oct 27 15:05:11 2016
## Notes:   Created as a demo.

To count the syllables in a text, we use syllables():

syllables(c("Superman.", "supercalifragilisticexpialidocious", "The cat in the hat."))
## [1]  3 13  5

We can even compute the Scrabble value of English words, using scrabble():

scrabble(c("cat", "quixotry", "zoo"))
## [1]  5 27 12

We can analyze the lexical diversity of texts, using lexdiv() on a dfm:

myDfm <- dfm(subset(inaugCorpus, Year > 1980), verbose = FALSE)
lexdiv(myDfm, "R")
##  1981-Reagan  1985-Reagan    1989-Bush 1993-Clinton 1997-Clinton 
##     17.07732     16.96115     15.54627     15.00938     15.48117 
##    2001-Bush    2005-Bush   2009-Obama   2013-Obama 
##     14.67819     15.93118     18.26638     16.96606
dotchart(sort(lexdiv(myDfm, "R")))

We can analyze the readability of texts, using readability() on a vector of texts or a corpus:

readab <- readability(subset(inaugCorpus, Year > 1980), measure = "Flesch.Kincaid")
dotchart(sort(readab))

We can identify documents and terms that are similar to one another, using similarity():

## Presidential Inaugural Address Corpus
presDfm <- dfm(inaugCorpus, ignoredFeatures = stopwords("english"))
## Creating a dfm from a corpus ...
## 
##    ... lowercasing
## 
##    ... tokenizing
## 
##    ... indexing documents: 57 documents
## 
##    ... indexing features:
## 9,215 feature types
## 
## ...
## removed 133 features, from 174 supplied (glob) feature types
##    ... created a 57 x 9082 sparse dfm
##    ... complete. 
## Elapsed time: 1.36 seconds.
# compute some document similarities
similarity(presDfm, "1985-Reagan", n=5, margin="documents")
## similarity Matrix:
## $`1985-Reagan`
##  1981-Reagan 1997-Clinton   2013-Obama   1969-Nixon   1973-Nixon 
##       0.6942       0.6595       0.6497       0.6275       0.6262
similarity(presDfm, c("2009-Obama" , "2013-Obama"), n = 5, margin = "documents", method = "cosine")
## similarity Matrix:
## $`2009-Obama`
##   2013-Obama 1997-Clinton  1981-Reagan    1989-Bush  1985-Reagan 
##       0.6764       0.6697       0.6320       0.6221       0.6206 
## 
## $`2013-Obama`
##   2009-Obama  1981-Reagan  1985-Reagan 1997-Clinton    2001-Bush 
##       0.6764       0.6627       0.6621       0.6576       0.6206
similarity(presDfm, c("2009-Obama" , "2013-Obama"), n = 5, margin = "documents", method = "Hellinger")
## similarity Matrix:
## $`2009-Obama`
##   2013-Obama 1997-Clinton 1993-Clinton  1985-Reagan    2005-Bush 
##       0.4937       0.4898       0.4867       0.4837       0.4818 
## 
## $`2013-Obama`
## 1997-Clinton   2009-Obama    2001-Bush    2005-Bush 1993-Clinton 
##       0.4945       0.4937       0.4909       0.4896       0.4880
similarity(presDfm, c("2009-Obama" , "2013-Obama"), n = 5, margin = "documents", method = "eJaccard")
## similarity Matrix:
## $`2009-Obama`
##   2013-Obama 1997-Clinton  1981-Reagan    1989-Bush   1969-Nixon 
##       0.5110       0.4770       0.4563       0.4501       0.4422 
## 
## $`2013-Obama`
##   2009-Obama  1981-Reagan  1985-Reagan 1997-Clinton    2001-Bush 
##       0.5110       0.4877       0.4791       0.4613       0.4418
# compute some term similarities
lapply(similarity(presDfm, c("fair", "health", "terror"), margin = "features", method = "cosine"), head)
## $health
##         culture instrumentality          ideals          denial 
##       0.7378648       0.7378648       0.7361216       0.7071068 
##      heightened        refusing 
##       0.6708204       0.6708204 
## 
## $fair
##    united     great      best      thus   present  preserve 
## 0.7305066 0.7294634 0.7148319 0.7037316 0.7018624 0.6920211 
## 
## $terror
##      plague     tragedy    mystical       split        atom commonplace 
##   0.6708204   0.6708204   0.6708204   0.6708204   0.6708204   0.6708204

And this can be used for clustering documents:

data(SOTUCorpus, package="quantedaData")
presDfm <- dfm(subset(SOTUCorpus, Date > "1981-01-01"), stem = TRUE,
               ignoredFeatures = stopwords("english"), 
               verbose = FALSE)

presDfm <- trim(presDfm, minCount = 5, minDoc = 3)
## Removing features occurring fewer than 5 times: 4812
## Removing features occurring in fewer than 3 documents: 4273
# hierarchical clustering - get distances on normalized dfm
presDistMat <- dist(as.matrix(weight(presDfm, "relFreq")))
# hierarchical clustering the distance object
presCluster <- hclust(presDistMat)
# label with document names
presCluster$labels <- docnames(presDfm)
# plot as a dendrogram
plot(presCluster)

Or we could look at term clustering instead:

# word dendrogram with tf-idf weighting
wordDfm <- sort(weight(presDfm, "tfidf"))
wordDfm <- removeFeatures(wordDfm, "u.") # because it's oddly there
## removed 1 feature, from 1 supplied (glob) feature types
wordDfm <- t(wordDfm)[1:100, ]  # because transposed
wordDistMat <- dist(wordDfm)
wordCluster <- hclust(wordDistMat)
plot(wordCluster, xlab = "", main = "tf-idf Frequency weighting")

Finally, there are a number of helper functions to extract information from quanteda objects:

myCorpus <- subset(inaugCorpus, Year > 1980)

# return the number of documents
ndoc(myCorpus)           
## [1] 9
ndoc(dfm(myCorpus, verbose = FALSE))
## [1] 9
# how many tokens (total words)
ntoken(myCorpus)
##  1981-Reagan  1985-Reagan    1989-Bush 1993-Clinton 1997-Clinton 
##         2798         2935         2683         1837         2451 
##    2001-Bush    2005-Bush   2009-Obama   2013-Obama 
##         1810         2325         2729         2335
ntoken("How many words in this sentence?")
## [1] 7
# arguments to tokenize can be passed 
ntoken("How many words in this sentence?", removePunct = TRUE)
## [1] 6
# how many types (unique words)
ntype(myCorpus)
##  1981-Reagan  1985-Reagan    1989-Bush 1993-Clinton 1997-Clinton 
##          904          925          795          644          773 
##    2001-Bush    2005-Bush   2009-Obama   2013-Obama 
##          622          772          939          814
ntype("Yada yada yada.  (TADA.)")
## [1] 6
ntype("Yada yada yada.  (TADA.)", removePunct = TRUE)
## [1] 3
ntype(toLower("Yada yada yada.  (TADA.)"), removePunct = TRUE)
## [1] 2
# can count documents and features
ndoc(inaugCorpus)
## [1] 57
myDfm1 <- dfm(inaugCorpus, verbose = FALSE)
ndoc(myDfm1)
## [1] 57
nfeature(myDfm1)
## [1] 9215
myDfm2 <- dfm(inaugCorpus, ignoredFeatures = stopwords("english"), stem = TRUE, verbose = FALSE)

nfeature(myDfm2)
## [1] 5303
# can extract feature labels and document names
head(features(myDfm1), 20)
##  [1] "fellow-citizens" "of"              "the"            
##  [4] "senate"          "and"             "house"          
##  [7] "representatives" "among"           "vicissitudes"   
## [10] "incident"        "to"              "life"           
## [13] "no"              "event"           "could"          
## [16] "have"            "filled"          "me"             
## [19] "with"            "greater"
head(docnames(myDfm1))
## [1] "1789-Washington" "1793-Washington" "1797-Adams"      "1801-Jefferson" 
## [5] "1805-Jefferson"  "1809-Madison"
# and topfeatures
topfeatures(myDfm1)
##   the    of   and    to    in     a   our  that    we    be 
## 10011  7055  5233  4490  2771  2231  2134  1780  1693  1469
topfeatures(myDfm2) # without stopwords
##   will nation govern  peopl     us    can  state   upon  great  power 
##    891    662    653    613    476    470    448    371    367    366