Text manipulation

In this section we will work through some basic string manipulation functions in R.

There are several useful string manipulation functions in the R base library. In addition, we will look at the stringr package, which provides an additional interface for simple text manipulation.

The fundamental type (or mode) in which R stores text is the character vector. The simplest case is a character vector of length one. The nchar function returns the number of characters in each element of a character vector.

require(quanteda)
s1 <- 'my example text'
length(s1)

## [1] 1

nchar(s1)

## [1] 15

The nchar function is vectorized, meaning that when called on a vector it returns a value for each element of the vector.

s2 <- c('This is', 'my example text.', 'So imaginative.')
length(s2)

## [1] 3

nchar(s2)

## [1]  7 16 15

sum(nchar(s2))

## [1] 38

We can use this to answer some simple questions about the inaugural addresses.

Which were the longest and shortest speeches?

which.max(nchar(inaugTexts))

## 1841-Harrison 
##            14

which.min(nchar(inaugTexts))

## 1793-Washington 
##               2

Unlike in some other programming languages, it is not possible to index into a string in R; square brackets select elements of the character vector, not characters within a string:

s1 <- 'This file contains many fascinating example sentences.'
s1[6:9]

## [1] NA NA NA NA

To extract a substring, instead we use the substr function.

s1 <- 'This file contains many fascinating example sentences.'
substr(s1, 6,9)

## [1] "file"

Often we would like to split character vectors to extract a term of interest. This is possible using the strsplit function. Consider the names of the inaugural texts:

names(inaugTexts)

##  [1] "1789-Washington" "1793-Washington" "1797-Adams"     
##  [4] "1801-Jefferson"  "1805-Jefferson"  "1809-Madison"   
##  [7] "1813-Madison"    "1817-Monroe"     "1821-Monroe"    
## [10] "1825-Adams"      "1829-Jackson"    "1833-Jackson"   
## [13] "1837-VanBuren"   "1841-Harrison"   "1845-Polk"      
## [16] "1849-Taylor"     "1853-Pierce"     "1857-Buchanan"  
## [19] "1861-Lincoln"    "1865-Lincoln"    "1869-Grant"     
## [22] "1873-Grant"      "1877-Hayes"      "1881-Garfield"  
## [25] "1885-Cleveland"  "1889-Harrison"   "1893-Cleveland" 
## [28] "1897-McKinley"   "1901-McKinley"   "1905-Roosevelt" 
## [31] "1909-Taft"       "1913-Wilson"     "1917-Wilson"    
## [34] "1921-Harding"    "1925-Coolidge"   "1929-Hoover"    
## [37] "1933-Roosevelt"  "1937-Roosevelt"  "1941-Roosevelt" 
## [40] "1945-Roosevelt"  "1949-Truman"     "1953-Eisenhower"
## [43] "1957-Eisenhower" "1961-Kennedy"    "1965-Johnson"   
## [46] "1969-Nixon"      "1973-Nixon"      "1977-Carter"    
## [49] "1981-Reagan"     "1985-Reagan"     "1989-Bush"      
## [52] "1993-Clinton"    "1997-Clinton"    "2001-Bush"      
## [55] "2005-Bush"       "2009-Obama"      "2013-Obama"

# returns a list of parts
s1 <- 'split this string'
strsplit(s1, 'this')

## [[1]]
## [1] "split "  " string"

# Split each name of the form "<year>-<president>" at the hyphen; strsplit
# returns a list with one character vector of pieces per name.
parts <- strsplit(names(inaugTexts), '-')
# vapply (rather than sapply) guarantees a character vector result, one
# string per name, regardless of the input.
years <- vapply(parts, function(x) x[1], character(1))
pres <- vapply(parts, function(x) x[2], character(1))

The paste function is used to join character vectors together. The way in which the elements are combined depends on the values of the sep and collapse arguments:

paste('one','two','three')

## [1] "one two three"

paste('one','two','three', sep='_')

## [1] "one_two_three"

paste(years, pres, sep='-')

##  [1] "1789-Washington" "1793-Washington" "1797-Adams"     
##  [4] "1801-Jefferson"  "1805-Jefferson"  "1809-Madison"   
##  [7] "1813-Madison"    "1817-Monroe"     "1821-Monroe"    
## [10] "1825-Adams"      "1829-Jackson"    "1833-Jackson"   
## [13] "1837-VanBuren"   "1841-Harrison"   "1845-Polk"      
## [16] "1849-Taylor"     "1853-Pierce"     "1857-Buchanan"  
## [19] "1861-Lincoln"    "1865-Lincoln"    "1869-Grant"     
## [22] "1873-Grant"      "1877-Hayes"      "1881-Garfield"  
## [25] "1885-Cleveland"  "1889-Harrison"   "1893-Cleveland" 
## [28] "1897-McKinley"   "1901-McKinley"   "1905-Roosevelt" 
## [31] "1909-Taft"       "1913-Wilson"     "1917-Wilson"    
## [34] "1921-Harding"    "1925-Coolidge"   "1929-Hoover"    
## [37] "1933-Roosevelt"  "1937-Roosevelt"  "1941-Roosevelt" 
## [40] "1945-Roosevelt"  "1949-Truman"     "1953-Eisenhower"
## [43] "1957-Eisenhower" "1961-Kennedy"    "1965-Johnson"   
## [46] "1969-Nixon"      "1973-Nixon"      "1977-Carter"    
## [49] "1981-Reagan"     "1985-Reagan"     "1989-Bush"      
## [52] "1993-Clinton"    "1997-Clinton"    "2001-Bush"      
## [55] "2005-Bush"       "2009-Obama"      "2013-Obama"

paste(years, pres, collapse='-')

## [1] "1789 Washington-1793 Washington-1797 Adams-1801 Jefferson-1805 Jefferson-1809 Madison-1813 Madison-1817 Monroe-1821 Monroe-1825 Adams-1829 Jackson-1833 Jackson-1837 VanBuren-1841 Harrison-1845 Polk-1849 Taylor-1853 Pierce-1857 Buchanan-1861 Lincoln-1865 Lincoln-1869 Grant-1873 Grant-1877 Hayes-1881 Garfield-1885 Cleveland-1889 Harrison-1893 Cleveland-1897 McKinley-1901 McKinley-1905 Roosevelt-1909 Taft-1913 Wilson-1917 Wilson-1921 Harding-1925 Coolidge-1929 Hoover-1933 Roosevelt-1937 Roosevelt-1941 Roosevelt-1945 Roosevelt-1949 Truman-1953 Eisenhower-1957 Eisenhower-1961 Kennedy-1965 Johnson-1969 Nixon-1973 Nixon-1977 Carter-1981 Reagan-1985 Reagan-1989 Bush-1993 Clinton-1997 Clinton-2001 Bush-2005 Bush-2009 Obama-2013 Obama"

tolower and toupper change the case of character vectors.

tolower(s1)

## [1] "split this string"

toupper(s1)

## [1] "SPLIT THIS STRING"

Note that quanteda has a special wrapper for changing case, called toLower(), which is better than the built-in tolower() and is defined for multiple objects:

require(quanteda)
toLower(c("This", "is", "Kεφαλαία Γράμματα"))

## [1] "this"              "is"                "kεφαλαία γράμματα"

methods(toLower)

## [1] toLower.character*      toLower.corpus*         toLower.NULL*          
## [4] toLower.tokenizedTexts*
## see '?methods' for accessing help and source code

Character vectors can be compared using the == and %in% operators:

tolower(s1) == toupper(s1)

## [1] FALSE

'apples'=='oranges'

## [1] FALSE

tolower(s1) == tolower(s1)

## [1] TRUE

'pears' == 'pears'

## [1] TRUE

c1 <- c('apples', 'oranges', 'pears')
'pears' %in% c1

## [1] TRUE

c2 <- c('bananas', 'pears')
c2 %in% c1

## [1] FALSE  TRUE

The base functions for searching and replacing within text, grep and gsub, are similar to familiar commands from other text manipulation environments. The grep manual page provides an overview of these functions.

The grep command searches for a pattern in each element of a character vector and returns the indices of the elements that contain it (integer(0) if none do):

grep('orangef', 'these are oranges')

## integer(0)

grep('pear', 'these are oranges')

## integer(0)

grep('orange', c('apples', 'oranges', 'pears'))

## [1] 2

grep('pears', c('apples', 'oranges', 'pears'))

## [1] 3

The gsub command replaces every match of a pattern within a string with a replacement:

gsub('oranges', 'apples', 'these are oranges')

## [1] "these are apples"

In addition to the base string operations, the stringr and stringi packages provide more extensive and more organized interfaces for string manipulation. Here we will look at some examples from the stringr package. You might need to install the stringr package using install.packages.

For an overview of the most frequently used functions, see the vignette: https://cran.r-project.org/web/packages/stringr/vignettes/stringr.html.

For an index to in-depth explanations of each of the functions, see:

require(stringr)

## Loading required package: stringr

help(package='stringr')

Defining a function that counts the vowels in a text:

# Count the lowercase vowels (a, e, i, o, u) in a character vector.
#
# inText: a character vector of any length.
# Returns: a single number, the total count of vowels over all elements
#          of inText.
vCount <- function(inText){
    # One character-class pattern is applied to every element of inText.
    # Passing the five vowels as a pattern vector would instead pair them
    # element-wise with inText, which is only correct for length-one input.
    sum(str_count(inText, "[aeiou]"))
}
vCount('tts')

## [1] 0

Replace with stringr:

help(package='stringr')

fruits <- c("one apple", "two pears", "three bananas")
str_replace(fruits, "[aeiou]", "-")

## [1] "-ne apple"     "tw- pears"     "thr-e bananas"

str_replace_all(fruits, "[aeiou]", "-")

## [1] "-n- -ppl-"     "tw- p--rs"     "thr-- b-n-n-s"

str_replace(fruits, "([aeiou])", "")

## [1] "ne apple"     "tw pears"     "thre bananas"

str_replace(fruits, "([aeiou])", "\\1\\1")

## [1] "oone apple"     "twoo pears"     "threee bananas"

str_replace(fruits, "[aeiou]", c("1", "2", "3"))

## [1] "1ne apple"     "tw2 pears"     "thr3e bananas"

str_replace(fruits, c("a", "e", "i"), "-")

## [1] "one -pple"     "two p-ars"     "three bananas"

fruits <- c("one apple", "two pears", "three bananas")
str_replace(fruits, "[aeiou]", "-")

## [1] "-ne apple"     "tw- pears"     "thr-e bananas"

str_replace_all(fruits, "[aeiou]", "-")

## [1] "-n- -ppl-"     "tw- p--rs"     "thr-- b-n-n-s"

str_replace_all(fruits, "([aeiou])", "")

## [1] "n ppl"    "tw prs"   "thr bnns"

str_replace_all(fruits, "([aeiou])", "\\1\\1")

## [1] "oonee aapplee"      "twoo peeaars"       "threeee baanaanaas"

str_replace_all(fruits, "[aeiou]", c("1", "2", "3"))

## [1] "1n1 1ppl1"     "tw2 p22rs"     "thr33 b3n3n3s"

str_replace_all(fruits, c("a", "e", "i"), "-")

## [1] "one -pple"     "two p-ars"     "three bananas"

# If you want to apply multiple patterns and replacements to the same
# string, pass a named vector to pattern.
str_replace_all(str_c(fruits, collapse = "---"),
 c("one" = 1, "two" = 2, "three" = 3))

## [1] "1 apple---2 pears---3 bananas"

Regex with stringr:

pattern <- "a.b"
strings <- c("abb", "a.b")
str_detect(strings, pattern)

## [1] TRUE TRUE

str_detect(strings, fixed(pattern))

## [1] FALSE  TRUE

str_detect(strings, coll(pattern))

## [1] FALSE  TRUE

# Word boundaries
words <- c("These are   some words.")
str_count(words, boundary("word"))

## [1] 4

str_split(words, " ")[[1]]

## [1] "These"  "are"    ""       ""       "some"   "words."

str_split(words, boundary("word"))[[1]]

## [1] "These" "are"   "some"  "words"

# Regular expression variations
str_extract_all("The Cat in the Hat", "[a-z]+")

## [[1]]
## [1] "he"  "at"  "in"  "the" "at"

str_extract_all("The Cat in the Hat", regex("[a-z]+", TRUE))

## [[1]]
## [1] "The" "Cat" "in"  "the" "Hat"

str_extract_all("a\nb\nc", "^.")

## [[1]]
## [1] "a"

str_extract_all("a\nb\nc", regex("^.", multiline = TRUE))

## [[1]]
## [1] "a" "b" "c"

str_extract_all("a\nb\nc", "a.")

## [[1]]
## character(0)

str_extract_all("a\nb\nc", regex("a.", dotall = TRUE))

## [[1]]
## [1] "a\n"

Trim:

str_trim("  String with trailing and leading white space\t")

## [1] "String with trailing and leading white space"

str_trim("\n\nString with trailing and leading white space\n\n")

## [1] "String with trailing and leading white space"

To extract texts from a quanteda corpus object, the command is very simple:

mytexts <- texts(subset(inaugCorpus, President == "Washington"))
str(mytexts)

##  Named chr [1:2] "Fellow-Citizens of the Senate and of the House of Representatives:\n\nAmong the vicissitudes incident to life no event could ha"| __truncated__ ...
##  - attr(*, "names")= chr [1:2] "1789-Washington" "1793-Washington"

fruit <- c("apple", "banana", "pear", "pinapple")
str_detect(fruit, "e")

## [1]  TRUE FALSE  TRUE  TRUE

fruit[str_detect(fruit, "e")]

## [1] "apple"    "pear"     "pinapple"

str_detect(fruit, "^a")

## [1]  TRUE FALSE FALSE FALSE

str_detect(fruit, "a$")

## [1] FALSE  TRUE FALSE FALSE

str_detect(fruit, "b")

## [1] FALSE  TRUE FALSE FALSE

str_detect(fruit, "[aeiou]")

## [1] TRUE TRUE TRUE TRUE

# Also vectorised over pattern
str_detect("aecfg", letters)

##  [1]  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [23] FALSE FALSE FALSE FALSE

We challenge you to do this as simply using the tm package.

Text manipulation

Kenneth Benoit and Paul Nulty

October 18th 2015