Mhairi McNeill
16/09/2015
library(tm)
# Read the reviews, keeping string columns as character vectors.
reviews <- read.csv(file = "reviews.csv", stringsAsFactors = FALSE)
# Wrap the review text in a tm source and build the corpus from it.
review_source <- VectorSource(x = reviews$text)
corpus <- Corpus(x = review_source)
inspect(x = corpus)
[1] " SO ADDICTING DEFF DOWNLAOD ITS EPIC YOU CAT LOVERS WILL FALL IN LOVE <3"
# Lower-case the corpus; tolower is wrapped with content_transformer()
# before being handed to tm_map().
corpus <- tm_map(x = corpus, FUN = content_transformer(tolower))
inspect(x = corpus)
[1] " so addicting deff downlaod its epic you cat lovers will fall in love <3"
# Apply tm's removePunctuation transformation to the corpus.
corpus <- tm_map(x = corpus, FUN = removePunctuation)
inspect(x = corpus)
[1] " so addicting deff downlaod its epic you cat lovers will fall in love 3"
# Apply tm's stripWhitespace transformation to the corpus.
corpus <- tm_map(x = corpus, FUN = stripWhitespace)
inspect(x = corpus)
[1] " so addicting deff downlaod its epic you cat lovers will fall in love 3"
# Remove the English stop-word list; the word vector is forwarded by
# tm_map() to removeWords().
english_stopwords <- stopwords("english")
corpus <- tm_map(corpus, removeWords, english_stopwords)
inspect(x = corpus)
[1] " addicting deff downlaod epic cat lovers will fall love 3"
# Apply tm's stemDocument transformation to the corpus.
corpus <- tm_map(x = corpus, FUN = stemDocument)
inspect(x = corpus)
[1] " addict deff downlaod epic cat lover will fall love 3"
# Build both matrix orientations from the cleaned corpus, then report the
# dimensions of the document-term matrix.
dtm <- DocumentTermMatrix(x = corpus)
tdm <- TermDocumentMatrix(x = corpus)
dim(x = dtm)
[1] 1000 2250
# Drop sparse terms using a sparsity threshold of 0.99, then report the
# dimensions of the reduced matrix.
dtm_small <- removeSparseTerms(x = dtm, sparse = 0.99)
dim(x = dtm_small)
[1] 1000 205
# Sum each term column of the dense document-term matrix, order the totals
# from largest to smallest, and show the first few.
frequency <- sort(colSums(as.matrix(dtm)), decreasing = TRUE)
head(x = frequency)
game play great fun good love
976 250 249 241 238 234
library(wordcloud)
# Plot the most frequent terms, at most 100 of them. head() is used instead
# of [1:100] so that a vocabulary with fewer than 100 terms is not padded
# with NA names and NA counts.
top_terms <- head(frequency, 100)
wordcloud(names(top_terms), top_terms)
# Fit a linear model of the review rating on every column of the reduced
# document-term matrix, converted to a data frame.
model <- lm(
  formula = reviews$rating ~ .,
  data = as.data.frame(as.matrix(dtm_small))
)
# %>%, group_by() and summarise() come from dplyr, so it has to be attached
# before this step runs.
library(dplyr)
# Paste together the text of all reviews that share a location, giving one
# row (and later one document) per location.
by_country <-
  reviews %>%
  group_by(location) %>%
  summarise(text = paste(text, collapse = " "))
# Rebuild the source and corpus from the per-location text.
review_source <- VectorSource(by_country$text)
corpus <- Corpus(review_source)
Country | Top Terms |
---|---|
UK | iphone, version, watch |
US | everything, wish, back |
Australia | gems, amount, phone |
New Zealand | clans, troops, thanks |
Thanks for listening!