Wouter van Atteveldt
Session 1: Managing data in R
# Fetch the 100 most recent tweets tagged #bigdata.
# NOTE(review): searchTwitteR() presumably needs an authenticated session
# (e.g. twitteR::setup_twitter_oauth) created beforehand -- not shown here; confirm.
library(twitteR)
tweets = searchTwitteR("#bigdata", resultType="recent", n = 100)
# Convert every returned status to a data frame and combine them into one
tweets = plyr::ldply(tweets, as.data.frame)
# Render id, timestamp and text of the first rows as a table. kable() is
# namespace-qualified because knitr is never attached in this snippet.
knitr::kable(head(tweets[c("id", "created", "text")]))
id | created | text |
---|---|---|
737606276188753921 | 2016-05-31 11:26:54 | #BigData : comment s'enrichir en partageant #tribune @LesEchos https://t.co/6kbaQmd40J |
737606250024689666 | 2016-05-31 11:26:48 | RT @jamesturner247: Is Big Data Taking Us Closer to the Deeper Questions in Artificial Intelligence? https://t.co/Z7ZsI1mzLB #ArtificialInt… |
737606227358711809 | 2016-05-31 11:26:43 | RT @jamesturner247: Big Data and the Cloud: Uncover New #Insights Hiding in Your Data https://t.co/NM9BNukkXX #BigData #DataScience #Health… |
737606216243761152 | 2016-05-31 11:26:40 | momentum in today’s #BigData #data #analytics landscape. https://t.co/poK5ksaOTO https://t.co/Mmzlf6vJS1 |
737606192755675141 | 2016-05-31 11:26:35 | Heather Knight is speaking at #smartcon2016 in Istanbul Marilyn Monrobot - Kurucu, Robotist #bigdata #IoT #Startup https://t.co/xuHZR6vORY |
737606191333793792 | 2016-05-31 11:26:34 | RT @Informatica: At @strataconf, learn how to turn #bigdata into big value! https://t.co/T2Jvn3JRqh #StrataHadoop https://t.co/tqneHPVTzk |
# Turn the tweet texts into a document-term matrix and draw a word cloud from it
library(RTextTools)
library(corpustools)
# create_matrix() is presumably RTextTools' DTM builder -- one document per tweet text
dtm = create_matrix(tweets$text)
# freq.fun = sqrt: presumably applied to the term frequencies before plotting,
# which would dampen the most frequent words -- TODO confirm in the corpustools docs
dtm.wordcloud(dtm, freq.fun = sqrt)
Thursday: Introduction to R
Friday: Corpus Analysis & Topic Modeling
Saturday: Machine Learning & Sentiment Analysis
Sunday: Semantic Networks & Grammatical Analysis
# Download and install a package from the configured repository (CRAN by default)
install.packages("plyr")
# Attach the package so its functions can be called without a prefix
library(plyr)
# Refer to a single function through its namespace, without attaching the package
plyr::rename
# Install a package directly from a GitHub repository via devtools
devtools::install_github("amcat/amcat-r")
x = 12
class(x)
[1] "numeric"
x = c(1, 2, 3)
class(x)
[1] "numeric"
x = "a text"
class(x)
[1] "character"
# Example data frame used throughout the following slides: three rows with an
# integer id, a numeric age and a name.
# stringsAsFactors = TRUE is spelled out because the default changed to FALSE
# in R 4.0.0; without it class(df$name) is "character" rather than the
# "factor" these slides show and discuss.
df = data.frame(id = 1:3, age = c(14, 18, 24),
                name = c("Mary", "John", "Luke"),
                stringsAsFactors = TRUE)
df
id age name
1 1 14 Mary
2 2 18 John
3 3 24 Luke
class(df)
[1] "data.frame"
df$age
[1] 14 18 24
df[["age"]]
[1] 14 18 24
class(df$age)
[1] "numeric"
class(df$name)
[1] "factor"
Data frames:
colnames(df)
head(df)
tail(df)
nrow(df)
ncol(df)
summary(df)
Vectors:
mean(df$age)
length(df$age)
Subsetting
Recoding & Renaming columns
Ordering
df[1:2, 1:2]
id age
1 1 14
2 2 18
# Logical row filter: keep rows whose id is odd; the empty column slot keeps all columns
df[df$id %% 2 == 1, ]
id age name
1 1 14 Mary
3 3 24 Luke
df[, c("id", "name")]
id name
1 1 Mary
2 2 John
3 3 Luke
subset(df, id == 1)
id age name
1 1 14 Mary
# subset() evaluates the condition inside df, so columns are named without the df$ prefix
subset(df, id >1 & age < 20)
id age name
2 2 18 John
# Work on a copy so the original df stays untouched
df2 = df
# Derived column; computed first because age and id are modified below
df2$age2 = with(df2, age + id)
# Set the age of the row with id 1 to missing
df2$age = replace(df2$age, df2$id == 1, NA)
# Assigning NULL removes a column
df2$id = NULL
# Logical flag; NA wherever age is missing
df2$old = df2$age > 20
# Recode the flag into a label; a missing flag yields a missing label
df2$agecat = ifelse(df2$old, "Old", "Young")
df2
age name age2 old agecat
1 NA Mary 15 NA <NA>
2 18 John 20 FALSE Young
3 24 Luke 27 TRUE Old
character vs factor
# Start again from a fresh copy of df
df2 = df
# String functions operate on character vectors, so convert the column first
# (this changes nothing if name is already character)
df2$name = as.character(df2$name)
# Prefix every name except the one in the row with id 1
df2$name[df2$id != 1] =
  paste("Mr.", df2$name[df2$id != 1])
df2$name = toupper(df2$name)
# Replace a literal dot plus any whitespace after it with an underscore
df2$name = gsub("\\.\\s*", "_", df2$name)
# Case-insensitive match. TRUE is spelled out because T is an ordinary
# variable that can be reassigned, whereas TRUE is a reserved word.
df2[grepl("mr", df2$name, ignore.case = TRUE), ]
id age name
2 2 18 MR_JOHN
3 3 24 MR_LUKE
# Three ways to rename columns, applied to a fresh copy of df
df2 = df
# Replace all column names at once
colnames(df2) = c("ID", "AGE", "NAME")
# Replace a single column name by position
colnames(df2)[2] = "leeftijd"
# Replace by an old-name = new-name mapping (plyr)
df2 = plyr::rename(df2, c("NAME"="naam"))
df2
ID leeftijd naam
1 1 14 Mary
2 2 18 John
3 3 24 Luke
# order() returns the row indices that sort age ascending; use them to index the rows
df[order(df$age), ]
id age name
1 1 14 Mary
2 2 18 John
3 3 24 Luke
# Sort by descending age: negating the numeric column reverses the order
plyr::arrange(df, -age)
id age name
1 3 24 Luke
2 2 18 John
3 1 14 Mary
df$col, df[["col"]], df[c("col1", "col2")]
df[rows, columns]
l$el, l[["el"]], l[[1]]
l[[1:3]]
m[rows, columns]
Organizing Data
Break
Hand-outs: