Wouter van Atteveldt
Session 2: Transforming Data & APIs
Thursday: Introduction to R
Friday: Corpus Analysis & Topic Modeling
Saturday: Machine Learning & Sentiment Analysis
Sunday: Semantic Networks & Grammatical Analysis
Combining data
Reshaping data
cbind(df, country=c("nl", "uk", "uk"))
id age name country
1 1 14 Mary nl
2 2 18 John uk
3 3 24 Luke uk
rbind(df, c(id=1, age=2, name="Mary"))
id age name
1 1 14 Mary
2 2 18 John
3 3 24 Luke
4 1 2 Mary
countries = data.frame(id=1:2, country=c("nl", "uk"))
merge(df, countries)
id age name country
1 1 14 Mary nl
2 2 18 John uk
merge(df, countries, all=T)
id age name country
1 1 14 Mary nl
2 2 18 John uk
3 3 24 Luke <NA>
merge(data1, data2)
merge(data1, data2, by="id")
merge(data1, data2, by.x="id", by.y="ID")
merge(data1, data2, by="id", all=T)
merge(data1, data2, by="id", all.x=T)
The reshape2 package:
melt: wide to long
dcast: long to wide (pivot table)
wide = data.frame(id=1:3,
group=c("a","a","b"),
width=c(100, 110, 120),
height=c(50, 100, 150))
wide
id group width height
1 1 a 100 50
2 2 a 110 100
3 3 b 120 150
library(reshape2)
long = melt(wide, id.vars=c("id", "group"))
long
id group variable value
1 1 a width 100
2 2 a width 110
3 3 b width 120
4 1 a height 50
5 2 a height 100
6 3 b height 150
dcast(long, id + group ~ variable, value.var="value")
id group width height
1 1 a 100 50
2 2 a 110 100
3 3 b 120 150
dcast(long, group ~ variable, value.var = "value", fun.aggregate = max)
group width height
1 a 110 100
2 b 120 150
dcast(long, id ~., value.var = "value", fun.aggregate = mean)
id .
1 1 75
2 2 105
3 3 135
aggregate(long["value"], long["group"], max)
group value
1 a 110
2 b 150
Aggregate
Cast (rows ~ columns)
Vector properties
mean(x)
sd(x)
sum(x)
Basic tests
t.test(width ~ group, data=wide)
t.test(wide$width, wide$height, paired=T)
cor.test(wide$width, wide$height)
m = lm(width ~ group + height, data=wide)
summary(m)
Transforming data in R
Thursday: Introduction to R
Friday: Corpus Analysis & Topic Modeling
Saturday: Machine Learning & Sentiment Analysis
Sunday: Semantic Networks & Grammatical Analysis
install_github("geoffjentry/twitteR")
setup_twitter_oauth(...)
tweets = searchTwitteR("#Trump2016", resultType="recent", n = 10)
tweets = plyr::ldply(tweets, as.data.frame)
install_github("pablobarbera/Rfacebook", subdir="Rfacebook")
fb_token = fbOAuth(fb_app_id, fb_app_secret)
p = getPage(page="nytimes", token=fb_token)
post = getPost(p$id[1], token=fb_token)
install.packages("rtimes")
options(nytimes_as_key = nyt_api_key)
res = as_search(q="trump",
begin_date = "20160101",
end_date = '20160501')
arts = plyr::ldply(res$data,
function(x) c(headline=x$headline$main,
date=x$pub_date))
httr (or RCurl)
domain = 'https://api.nytimes.com'
path = 'svc/search/v2/articlesearch.json'
url = paste(domain, path, sep='/')
query = list(`api-key`=key, q="clinton")
r = httr::GET(url, query=query)
status_code(r)
result = content(r)
result$response$docs[[1]]$headline
Accessing APIs
Break
Hand-outs:
Mini-project: Retrieve data about a topic of your interest
Thursday: Introduction to R
Friday: Corpus Analysis & Topic Modeling
Saturday: Machine Learning & Sentiment Analysis
Sunday: Semantic Networks & Grammatical Analysis