Emoji Analysis

This analysis is based on a blog post and code by Jessica Peterka-Bonetta.

I made only small customizations to the code.

Load the dataset

First, let’s reload our dataset.

# install.packages("streamR")
library(streamR)

# Location of the streamed tweets (JSON), relative to this document.
file <- "../data/stream/stream_got.json"
# file <- "~/Dropbox (UNC Charlotte)/summer-2017-social-media-workshop/data/stream/stream_got.json"

# Parse the JSON stream into a data frame of tweets; see ?parseTweets.
tweets <- parseTweets(tweets = file)

# Re-encode the tweet text as ASCII. With sub = "byte" every byte that cannot
# be converted is written out as a "<xx>" hex escape, so emojis survive as
# plain-text byte sequences that can be searched for later.
tweets[["text"]] <- iconv(
  tweets[["text"]],
  from = "UTF-8",
  to = "ASCII",
  sub = "byte"
)

Next, let’s create a dictionary for all of the emojis.

library(tidyverse); library(stringr); library(Unicode)

# Load the helper functions. NOTE(review): presumably this is where
# emojis_matching(), used further down, is defined -- confirm.
source("./emoji_functions.R")

# Semicolon-delimited emoji dictionary.
emoji_file <- "../data/emojis.csv"

# Read the dictionary, add a lower-cased copy of the English name (EN) as
# `description`, and convert the code points to Unicode::u_char so they can be
# joined with other emoji tables later on.
emDict <- emoji_file %>%
  readr::read_delim(delim = ";") %>%
  mutate(
    description = tolower(EN),
    unicode = as.u_char(unicode)
  )

Then, let’s match the emojis in the tweets and count the most used ones.

# Inputs for emojis_matching(): the pattern to search for in each tweet and
# the label reported for each hit.
# NOTE(review): ftu8 presumably holds the byte-escaped form of each emoji, as
# produced by iconv(..., sub = "byte") on the tweet text -- confirm.
matchto <- emDict$ftu8
# Use the lower-cased `description` column built when emDict was created
# rather than the raw EN column, so labels are cased the same way as in the
# sentiment analysis below.
description <- emDict$description

# Sum the per-tweet match counts for every emoji and sort by total use,
# most used first.
# NOTE: the name `rank` is also used by base::rank(); it is kept here because
# later code may refer to this object.
rank <- emojis_matching(tweets$text, matchto, description) %>%
  group_by(description) %>%
  summarise(n = sum(count)) %>%
  arrange(desc(n))

# ten most used emojis
head(rank, n = 10)
## # A tibble: 10 x 2
##                     description     n
##                           <chr> <int>
##  1       face with tears of joy    73
##  2                raising hands    49
##  3 smiling face with heart-eyes    24
##  4           loudly crying face    21
##  5       face screaming in fear    17
##  6                  pile of poo    13
##  7                         fire     9
##  8                    red heart     9
##  9                        crown     7
## 10                       dragon     7

Emoji Sentiment Analysis

We can then do an emoji “sentiment analysis”.

To do this, we use the Emoji Sentiment Ranking site (the URL is in the code below), which provides a sentiment score for each emoji.

library(xml2); library(rvest)

# Emoji Sentiment Ranking page that holds the sentiment table.
# NOTE: the name `url` is also used by base::url(); kept as in the original.
url <- "http://kt.ijs.si/data/Emoji_sentiment_ranking/index.html"

# Download the page, turn its HTML table(s) into a data frame and drop the
# twemoji image and sentiment-bar columns, which are not used below.
emojis_raw <- read_html(url) %>%
  html_table() %>%
  data.frame() %>%
  select(-c(Image.twemoji., Sentiment.bar.c.i..95..))

# Rename the remaining columns (by position) to short, consistent names.
colnames(emojis_raw) <- c(
  "char", "unicode", "occurrences", "position", "negative", "neutral",
  "positive", "sentiment_score", "description", "block"
)

# Convert the code points to Unicode::u_char so they can be matched against
# emDict$unicode, and lower-case the descriptions.
emojis <- mutate(
  emojis_raw,
  unicode = as.u_char(unicode),
  description = tolower(description)
)

# Join with emDict on the code point to pick up the encoding used for
# matching. merge() keeps only code points present in both tables, and because
# both tables have a `description` column they come out as description.x
# (sentiment table) and description.y (emDict).
emojis_merged <- merge(emojis, emDict, by = "unicode")

# Vectors handed to emojis_matching(): pattern, label and sentiment score.
new_matchto <- emojis_merged$ftu8
new_description <- emojis_merged$description.x
sentiment <- emojis_merged$sentiment_score

# Score each distinct tweet text by its emojis: every matched emoji
# contributes (its sentiment score x the number of times it occurs), and the
# contributions are summed per text. Texts whose score is NA are dropped and
# the rest are sorted from most to least positive.
# NOTE(review): tweet_count is n(), i.e. the number of rows returned by
# emojis_matching() for a given text. Identical texts (e.g. retweets) are
# grouped together, so this is not necessarily a count of tweets -- confirm
# the intended meaning.
sentiments <- tweets$text %>%
  emojis_matching(new_matchto, new_description, sentiment) %>%
  mutate(sentiment = count * as.numeric(sentiment)) %>%
  group_by(text) %>%
  summarise(
    sentiment_score = sum(sentiment),
    tweet_count = n()
  ) %>%
  filter(!is.na(sentiment_score)) %>%
  arrange(desc(sentiment_score))

# the ten tweets with the highest emoji sentiment score
head(sentiments, n = 10)
## # A tibble: 10 x 3
##                                                                           text
##                                                                          <chr>
##  1 RT @LordSnow: Daenerys has finally made it to Westeros <ed><a0><bd><ed><b9>
##  2 RT @ItsGoTQuote: Some of the #GameofThrones cast for Time Magazine <ed><a0>
##  3 <ed><a0><bd><ed><b8><82><ed><a0><bd><ed><b8><82><ed><a0><bd><ed><b8><82><ed
##  4 "RT @francy_flo: TELL THEM QUEEN <ed><a0><bd><ed><b1><91><e2><9d><84><ef><b
##  5 It warms my heart <ed><a0><bd><ed><b2><9c><ed><a0><bd><ed><b2><99><ed><a0><
##  6 <ed><a0><bd><ed><b8><ad><ed><a0><bd><ed><b8><ad><ed><a0><bd><ed><b8><ad><ed
##  7 RT @darth_daisy_: <ed><a0><bd><ed><b8><ad><ed><a0><bd><ed><b8><ad><ed><a0><
##  8 RT @Rawrist: #GameofThrones is getting intense this season. <ed><a0><bd><ed
##  9 "RT @Emilie__Caron: When Arya Stark commits a mass murder: \n\n#GoTS7\n#Gam
## 10 RT @LordSnow: I am a humble king <ed><a0><bd><ed><b8><89> #GameOfThrones ht
## # ... with 2 more variables: sentiment_score <dbl>, tweet_count <int>