Load the data and convert to a data frame.
library(rjson)
logs <- fromJSON(file="./data/crystalball_userlog_final.json")
logs <- lapply(logs, function(x) {
x[sapply(x, is.null)] <- NA
unlist(x)
})
df <- do.call("rbind", logs)
df <- data.frame(user = row.names(df), log = df, stringsAsFactors = F)
How many unique users do we have?
length(unique(df$user))
## [1] 81
The data excludes test accounts and four users for incomplete data.
First, reshape the data from “wide” to “long” to create the time variables.
library(reshape)
df <- reshape(df, varying = c("log.1","log.2","log.3","log.4","log.5",
"log.6","log.7","log.8","log.9","log.10"), idvar = "user", direction = "long")
Next, create two custom variables for our groups.
df$frame <- ifelse(substr(df$user,0,1)=="h","High","Low")
df$train <- ifelse(substr(df$user,2,2)=="t","Time","Spatial")
table(df$frame, df$train)/10
##
## Spatial Time
## High 19 21
## Low 23 18
df$time <- df$time / 10
Notice that the groups are spread fairly evenly.
First, upload user-level attributes.
atts <- readr::read_csv("./data/demographics-pretest.csv")
Next, use the pairsD3 function to scatter the personality scores.
library(pairsD3)
pairsD3(atts[,9:13], group = as.factor(atts$GENDER))
Run a quick k=5 k-means. This was explored as a feature for personality clusters.
kmeans <- kmeans(atts[,9:13], centers = 5, iter.max = 10, nstart =3,
algorithm = c("Hartigan-Wong"), trace=FALSE)
pairsD3(atts[,9:13], group = as.factor(kmeans$cluster))
atts$flag2 <- ifelse(kmeans$cluster=="2",1,0)
atts$flag3 <- ifelse(kmeans$cluster=="3",1,0)
Alternatively, create binary flags for each personality measure.
Values above the mean are given a 1, values below are 0.
summary(atts[,9:13])
## Extraversion Agreeableness Conscientiousness Neuroticism
## Min. :2.100 Min. :2.700 Min. :2.70 Min. :1.300
## 1st Qu.:3.800 1st Qu.:4.900 1st Qu.:4.20 1st Qu.:3.100
## Median :4.400 Median :5.400 Median :5.00 Median :3.600
## Mean :4.315 Mean :5.352 Mean :4.89 Mean :3.705
## 3rd Qu.:5.000 3rd Qu.:5.900 3rd Qu.:5.60 3rd Qu.:4.500
## Max. :6.800 Max. :6.900 Max. :6.70 Max. :6.000
## Openness
## Min. :3.000
## 1st Qu.:4.600
## Median :5.000
## Mean :5.056
## 3rd Qu.:5.600
## Max. :6.700
atts$ExtravertFlag <- ifelse(atts$Extraversion>mean(atts$Extraversion),1,0)
atts$AgreeableFlag <- ifelse(atts$Agreeableness>mean(atts$Agreeableness),1,0)
atts$ConscientiousFlag <- ifelse(atts$Conscientiousness>mean(atts$Conscientiousness),1,0)
atts$NeuroticismFlag <- ifelse(atts$Neuroticism>mean(atts$Neuroticism),1,0)
atts$OpennessFlag <- ifelse(atts$Openness>mean(atts$Openness),1,0)
library(dplyr)
atts %>%
group_by(substr(atts$ID,1,2)) %>%
summarise(extra = mean(Extraversion),
agree = mean(Agreeableness),
consc = mean(Conscientiousness),
neuro = mean(Neuroticism),
open = mean(Openness))
## # A tibble: 4 x 6
## `substr(atts$ID, 1, 2)` extra agree consc neuro open
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 hg 4.473684 5.431579 4.989474 3.673684 4.978947
## 2 ht 4.347619 5.242857 4.757143 3.609524 5.119048
## 3 lg 4.486957 5.547826 5.160870 3.591304 5.217391
## 4 lt 3.888889 5.144444 4.594444 3.994444 4.855556
We can test significant levels with a t-test.
t.test(atts$Agreeableness~atts$GENDER)
##
## Welch Two Sample t-test
##
## data: atts$Agreeableness by atts$GENDER
## t = 0.87279, df = 71.562, p-value = 0.3857
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.2209302 0.5649938
## sample estimates:
## mean in group F mean in group M
## 5.432558 5.260526
Last, let’s create binary flags for major graduate level (Undegrad = 1, Grad/Prof = 0)
table(atts$Occupation)
##
## ANALYTIC CONSULTANT BANKING BUSINESS SYSTEMS CONSULTANT
## 1 1
## Faculty/Staff Master
## 2 45
## PhD Undergraduate
## 7 25
atts$Undergrad <- ifelse(atts$Occupation=="Undergraduate",1,0)
t.test(atts$Openness~atts$Undergrad)
##
## Welch Two Sample t-test
##
## data: atts$Openness by atts$Undergrad
## t = 3.0804, df = 41.527, p-value = 0.00366
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.1872119 0.8992167
## sample estimates:
## mean in group 0 mean in group 1
## 5.223214 4.680000
Also, let’s explore the major. I have manually created a flag (MajorS) for Computing and Non-Computing.
table(atts$Major)
##
## Architecture
## 6
## Architecture & Business
## 1
## Architecture/IT
## 1
## Bioinformatics
## 2
## Business
## 7
## Business;Computer Science
## 2
## Business;Computer Science;Data Science and Business Analytics
## 1
## Business;Economics
## 1
## Business;Psychology
## 1
## Communication
## 3
## Computer Science
## 18
## Computer Science;Architecture
## 2
## Data Science
## 1
## Data Science an Business Analytics
## 1
## data science and business analytics
## 1
## Datascience and business analytics
## 1
## Data Science and Business Analytics
## 4
## DATA SCIENCE AND BUSINESS ANALYTICS
## 2
## DSBA
## 3
## DSBA Program
## 1
## Earth and Environmental Science
## 1
## Electrical and Computer Engineering
## 1
## Electrical Engineering
## 2
## Geography
## 1
## GEOGRAPHY
## 1
## History
## 1
## Information Technology
## 1
## Interdisciplinary Studies Degree
## 4
## Interdisciplinary Studies Degree;HEALTH INFORMATICS
## 1
## Post-Bac
## 1
## Pre-Public Health
## 1
## Psychology
## 1
## Psychology;Chemistry
## 1
## Public Health
## 1
## SIS
## 1
## Social Work
## 2
## Undeclared
## 1
table(atts$MajorS)
##
## 0 1
## 35 46
df2 <- merge(df, atts, by.x = "user", by.y = "ID", all.x = TRUE)
Let’s explore an example of the log sequence.
df$log[1]
## [1] "crystalball_login - menu_click_menu icon - menu date picker_click - map_click_cluster - map_navigate_zoom - map_navigate_move - map_navigate_zoom - map_click_button find events - calendar_hover_starglyph - calendar_hover_location - calendar_hover_keywordmore - calendar_hover_starglyph - calendar_hover_keywordmore - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_location - calendar_hover_location - calendar_click_favorite - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_location - calendar_hover_location - calendar_click_favorite - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_location - calendar_hover_starglyph - calendar_hover_location - calendar_hover_location - calendar_hover_location - calendar_hover_location - calendar_hover_location - calendar_hover_starglyph - calendar_hover_location - calendar_hover_location - map_navigate_zoom - map_navigate_zoom - map_navigate_zoom - map_navigate_zoom - map_navigate_move - map_navigate_zoom - map_navigate_zoom - map_navigate_zoom - map_click_cluster - map_click_circle - map_click_circle - map_click_circle - map_click_circle - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_location - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_location - calendar_hover_location - calendar_hover_starglyph - calendar_hover_starglyph"
Note that most of the actions are divided into two words, with a “_“. The one exception is”menu date“. Let’s correct them and then use a string split on”-“.
alt <- c("menu date picker_click","menu_date_picker_click",
"social network_hover","social_network_hover",
"social network_navigate","social_network_navigate",
"social network_click","social_network_click",
"word cloud_click","word_cloud_click",
"word cloud_hover","word_cloud_hover",
"word cloud_navigate","word_cloud_navigate",
"menu_click_menu icon","menu_click_menu_icon",
"menu_click_button followerfriend ratio","menu_click_button_followerfriend_ratio",
"map_click_button date distribution","map_click_button_date_distribution",
"menu_click_tweet icon","menu_click_tweet_icon",
"map_click_button find events","map_click_button_find_events",
"menu_click_favorite icon","menu_click_favorite_icon",
"followerfriend ratio","followerfriend_ratio",
"tweet_click_button more","tweet_click_button_more")
# replace all actions with _
for (i in c(1,3,5,7,9,11,13,15,17,19,21,23,25,27,29)){
df$log <- gsub(alt[1*i],alt[1*i+1],df$log)
}
# replace " - " with " "
df$log <- gsub(" - "," ",df$log)
df$log <- gsub("_",".",df$log)
df$log <- gsub(" "," ",df$log)
raw <- strsplit(df$log, " ")
df$actions <- as.integer(lapply(raw, length))
df$unique.actions <- as.integer(lapply(lapply(raw, unique), length))
write.csv(df, "./data/clean.csv", row.names = F)
Explore sample action statistics by group.
library(tidyverse)
df %>% group_by(frame, train, time) %>% summarise(AvgActions=mean(actions),
StdActions=sd(actions),
AvgUnqActions=mean(unique.actions),
StdUnqActions=sd(unique.actions))
## # A tibble: 40 x 7
## # Groups: frame, train [?]
## frame train time AvgActions StdActions AvgUnqActions StdUnqActions
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 High Spatial 0.1 72.10526 34.68572 12.684211 3.873738
## 2 High Spatial 0.2 72.10526 34.68572 10.894737 4.293338
## 3 High Spatial 0.3 72.10526 34.68572 9.631579 3.269565
## 4 High Spatial 0.4 72.10526 34.68572 9.421053 3.790570
## 5 High Spatial 0.5 72.10526 34.68572 8.894737 3.125073
## 6 High Spatial 0.6 72.10526 34.68572 10.526316 4.599135
## 7 High Spatial 0.7 72.10526 34.68572 9.052632 4.116008
## 8 High Spatial 0.8 72.10526 34.68572 8.894737 4.280378
## 9 High Spatial 0.9 72.10526 34.68572 9.105263 2.726307
## 10 High Spatial 1.0 72.10526 34.68572 8.315789 4.164034
## # ... with 30 more rows
While at first glance it appears that High-Time and Low-Time have very high number of actions, notice that the standard deviations are very large – so much so that, for example, that all the other values are within 1 std dev of the largest (High-Time). This means that there is not a large enough sample to derive much statistical significance.
We can plot the histograms.
hist(df$actions,
breaks = 20,
main = "Histogram of Number of User Actions",
xlab = "User Actions",
ylab = "Number of Participant by Time Decile")
hist(df$unique.actions,
breaks = 20,
main = "Histogram of Number of Unique Actions",
xlab = "Unique Actions",
ylab = "Number of Participant by Time Decile")
Note that we are getting more distinct distributions by creating more documents (e.g. by breaking down the logs by time) and adding more distinct actions. Note that there are nearly 30 unique actions now.
Let’s run a bag-of-words (DFM) with a 1-3 n-gram.
library(quanteda)
myCorpus <- corpus(df$log)
docvars(myCorpus, "frame") <- df$frame
docvars(myCorpus, "train") <- df$train
docvars(myCorpus, "time") <- df$time
docvars(myCorpus, "extra") <- df2$ExtravertFlag
docvars(myCorpus, "agree") <- df2$AgreeableFlag
docvars(myCorpus, "conscient") <- df2$ConscientiousFlag
docvars(myCorpus, "neurotic") <- df2$NeuroticismFlag
docvars(myCorpus, "open") <- df2$OpennessFlag
docvars(myCorpus, "gender") <- df2$GENDER
docvars(myCorpus, "major") <- df2$MajorS
docvars(myCorpus, "undergrad") <- df2$Undergrad
docvars(myCorpus, "age") <- ifelse(df2$AGE>22,1,0)
# run DTM
dfm <- dfm(myCorpus, ngrams = 1:3)
# remove features not found in at least 50 "docs" -- modified from 25 when user-level
dfm <- dfm_trim(dfm, min_docfreq = 50)
Let’s examine the top actions…
x <- topfeatures(dfm)
plot(tfidf(dfm))
We can also run a dendrogram (clustering). I left the labels off because their length (e.g. tri-actions) is too long.
wordDfm <- sort(weight(dfm, "tf"))
wordDfm <- t(wordDfm)[1:50,] # keep the top 50 words
wordDistMat <- dist(wordDfm)
wordCluster <- hclust(wordDistMat)
# Define nodePar
plot(wordCluster, main="Frequency weighting (Labels Removed)", xlab=NA, sub=NA, labels = F)
Let’s convert our data to the stmdfm.
library(stm)
# use quanteda converter to convert our Dfm
stmdfm <- convert(dfm, to = "stm", docvars = docvars(myCorpus))
out <- prepDocuments(stmdfm$documents, stmdfm$vocab, stmdfm$meta, lower.thresh = 50)
Let’s first run a model search to obtain the number of topics.
c <- c(5,8,10,15,20,30,40)
kresult <- searchK(out$documents,
out$vocab,
K = c,
prevalence=~ s(time,5) + train + frame,
data=out$meta,
max.em.its = 100,
seed = 300
)
plot(kresult)
From this analysis, we decided on 8 topics given the high semantic coherence, relatively high held-out likelihood and parsimony. One key reason was we found 8 topics is where we found the “elbow” of the held-out likelihood. Note: we decided on 8 topics rather than 10 topics (e.g. since lower average Semantic Coherence) because the two additional topics simply broke out the Map View and Calendar View into to separate topics respectively (Map View Pan and Click, and Calendar View Dotted vs Solid Line). Collectively, we decided for the sake of parsimony, it would make more sense to keep these two views together rather than seperated.
We tested our model results (estimated anchor bias effects) with both (8 or 10 topics) and found both results were consistent.
Let’s run the model with three covariates: time (b-spline), training and framing.
k <- 8
stmFit <- stm(out$documents, out$vocab, K = k, prevalence =~ s(time,5) + train + frame,
max.em.its = 150, data = out$meta, init.type = "Spectral", seed = 300)
topicNames <- labelTopics(stmFit, n = 5)
t(topicNames$prob)
## [,1]
## [1,] "word.cloud.navigate"
## [2,] "word.cloud.navigate_word.cloud.navigate"
## [3,] "word.cloud.navigate_word.cloud.navigate_word.cloud.navigate"
## [4,] "word.cloud.hover_word.cloud.navigate"
## [5,] "word.cloud.navigate_word.cloud.hover"
## [,2]
## [1,] "social.network.navigate"
## [2,] "social.network.navigate_social.network.navigate"
## [3,] "social.network.navigate_social.network.navigate_social.network.navigate"
## [4,] "social.network.navigate_social.network.hover"
## [5,] "social.network.hover"
## [,3]
## [1,] "calendar.hover.keywordmore"
## [2,] "calendar.hover.keywordmore_calendar.hover.keywordmore"
## [3,] "calendar.hover.keywordmore_calendar.hover.keywordmore_calendar.hover.keywordmore"
## [4,] "calendar.hover.starglyph"
## [5,] "calendar.hover.emotionbar"
## [,4]
## [1,] "social.network.hover"
## [2,] "social.network.hover_social.network.hover"
## [3,] "social.network.hover_social.network.hover_social.network.hover"
## [4,] "word.cloud.hover_social.network.hover_social.network.hover"
## [5,] "word.cloud.hover_social.network.hover"
## [,5]
## [1,] "word.cloud.hover"
## [2,] "word.cloud.hover_word.cloud.hover"
## [3,] "word.cloud.hover_word.cloud.hover_word.cloud.hover"
## [4,] "word.cloud.click"
## [5,] "word.cloud.hover_word.cloud.click"
## [,6]
## [1,] "calendar.hover.circle"
## [2,] "calendar.hover.solidline"
## [3,] "calendar.hover.dottedline"
## [4,] "calendar.hover.circle_calendar.hover.circle"
## [5,] "calendar.hover.solidline_calendar.hover.circle"
## [,7]
## [1,] "calendar.hover.starglyph"
## [2,] "calendar.hover.starglyph_calendar.hover.starglyph"
## [3,] "calendar.hover.location"
## [4,] "calendar.hover.starglyph_calendar.hover.starglyph_calendar.hover.starglyph"
## [5,] "calendar.hover.starglyph_calendar.hover.location"
## [,8]
## [1,] "map.navigate.zoom"
## [2,] "map.navigate.zoom_map.navigate.zoom"
## [3,] "map.navigate.move"
## [4,] "map.click.circle"
## [5,] "map.navigate.zoom_map.navigate.zoom_map.navigate.zoom"
Using the word probabilities (top 5), let’s give each topic the name of the first action (for simplicity).
topic <- data.frame(
topicnames = c('Word Cloud: Navigation',
'Social Network: Navigation',
'Event List: All Tools',
'Social Network & Word Cloud',
'Word Cloud: Click',
'Calender View',
'Event Location & Flower Glyph',
'Map View'
),
TopicNumber = 1:k,
TopicProportions = colMeans(stmFit$theta))
Let’s plot the topic sizes.
plot(stmFit,
type = "summary",
xlim = c(0,.45),
ylim = c(0.2,8.4),
custom.labels = topic$topicnames,
main = "Topic Proportions by Interaction Clusters",
text.cex = 1)
Next, let’s calculate the effect of the two factors (frame = High or Low, and train = Geo or Time).
prep <- estimateEffect(1:k ~ s(time,5) + frame + train, stmFit, meta = out$meta, uncertainty = "Global")
We can then use the plot.estimateEffect function to compare the effect of the “train” field (Geo or Time) on topic proportions (likelihood of the topic).
We can then rank the topics by their point estimates.
# order based on Expected Topic Proportion
trank = order(topic$TopicProportions, decreasing = T)
temp.topic <- topic[trank,]
x <- plot(prep, "train", method = "difference", model = stmFit,
cov.value1 = "Time", cov.value2 = "Spatial",
verbose.labels = F,
topics = temp.topic$TopicNumber,
custom.labels = temp.topic$topicnames,
#custom.labels = " ",
labeltype = "custom",
ylab = "Expected Topic Difference",
xlab = "Geo Not Significant Time",
main = "Effect of Visual Anchor",
xlim = c(-0.2,0.2),
ylim = c(0.4,8.4),
width = 40,
ci.level = 0.95)
# optional: order based on Topic Prevalance
frank = order(unlist(Result$means))
#temp.topic <- topic[frank,]
plot(prep, "frame", method = "difference", model = stmFit,
cov.value1 = "High", cov.value2 = "Low",
verbose.labels = F,
topics = temp.topic$TopicNumber,
custom.labels = temp.topic$topicnames,
#custom.labels = " ",
labeltype = "custom",
ylab = "Expected Topic Difference (95% CI)",
xlab = "Low Not Significant High",
main = "Effect of Numerical Anchor",
xlim = c(-0.2,0.2),
ylim = c(0.4,8.4),
width = 40,
ci.level = 0.95)
95% confidence intervals have been added.
par(mfrow = c(2,2),mar = c(4,4,2,2))
for (i in trank){
plot(prep, "time", method = "continuous", topics = i, model = z,
main = paste0(topic$topicnames[i]),
printlegend = FALSE, ylab = "Exp. Topic Prob",
xlab = "Time (Deciles of Action)", ylim = c(-0.01,0.5),
ci.level = 0.95
)
}
par(mfrow = c(2,2),mar = c(4,4,2,2))
for (i in trank){
plot(prep, "time", method = "continuous", topics = i, main = paste0(topic$topicnames[i]),
printlegend = FALSE, ylab = "Exp. Topic Prob", xlab = "Time (Deciles of Action)",
moderator = "train", moderator.value = "Time", linecol = "red", ylim = c(-0.01 ,0.5),
ci.level = 0)
plot(prep, "time", method = "continuous", topics = i,
printlegend = FALSE, ylab = "Exp. Topic Prob", xlab = "Time (Deciles of Action)",
moderator = "train", moderator.value = "Spatial", linecol = "blue", add = "T",
ylim = c(-0.01 ,0.5), ci.level = 0)
legend(0.65, 0.5, c("Time", "Geo"), lwd = 2, col = c("red", "blue"))
}
par(mfrow = c(2,2),mar = c(4,4,2,2))
for (i in frank){
plot(prep, "time", method = "continuous", topics = i, main = paste0(topic$topicnames[i]),
printlegend = FALSE, ylab = "Exp. Topic Prob", xlab = "Time (Deciles of Action)",
moderator = "frame", moderator.value = "High", linecol = "green", ylim = c(-0.01 ,0.5),
ci.level = 0)
plot(prep, "time", method = "continuous", topics = i,
printlegend = FALSE, ylab = "Exp. Topic Prob", xlab = "Time (Deciles of Action)",
moderator = "frame", moderator.value = "Low", linecol = "brown", add = "T",
ylim = c(-0.01 ,0.5), ci.level = 0)
legend(0.65, 0.5, c("Low", "High"), lwd = 2, col = c("brown", "green"))
}
Let’s keep time and training and used different binary flags.
topicNames2 <- labelTopics(stmFit2)
topic2 <- data.frame(
topicnames = c('Word Cloud: Navigation',
'Social Network: Navigation',
'Event List: All Tools',
'Social Network & Word Cloud',
'Word Cloud: Click',
'Calender View',
'Event Location & Flower Glyph',
'Map View'
),
TopicNumber = 1:k,
TopicProportions = colMeans(stmFit2$theta))
extraResult <- plot(prep2, "extra", method = "difference",
cov.value1 = "1", cov.value2 = "0",
verbose.labels = F,
model = stmFit2,
labeltype = "custom",
custom.labels = topic2$topicnames,
#custom.labels = " ",
ylab = "Exp Topic Difference",
xlab = "Low Not Significant High",
main = "Effect of Extraversion",
xlim = c(-0.2,0.2),
ylim = c(0.4,8.4),
ci.level = 0.95,
width = 40)
agreeResult <- plot(prep2, "agree", method = "difference",
cov.value1 = "1", cov.value2 = "0",
verbose.labels = F,
model = stmFit2,
labeltype = "custom",
custom.labels = topic2$topicnames,
#custom.labels = " ",
ylab = "Exp Topic Difference",
xlab = "Low Not Significant High",
main = "Effect of Agreeableness",
xlim = c(-0.2,0.2),
ylim = c(0.4,8.4),
ci.level = 0.95,
width = 40)
consResult <- plot(prep2, "conscient", method = "difference",
cov.value1 = "1", cov.value2 = "0",
verbose.labels = F,
model = stmFit2,
labeltype = "custom",
custom.labels = topic2$topicnames,
#custom.labels = " ",
ylab = "Exp Topic Difference",
xlab = "Low Not Significant High",
main = "Effect of Conscientiousness",
xlim = c(-0.2,0.2),
ylim = c(0.4,8.4),
ci.level = 0.95,
width = 40)
neuroResult <- plot(prep2, "neurotic", method = "difference",
cov.value1 = "1", cov.value2 = "0",
verbose.labels = F,
model = stmFit2,
labeltype = "custom",
custom.labels = topic2$topicnames,
#custom.labels = " ",
ylab = "Exp Topic Difference",
xlab = "Low Not Significant High",
main = "Effect of Neuroticism",
xlim = c(-0.2,0.2),
ylim = c(0.4,8.4),
ci.level = 0.95,
width = 40)
openResult <- plot(prep2, "open", method = "difference",
cov.value1 = "1", cov.value2 = "0",
verbose.labels = F,
model = stmFit2,
labeltype = "custom",
custom.labels = topic2$topicnames,
#custom.labels = " ",
ylab = "Exp Topic Difference",
xlab = "Low Not Significant High",
main = "Effect of Openness",
xlim = c(-0.2,0.2),
ylim = c(0.4,8.4),
ci.level = 0.95,
width = 40)
This matrix scatter plot shows the values of the estimated effects for each topic by each of the five binary variables.
df3 <- data.frame(label = unlist(agreeResult$labels),
Agreeable = unlist(agreeResult$means),
Conscientious = unlist(consResult$means),
Extravert = unlist(extraResult$means),
Neurotic = unlist(neuroResult$means),
Openness = unlist(openResult$means))
# another option
makePairs <- function(data)
{
grid <- expand.grid(x = 1:ncol(data), y = 1:ncol(data))
grid <- subset(grid, x != y)
all <- do.call("rbind", lapply(1:nrow(grid), function(i) {
xcol <- grid[i, "x"]
ycol <- grid[i, "y"]
data.frame(xvar = names(data)[ycol], yvar = names(data)[xcol],
x = data[, xcol], y = data[, ycol], data)
}))
all$xvar <- factor(all$xvar, levels = names(data))
all$yvar <- factor(all$yvar, levels = names(data))
densities <- do.call("rbind", lapply(1:ncol(data), function(i) {
data.frame(xvar = names(data)[i], yvar = names(data)[i], x = data[, i])
}))
list(all=all, densities=densities)
}
# expand iris data frame for pairs plot
gg1 = makePairs(df3[,2:6])
# pairs plot
# https://gastonsanchez.wordpress.com/2012/08/27/scatterplot-matrices-with-ggplot/
ggplot(gg1$all, aes_string(x = "x", y = "y")) +
facet_grid(xvar ~ yvar, scales = "free") +
xlim(-0.05,0.05) + ylim(-0.05,0.05) +
geom_point( na.rm = TRUE, alpha=0.8) +
#geom_point(aes(colour=Species), na.rm = TRUE, alpha=0.8) +
stat_density(aes(x = x, y = ..scaled.. * diff(range(x)) + min(x)),
data = gg1$densities, position = "identity",
colour = "grey20", geom = "line")
Let’s run Male/Female as a covariate.
Result <- plot(prep3, "gender", method = "difference",
cov.value1 = "F", cov.value2 = "M",
verbose.labels = F,
model = stmFit3,
labeltype = "custom",
custom.labels = topic2$topicnames,
#custom.labels = " ",
ylab = "Exp Topic Difference",
xlab = "Male Not Significant Female",
main = "Effect of Gender",
xlim = c(-0.2,0.2),
ylim = c(0.4,8.4),
ci.level = 0.95,
width = 40)
Result: No Significant Actions (Topics)
We’ll use a binary flag for undergraduates (1) and graduate students (0).
Result <- plot(prep4, "undergrad", method = "difference",
cov.value1 = "0", cov.value2 = "1",
verbose.labels = F,
model = stmFit4,
labeltype = "custom",
custom.labels = topic2$topicnames,
#custom.labels = " ",
ylab = "Exp Topic Difference",
xlab = "Undergrad Not Significant Graduate",
main = "Effect of Education Level",
xlim = c(-0.2,0.2),
ylim = c(0.4,8.4),
ci.level = 0.95,
width = 40)
We’ll use a binary flag for major: computing-related (1) and non-computing (social sciences and humanities) (0).
Result <- plot(prep5, "major", method = "difference",
cov.value1 = "0", cov.value2 = "1",
verbose.labels = F,
model = stmFit5,
labeltype = "custom",
custom.labels = topic2$topicnames,
#custom.labels = " ",
ylab = "Exp Topic Difference",
xlab = "Computing Not Significant Non-Computing",
main = "Effect of Major",
xlim = c(-0.2,0.2),
ylim = c(0.4,8.4),
ci.level = 0.95,
width = 40)
We’ll use a binary flag for age: over 23 or older (1) or under 23 (0).
Result <- plot(prep5, "age", method = "difference",
cov.value1 = "1", cov.value2 = "0",
verbose.labels = F,
model = stmFit5,
custom.labels = topic$topicnames,
#custom.labels = " ",
labeltype = "custom",
ylab = "Exp Topic Difference",
xlab = "18-22 Not Significant 23+ Older",
main = "Effect of Age",
xlim = c(-0.2,0.2),
ylim = c(0.4,8.4),
ci.level = 0.95,
width = 40)
save.image(file = "./data/stmimage.Rdata")
sessionInfo()
## R version 3.4.1 (2017-06-30)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.2 LTS
##
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] stm_1.2.2 quanteda_0.99 purrr_0.2.3 readr_1.1.1
## [5] tidyr_0.7.0 tibble_1.3.3 ggplot2_2.2.1 tidyverse_1.1.1
## [9] bindrcpp_0.2 dplyr_0.7.2 pairsD3_0.1.0 reshape_0.8.7
## [13] rjson_0.2.15
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.12 lubridate_1.6.0 lattice_0.20-34
## [4] assertthat_0.2.0 rprojroot_1.2 digest_0.6.12
## [7] psych_1.7.5 mime_0.5 slam_0.1-40
## [10] R6_2.2.2 cellranger_1.1.0 plyr_1.8.4
## [13] backports_1.1.0 evaluate_0.10.1 httr_1.3.1
## [16] rlang_0.1.2 lazyeval_0.2.0 readxl_1.0.0
## [19] data.table_1.10.4 Matrix_1.2-8 rmarkdown_1.6
## [22] labeling_0.3 splines_3.4.1 stringr_1.2.0
## [25] foreign_0.8-67 htmlwidgets_0.9 munsell_0.4.3
## [28] shiny_1.0.4 broom_0.4.2 compiler_3.4.1
## [31] httpuv_1.3.5 modelr_0.1.1 pkgconfig_2.0.1
## [34] mnormt_1.5-5 htmltools_0.3.6 matrixStats_0.52.2
## [37] grid_3.4.1 nlme_3.1-131 jsonlite_1.5
## [40] xtable_1.8-2 gtable_0.2.0 magrittr_1.5
## [43] scales_0.4.1 RcppParallel_4.3.20 stringi_1.1.5
## [46] reshape2_1.4.2 xml2_1.1.1 fastmatch_1.1-0
## [49] wordcloud_2.5 RColorBrewer_1.1-2 tools_3.4.1
## [52] forcats_0.2.0 glue_1.1.1 hms_0.3
## [55] parallel_3.4.1 yaml_2.1.14 colorspace_1.3-2
## [58] rvest_0.3.2 knitr_1.17 bindr_0.1
## [61] haven_1.1.0