Load the data

Load the data and convert to a data frame.

library(rjson)
logs <- fromJSON(file="./data/crystalball_userlog_final.json")

logs <- lapply(logs, function(x) {
  x[sapply(x, is.null)] <- NA
  unlist(x)
})

df <- do.call("rbind", logs)
df <- data.frame(user = row.names(df), log = df, stringsAsFactors = F)

How many unique users do we have?

length(unique(df$user))
## [1] 81

The data excludes test accounts and four users for incomplete data.

First, reshape the data from “wide” to “long” to create the time variables.

library(reshape)
df <- reshape(df, varying = c("log.1","log.2","log.3","log.4","log.5",
                              "log.6","log.7","log.8","log.9","log.10"), idvar = "user", direction = "long")

Next, create two custom variables for our groups.

df$frame <- ifelse(substr(df$user,0,1)=="h","High","Low")
df$train <- ifelse(substr(df$user,2,2)=="t","Time","Spatial")

table(df$frame, df$train)/10
##       
##        Spatial Time
##   High      19   21
##   Low       23   18
df$time <- df$time / 10 

Notice that the groups are spread fairly evenly.

Appending User Attributes

First, upload user-level attributes.

atts <- readr::read_csv("./data/demographics-pretest.csv")

Next, use the pairsD3 function to scatter the personality scores.

library(pairsD3)

pairsD3(atts[,9:13], group = as.factor(atts$GENDER))

Personality K-Means

Run a quick k=5 k-means. This was explored as a feature for personality clusters.

kmeans <- kmeans(atts[,9:13], centers = 5, iter.max = 10, nstart =3,
       algorithm = c("Hartigan-Wong"), trace=FALSE)

pairsD3(atts[,9:13], group = as.factor(kmeans$cluster))
atts$flag2 <- ifelse(kmeans$cluster=="2",1,0)
atts$flag3 <- ifelse(kmeans$cluster=="3",1,0)

Alternatively, create binary flags for each personality measure.

Values above the mean are given a 1, values below are 0.

summary(atts[,9:13])
##   Extraversion   Agreeableness   Conscientiousness  Neuroticism   
##  Min.   :2.100   Min.   :2.700   Min.   :2.70      Min.   :1.300  
##  1st Qu.:3.800   1st Qu.:4.900   1st Qu.:4.20      1st Qu.:3.100  
##  Median :4.400   Median :5.400   Median :5.00      Median :3.600  
##  Mean   :4.315   Mean   :5.352   Mean   :4.89      Mean   :3.705  
##  3rd Qu.:5.000   3rd Qu.:5.900   3rd Qu.:5.60      3rd Qu.:4.500  
##  Max.   :6.800   Max.   :6.900   Max.   :6.70      Max.   :6.000  
##     Openness    
##  Min.   :3.000  
##  1st Qu.:4.600  
##  Median :5.000  
##  Mean   :5.056  
##  3rd Qu.:5.600  
##  Max.   :6.700
atts$ExtravertFlag <- ifelse(atts$Extraversion>mean(atts$Extraversion),1,0)
atts$AgreeableFlag <- ifelse(atts$Agreeableness>mean(atts$Agreeableness),1,0)
atts$ConscientiousFlag <- ifelse(atts$Conscientiousness>mean(atts$Conscientiousness),1,0)
atts$NeuroticismFlag <- ifelse(atts$Neuroticism>mean(atts$Neuroticism),1,0)
atts$OpennessFlag <- ifelse(atts$Openness>mean(atts$Openness),1,0)

library(dplyr)
atts %>% 
  group_by(substr(atts$ID,1,2)) %>% 
  summarise(extra = mean(Extraversion),
            agree = mean(Agreeableness),
            consc = mean(Conscientiousness),
            neuro = mean(Neuroticism),
            open = mean(Openness))
## # A tibble: 4 x 6
##   `substr(atts$ID, 1, 2)`    extra    agree    consc    neuro     open
##                     <chr>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
## 1                      hg 4.473684 5.431579 4.989474 3.673684 4.978947
## 2                      ht 4.347619 5.242857 4.757143 3.609524 5.119048
## 3                      lg 4.486957 5.547826 5.160870 3.591304 5.217391
## 4                      lt 3.888889 5.144444 4.594444 3.994444 4.855556

We can test significant levels with a t-test.

t.test(atts$Agreeableness~atts$GENDER)
## 
##  Welch Two Sample t-test
## 
## data:  atts$Agreeableness by atts$GENDER
## t = 0.87279, df = 71.562, p-value = 0.3857
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.2209302  0.5649938
## sample estimates:
## mean in group F mean in group M 
##        5.432558        5.260526

Major & Graduate Level

Last, let’s create binary flags for major graduate level (Undegrad = 1, Grad/Prof = 0)

table(atts$Occupation)
## 
## ANALYTIC CONSULTANT BANKING BUSINESS SYSTEMS CONSULTANT 
##                           1                           1 
##               Faculty/Staff                      Master 
##                           2                          45 
##                         PhD               Undergraduate 
##                           7                          25
atts$Undergrad <- ifelse(atts$Occupation=="Undergraduate",1,0)

t.test(atts$Openness~atts$Undergrad)
## 
##  Welch Two Sample t-test
## 
## data:  atts$Openness by atts$Undergrad
## t = 3.0804, df = 41.527, p-value = 0.00366
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.1872119 0.8992167
## sample estimates:
## mean in group 0 mean in group 1 
##        5.223214        4.680000

Also, let’s explore the major. I have manually created a flag (MajorS) for Computing and Non-Computing.

table(atts$Major)
## 
##                                                  Architecture 
##                                                             6 
##                                       Architecture & Business 
##                                                             1 
##                                               Architecture/IT 
##                                                             1 
##                                                Bioinformatics 
##                                                             2 
##                                                      Business 
##                                                             7 
##                                     Business;Computer Science 
##                                                             2 
## Business;Computer Science;Data Science and Business Analytics 
##                                                             1 
##                                            Business;Economics 
##                                                             1 
##                                           Business;Psychology 
##                                                             1 
##                                                 Communication 
##                                                             3 
##                                              Computer Science 
##                                                            18 
##                                 Computer Science;Architecture 
##                                                             2 
##                                                  Data Science 
##                                                             1 
##                            Data Science an Business Analytics 
##                                                             1 
##                           data science and business analytics 
##                                                             1 
##                            Datascience and business analytics 
##                                                             1 
##                           Data Science and Business Analytics 
##                                                             4 
##                           DATA SCIENCE AND BUSINESS ANALYTICS 
##                                                             2 
##                                                          DSBA 
##                                                             3 
##                                                  DSBA Program 
##                                                             1 
##                               Earth and Environmental Science 
##                                                             1 
##                           Electrical and Computer Engineering 
##                                                             1 
##                                        Electrical Engineering 
##                                                             2 
##                                                     Geography 
##                                                             1 
##                                                     GEOGRAPHY 
##                                                             1 
##                                                       History 
##                                                             1 
##                                        Information Technology 
##                                                             1 
##                              Interdisciplinary Studies Degree 
##                                                             4 
##           Interdisciplinary Studies Degree;HEALTH INFORMATICS 
##                                                             1 
##                                                      Post-Bac 
##                                                             1 
##                                             Pre-Public Health 
##                                                             1 
##                                                    Psychology 
##                                                             1 
##                                          Psychology;Chemistry 
##                                                             1 
##                                                 Public Health 
##                                                             1 
##                                                           SIS 
##                                                             1 
##                                                   Social Work 
##                                                             2 
##                                                    Undeclared 
##                                                             1
table(atts$MajorS)
## 
##  0  1 
## 35 46
df2 <- merge(df, atts, by.x = "user", by.y = "ID", all.x = TRUE)

Log Preprocessing

Let’s explore an example of the log sequence.

df$log[1]
## [1] "crystalball_login - menu_click_menu icon - menu date picker_click - map_click_cluster - map_navigate_zoom - map_navigate_move - map_navigate_zoom - map_click_button find events - calendar_hover_starglyph - calendar_hover_location - calendar_hover_keywordmore - calendar_hover_starglyph - calendar_hover_keywordmore - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_location - calendar_hover_location - calendar_click_favorite - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_location - calendar_hover_location - calendar_click_favorite - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_location - calendar_hover_starglyph - calendar_hover_location - calendar_hover_location - calendar_hover_location - calendar_hover_location - calendar_hover_location - calendar_hover_starglyph - calendar_hover_location - calendar_hover_location - map_navigate_zoom - map_navigate_zoom - map_navigate_zoom - map_navigate_zoom - map_navigate_move - map_navigate_zoom - map_navigate_zoom - map_navigate_zoom - map_click_cluster - map_click_circle - map_click_circle - map_click_circle - map_click_circle - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_location - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_starglyph - calendar_hover_location - calendar_hover_location - calendar_hover_starglyph - calendar_hover_starglyph"

Note that most of the actions are divided into two words, with a “_“. The one exception is”menu date“. Let’s correct them and then use a string split on”-“.

alt <- c("menu date picker_click","menu_date_picker_click",
         "social network_hover","social_network_hover",
         "social network_navigate","social_network_navigate",
         "social network_click","social_network_click",
         "word cloud_click","word_cloud_click",
         "word cloud_hover","word_cloud_hover",
         "word cloud_navigate","word_cloud_navigate",
         "menu_click_menu icon","menu_click_menu_icon",
         "menu_click_button followerfriend ratio","menu_click_button_followerfriend_ratio",
         "map_click_button date distribution","map_click_button_date_distribution",
         "menu_click_tweet icon","menu_click_tweet_icon",
         "map_click_button find events","map_click_button_find_events",
         "menu_click_favorite icon","menu_click_favorite_icon",
         "followerfriend ratio","followerfriend_ratio",
         "tweet_click_button more","tweet_click_button_more")

# replace all actions with _
for (i in c(1,3,5,7,9,11,13,15,17,19,21,23,25,27,29)){
  df$log <- gsub(alt[1*i],alt[1*i+1],df$log)
}

# replace " - " with " "
df$log <- gsub(" - "," ",df$log)
df$log <- gsub("_",".",df$log)
df$log <- gsub("  "," ",df$log)

raw <- strsplit(df$log, " ")

df$actions <- as.integer(lapply(raw, length))
df$unique.actions <- as.integer(lapply(lapply(raw, unique), length))

write.csv(df, "./data/clean.csv", row.names = F)

Explore sample action statistics by group.

library(tidyverse)

df %>% group_by(frame, train, time) %>% summarise(AvgActions=mean(actions),
                                            StdActions=sd(actions),
                                            AvgUnqActions=mean(unique.actions),
                                            StdUnqActions=sd(unique.actions))
## # A tibble: 40 x 7
## # Groups:   frame, train [?]
##    frame   train  time AvgActions StdActions AvgUnqActions StdUnqActions
##    <chr>   <chr> <dbl>      <dbl>      <dbl>         <dbl>         <dbl>
##  1  High Spatial   0.1   72.10526   34.68572     12.684211      3.873738
##  2  High Spatial   0.2   72.10526   34.68572     10.894737      4.293338
##  3  High Spatial   0.3   72.10526   34.68572      9.631579      3.269565
##  4  High Spatial   0.4   72.10526   34.68572      9.421053      3.790570
##  5  High Spatial   0.5   72.10526   34.68572      8.894737      3.125073
##  6  High Spatial   0.6   72.10526   34.68572     10.526316      4.599135
##  7  High Spatial   0.7   72.10526   34.68572      9.052632      4.116008
##  8  High Spatial   0.8   72.10526   34.68572      8.894737      4.280378
##  9  High Spatial   0.9   72.10526   34.68572      9.105263      2.726307
## 10  High Spatial   1.0   72.10526   34.68572      8.315789      4.164034
## # ... with 30 more rows

While at first glance it appears that High-Time and Low-Time have very high number of actions, notice that the standard deviations are very large – so much so that, for example, that all the other values are within 1 std dev of the largest (High-Time). This means that there is not a large enough sample to derive much statistical significance.

We can plot the histograms.

hist(df$actions, 
     breaks = 20, 
     main = "Histogram of Number of User Actions", 
     xlab = "User Actions",
     ylab = "Number of Participant by Time Decile")

hist(df$unique.actions, 
     breaks = 20, 
     main = "Histogram of Number of Unique Actions", 
     xlab = "Unique Actions",
     ylab = "Number of Participant by Time Decile")

Note that we are getting more distinct distributions by creating more documents (e.g. by breaking down the logs by time) and adding more distinct actions. Note that there are nearly 30 unique actions now.

Bag-of-Actions Counts

Let’s run a bag-of-words (DFM) with a 1-3 n-gram.

library(quanteda)
myCorpus <- corpus(df$log)
docvars(myCorpus, "frame") <- df$frame
docvars(myCorpus, "train") <- df$train
docvars(myCorpus, "time") <- df$time
docvars(myCorpus, "extra") <- df2$ExtravertFlag
docvars(myCorpus, "agree") <- df2$AgreeableFlag
docvars(myCorpus, "conscient") <- df2$ConscientiousFlag
docvars(myCorpus, "neurotic") <- df2$NeuroticismFlag
docvars(myCorpus, "open") <- df2$OpennessFlag
docvars(myCorpus, "gender") <- df2$GENDER
docvars(myCorpus, "major") <- df2$MajorS
docvars(myCorpus, "undergrad") <- df2$Undergrad
docvars(myCorpus, "age") <- ifelse(df2$AGE>22,1,0)

# run DTM 
dfm <- dfm(myCorpus, ngrams = 1:3)

# remove features not found in at least 50 "docs" -- modified from 25 when user-level
dfm <- dfm_trim(dfm, min_docfreq = 50)

Let’s examine the top actions…

x <- topfeatures(dfm)
plot(tfidf(dfm))

We can also run a dendrogram (clustering). I left the labels off because their length (e.g. tri-actions) is too long.

wordDfm <- sort(weight(dfm, "tf"))
wordDfm <- t(wordDfm)[1:50,]  # keep the top 50 words
wordDistMat <- dist(wordDfm)
wordCluster <- hclust(wordDistMat)

# Define nodePar
plot(wordCluster, main="Frequency weighting (Labels Removed)", xlab=NA, sub=NA, labels = F)

Structural Topic Modeling

Let’s convert our data to the stmdfm.

library(stm)

# use quanteda converter to convert our Dfm
stmdfm <- convert(dfm, to = "stm", docvars = docvars(myCorpus))

out <- prepDocuments(stmdfm$documents, stmdfm$vocab, stmdfm$meta, lower.thresh = 50)

Search for K

Let’s first run a model search to obtain the number of topics.

c <- c(5,8,10,15,20,30,40)

kresult <- searchK(out$documents, 
                   out$vocab, 
                   K = c, 
                   prevalence=~ s(time,5) + train + frame, 
                   data=out$meta, 
                   max.em.its = 100,
                   seed = 300
                   )
plot(kresult)

From this analysis, we decided on 8 topics given the high semantic coherence, relatively high held-out likelihood and parsimony. One key reason was we found 8 topics is where we found the “elbow” of the held-out likelihood. Note: we decided on 8 topics rather than 10 topics (e.g. since lower average Semantic Coherence) because the two additional topics simply broke out the Map View and Calendar View into to separate topics respectively (Map View Pan and Click, and Calendar View Dotted vs Solid Line). Collectively, we decided for the sake of parsimony, it would make more sense to keep these two views together rather than seperated.

We tested our model results (estimated anchor bias effects) with both (8 or 10 topics) and found both results were consistent.

Run Baseline Model

Let’s run the model with three covariates: time (b-spline), training and framing.

k <- 8

stmFit <- stm(out$documents, out$vocab, K = k, prevalence =~ s(time,5) + train + frame, 
              max.em.its = 150, data = out$meta, init.type = "Spectral", seed = 300)
topicNames <- labelTopics(stmFit, n = 5)
t(topicNames$prob)
##      [,1]                                                         
## [1,] "word.cloud.navigate"                                        
## [2,] "word.cloud.navigate_word.cloud.navigate"                    
## [3,] "word.cloud.navigate_word.cloud.navigate_word.cloud.navigate"
## [4,] "word.cloud.hover_word.cloud.navigate"                       
## [5,] "word.cloud.navigate_word.cloud.hover"                       
##      [,2]                                                                     
## [1,] "social.network.navigate"                                                
## [2,] "social.network.navigate_social.network.navigate"                        
## [3,] "social.network.navigate_social.network.navigate_social.network.navigate"
## [4,] "social.network.navigate_social.network.hover"                           
## [5,] "social.network.hover"                                                   
##      [,3]                                                                              
## [1,] "calendar.hover.keywordmore"                                                      
## [2,] "calendar.hover.keywordmore_calendar.hover.keywordmore"                           
## [3,] "calendar.hover.keywordmore_calendar.hover.keywordmore_calendar.hover.keywordmore"
## [4,] "calendar.hover.starglyph"                                                        
## [5,] "calendar.hover.emotionbar"                                                       
##      [,4]                                                            
## [1,] "social.network.hover"                                          
## [2,] "social.network.hover_social.network.hover"                     
## [3,] "social.network.hover_social.network.hover_social.network.hover"
## [4,] "word.cloud.hover_social.network.hover_social.network.hover"    
## [5,] "word.cloud.hover_social.network.hover"                         
##      [,5]                                                
## [1,] "word.cloud.hover"                                  
## [2,] "word.cloud.hover_word.cloud.hover"                 
## [3,] "word.cloud.hover_word.cloud.hover_word.cloud.hover"
## [4,] "word.cloud.click"                                  
## [5,] "word.cloud.hover_word.cloud.click"                 
##      [,6]                                            
## [1,] "calendar.hover.circle"                         
## [2,] "calendar.hover.solidline"                      
## [3,] "calendar.hover.dottedline"                     
## [4,] "calendar.hover.circle_calendar.hover.circle"   
## [5,] "calendar.hover.solidline_calendar.hover.circle"
##      [,7]                                                                        
## [1,] "calendar.hover.starglyph"                                                  
## [2,] "calendar.hover.starglyph_calendar.hover.starglyph"                         
## [3,] "calendar.hover.location"                                                   
## [4,] "calendar.hover.starglyph_calendar.hover.starglyph_calendar.hover.starglyph"
## [5,] "calendar.hover.starglyph_calendar.hover.location"                          
##      [,8]                                                   
## [1,] "map.navigate.zoom"                                    
## [2,] "map.navigate.zoom_map.navigate.zoom"                  
## [3,] "map.navigate.move"                                    
## [4,] "map.click.circle"                                     
## [5,] "map.navigate.zoom_map.navigate.zoom_map.navigate.zoom"

Using the word probabilities (top 5), let’s give each topic the name of the first action (for simplicity).

topic <- data.frame(
  topicnames = c('Word Cloud: Navigation',
                 'Social Network: Navigation',
                 'Event List: All Tools',
                 'Social Network & Word Cloud',
                 'Word Cloud: Click',
                 'Calender View',
                 'Event Location & Flower Glyph',
                 'Map View'
  ),
  TopicNumber = 1:k,
  TopicProportions = colMeans(stmFit$theta))

Let’s plot the topic sizes.

plot(stmFit, 
         type = "summary", 
         xlim = c(0,.45),
         ylim = c(0.2,8.4),
         custom.labels = topic$topicnames,
         main = "Topic Proportions by Interaction Clusters", 
         text.cex = 1)

Next, let’s calculate the effect of the two factors (frame = High or Low, and train = Geo or Time).

prep <- estimateEffect(1:k ~ s(time,5) + frame + train, stmFit, meta = out$meta, uncertainty = "Global")

Effect of Visual Anchor

We can then use the plot.estimateEffect function to compare the effect of the “train” field (Geo or Time) on topic proportions (likelihood of the topic).

We can then rank the topics by their point estimates.

# order based on Expected Topic Proportion
trank = order(topic$TopicProportions, decreasing = T)
temp.topic <- topic[trank,]

x <- plot(prep, "train", method = "difference", model = stmFit,
     cov.value1 = "Time", cov.value2 = "Spatial", 
     verbose.labels = F, 
     topics = temp.topic$TopicNumber,
     custom.labels = temp.topic$topicnames,
     #custom.labels = " ",
     labeltype = "custom",
     ylab = "Expected Topic Difference", 
     xlab = "Geo              Not Significant           Time",
     main = "Effect of Visual Anchor",
     xlim = c(-0.2,0.2),
          ylim = c(0.4,8.4),
     width = 40,
     ci.level = 0.95)

Effect of Numeric Anchor

# optional: order based on Topic Prevalance 
frank = order(unlist(Result$means))
#temp.topic <- topic[frank,]

plot(prep, "frame", method = "difference", model = stmFit,
     cov.value1 = "High", cov.value2 = "Low", 
     verbose.labels = F, 
     topics = temp.topic$TopicNumber,
     custom.labels = temp.topic$topicnames,
     #custom.labels = " ",
     labeltype = "custom",
     ylab = "Expected Topic Difference (95% CI)", 
     xlab = "Low              Not Significant           High",
     main = "Effect of Numerical Anchor",
     xlim = c(-0.2,0.2), 
     ylim = c(0.4,8.4),
     width = 40,
     ci.level = 0.95)

Effect of Time

95% confidence intervals have been added.

par(mfrow = c(2,2),mar = c(4,4,2,2))
for (i in trank){
  plot(prep, "time", method = "continuous", topics = i, model = z,  
       main = paste0(topic$topicnames[i]),
       printlegend = FALSE, ylab = "Exp. Topic Prob", 
       xlab = "Time (Deciles of Action)", ylim = c(-0.01,0.5),
       ci.level = 0.95
  )
}

Interaction: Time & Visual Anchor

par(mfrow = c(2,2),mar = c(4,4,2,2))
for (i in trank){
  plot(prep, "time", method = "continuous", topics = i, main = paste0(topic$topicnames[i]),
       printlegend = FALSE, ylab = "Exp. Topic Prob", xlab = "Time (Deciles of Action)",
       moderator = "train", moderator.value = "Time",  linecol = "red", ylim = c(-0.01 ,0.5),
       ci.level = 0)
  plot(prep, "time", method = "continuous", topics = i,
       printlegend = FALSE, ylab = "Exp. Topic Prob", xlab = "Time (Deciles of Action)",
       moderator = "train", moderator.value = "Spatial",  linecol = "blue", add = "T", 
       ylim = c(-0.01 ,0.5), ci.level = 0)

  legend(0.65, 0.5, c("Time", "Geo"), lwd = 2, col = c("red", "blue"))
}

Interaction: Time & Numeric Anchor

par(mfrow = c(2,2),mar = c(4,4,2,2))
for (i in frank){
  plot(prep, "time", method = "continuous", topics = i, main = paste0(topic$topicnames[i]),
       printlegend = FALSE, ylab = "Exp. Topic Prob", xlab = "Time (Deciles of Action)",
       moderator = "frame", moderator.value = "High",  linecol = "green", ylim = c(-0.01 ,0.5),
       ci.level = 0)
  plot(prep, "time", method = "continuous", topics = i,
       printlegend = FALSE, ylab = "Exp. Topic Prob", xlab = "Time (Deciles of Action)",
       moderator = "frame", moderator.value = "Low",  linecol = "brown", add = "T", 
       ylim = c(-0.01 ,0.5), ci.level = 0)
  legend(0.65, 0.5, c("Low", "High"), lwd = 2, col = c("brown", "green"))
}

Personality Covariates

Let’s keep time and training and used different binary flags.

topicNames2 <- labelTopics(stmFit2)
topic2 <- data.frame(
    topicnames = c('Word Cloud: Navigation',
                 'Social Network: Navigation',
                 'Event List: All Tools',
                 'Social Network & Word Cloud',
                 'Word Cloud: Click',
                 'Calender View',
                 'Event Location & Flower Glyph',
                 'Map View'
  ),
  TopicNumber = 1:k,
  TopicProportions = colMeans(stmFit2$theta))

Effect of Extravert

extraResult <- plot(prep2, "extra", method = "difference",
               cov.value1 = "1", cov.value2 = "0", 
               verbose.labels = F,
               model = stmFit2,
               labeltype = "custom",
               custom.labels = topic2$topicnames,
               #custom.labels = " ",
               ylab = "Exp Topic Difference", 
               xlab = "Low          Not Significant          High",
               main = "Effect of Extraversion",
               xlim = c(-0.2,0.2),
               ylim = c(0.4,8.4),
               ci.level = 0.95, 
               width = 40)

Effect of Agreeable

agreeResult <- plot(prep2, "agree", method = "difference",
               cov.value1 = "1", cov.value2 = "0", 
               verbose.labels = F,
               model = stmFit2,
               labeltype = "custom",
               custom.labels = topic2$topicnames,
               #custom.labels = " ",
               ylab = "Exp Topic Difference", 
               xlab = "Low          Not Significant          High",
               main = "Effect of Agreeableness",
               xlim = c(-0.2,0.2),
               ylim = c(0.4,8.4),
               ci.level = 0.95,
               width = 40)

Effect of Conscientious

consResult <- plot(prep2, "conscient", method = "difference",
               cov.value1 = "1", cov.value2 = "0", 
               verbose.labels = F,
               model = stmFit2,
               labeltype = "custom",
               custom.labels = topic2$topicnames,
               #custom.labels = " ",
               ylab = "Exp Topic Difference", 
               xlab = "Low          Not Significant          High",
               main = "Effect of Conscientiousness",
               xlim = c(-0.2,0.2),
               ylim = c(0.4,8.4),
               ci.level = 0.95,
               width = 40)

Effect of Neurotic

neuroResult <- plot(prep2, "neurotic", method = "difference",
               cov.value1 = "1", cov.value2 = "0", 
               verbose.labels = F,
               model = stmFit2,
               labeltype = "custom",
               custom.labels = topic2$topicnames,
               #custom.labels = " ",
               ylab = "Exp Topic Difference", 
               xlab = "Low          Not Significant          High",
               main = "Effect of Neuroticism",
               xlim = c(-0.2,0.2),
               ylim = c(0.4,8.4),
               ci.level = 0.95,
               width = 40)

Effect of Openness

openResult <- plot(prep2, "open", method = "difference",
               cov.value1 = "1", cov.value2 = "0", 
               verbose.labels = F,
               model = stmFit2,
               labeltype = "custom",
               custom.labels = topic2$topicnames,
               #custom.labels = " ",
               ylab = "Exp Topic Difference", 
               xlab = "Low          Not Significant          High",
               main = "Effect of Openness",
               xlim = c(-0.2,0.2),
               ylim = c(0.4,8.4),
               ci.level = 0.95,
               width = 40)

Combined Effect

This matrix scatter plot shows the values of the estimated effects for each topic by each of the five binary variables.

df3 <- data.frame(label = unlist(agreeResult$labels),
                  Agreeable = unlist(agreeResult$means),
                  Conscientious = unlist(consResult$means),
                  Extravert = unlist(extraResult$means),
                  Neurotic = unlist(neuroResult$means),
                  Openness = unlist(openResult$means))


# another option
makePairs <- function(data) 
{
  grid <- expand.grid(x = 1:ncol(data), y = 1:ncol(data))
  grid <- subset(grid, x != y)
  all <- do.call("rbind", lapply(1:nrow(grid), function(i) {
    xcol <- grid[i, "x"]
    ycol <- grid[i, "y"]
    data.frame(xvar = names(data)[ycol], yvar = names(data)[xcol], 
               x = data[, xcol], y = data[, ycol], data)
  }))
  all$xvar <- factor(all$xvar, levels = names(data))
  all$yvar <- factor(all$yvar, levels = names(data))
  densities <- do.call("rbind", lapply(1:ncol(data), function(i) {
    data.frame(xvar = names(data)[i], yvar = names(data)[i], x = data[, i])
  }))
  list(all=all, densities=densities)
}
 
# expand iris data frame for pairs plot
gg1 = makePairs(df3[,2:6])

# pairs plot
# https://gastonsanchez.wordpress.com/2012/08/27/scatterplot-matrices-with-ggplot/
ggplot(gg1$all, aes_string(x = "x", y = "y")) + 
  facet_grid(xvar ~ yvar, scales = "free") + 
  xlim(-0.05,0.05) + ylim(-0.05,0.05) +
  geom_point( na.rm = TRUE, alpha=0.8) +
  #geom_point(aes(colour=Species), na.rm = TRUE, alpha=0.8) + 
  stat_density(aes(x = x, y = ..scaled.. * diff(range(x)) + min(x)), 
               data = gg1$densities, position = "identity", 
               colour = "grey20", geom = "line")

Gender Covariate

Let’s run Male/Female as a covariate.

Effect of Gender

Result <- plot(prep3, "gender", method = "difference", 
               cov.value1 = "F", cov.value2 = "M", 
               verbose.labels = F, 
               model = stmFit3,
               labeltype = "custom",
               custom.labels = topic2$topicnames,
               #custom.labels = " ",
               ylab = "Exp Topic Difference", 
               xlab = "Male          Not Significant        Female",
               main = "Effect of Gender",
               xlim = c(-0.2,0.2),
               ylim = c(0.4,8.4),
               ci.level = 0.95,
               width = 40)

Result: No Significant Actions (Topics)

Education Level Covariate

We’ll use a binary flag for undergraduates (1) and graduate students (0).

Effect of Education Level

Result <- plot(prep4, "undergrad", method = "difference", 
               cov.value1 = "0", cov.value2 = "1", 
               verbose.labels = F, 
               model = stmFit4,
               labeltype = "custom",
               custom.labels = topic2$topicnames,
               #custom.labels = " ",
               ylab = "Exp Topic Difference", 
               xlab = "Undergrad        Not Significant        Graduate",
               main = "Effect of Education Level",
               xlim = c(-0.2,0.2),
               ylim = c(0.4,8.4),
               ci.level = 0.95,
               width = 40)

Major Covariate

We’ll use a binary flag for major: computing-related (1) and non-computing (social sciences and humanities) (0).

Effect of Major

Result <- plot(prep5, "major", method = "difference", 
               cov.value1 = "0", cov.value2 = "1", 
               verbose.labels = F, 
               model = stmFit5,
               labeltype = "custom",
               custom.labels = topic2$topicnames,
               #custom.labels = " ",
               ylab = "Exp Topic Difference", 
               xlab = "Computing    Not Significant  Non-Computing",
               main = "Effect of Major",
               xlim = c(-0.2,0.2),
               ylim = c(0.4,8.4),
               ci.level = 0.95,
               width = 40)

Age Covariate

We’ll use a binary flag for age: over 23 or older (1) or under 23 (0).

Effect of Age

Result <- plot(prep5, "age", method = "difference", 
               cov.value1 = "1", cov.value2 = "0", 
               verbose.labels = F, 
               model = stmFit5,
               custom.labels = topic$topicnames,
               #custom.labels = " ",
               labeltype = "custom",
               ylab = "Exp Topic Difference", 
               xlab = "18-22           Not Significant        23+ Older",
               main = "Effect of Age",
               xlim = c(-0.2,0.2),
               ylim = c(0.4,8.4),
               ci.level = 0.95,
               width = 40)

Save Image and Packages Used

save.image(file = "./data/stmimage.Rdata")
sessionInfo()
## R version 3.4.1 (2017-06-30)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.2 LTS
## 
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] stm_1.2.2       quanteda_0.99   purrr_0.2.3     readr_1.1.1    
##  [5] tidyr_0.7.0     tibble_1.3.3    ggplot2_2.2.1   tidyverse_1.1.1
##  [9] bindrcpp_0.2    dplyr_0.7.2     pairsD3_0.1.0   reshape_0.8.7  
## [13] rjson_0.2.15   
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.12        lubridate_1.6.0     lattice_0.20-34    
##  [4] assertthat_0.2.0    rprojroot_1.2       digest_0.6.12      
##  [7] psych_1.7.5         mime_0.5            slam_0.1-40        
## [10] R6_2.2.2            cellranger_1.1.0    plyr_1.8.4         
## [13] backports_1.1.0     evaluate_0.10.1     httr_1.3.1         
## [16] rlang_0.1.2         lazyeval_0.2.0      readxl_1.0.0       
## [19] data.table_1.10.4   Matrix_1.2-8        rmarkdown_1.6      
## [22] labeling_0.3        splines_3.4.1       stringr_1.2.0      
## [25] foreign_0.8-67      htmlwidgets_0.9     munsell_0.4.3      
## [28] shiny_1.0.4         broom_0.4.2         compiler_3.4.1     
## [31] httpuv_1.3.5        modelr_0.1.1        pkgconfig_2.0.1    
## [34] mnormt_1.5-5        htmltools_0.3.6     matrixStats_0.52.2 
## [37] grid_3.4.1          nlme_3.1-131        jsonlite_1.5       
## [40] xtable_1.8-2        gtable_0.2.0        magrittr_1.5       
## [43] scales_0.4.1        RcppParallel_4.3.20 stringi_1.1.5      
## [46] reshape2_1.4.2      xml2_1.1.1          fastmatch_1.1-0    
## [49] wordcloud_2.5       RColorBrewer_1.1-2  tools_3.4.1        
## [52] forcats_0.2.0       glue_1.1.1          hms_0.3            
## [55] parallel_3.4.1      yaml_2.1.14         colorspace_1.3-2   
## [58] rvest_0.3.2         knitr_1.17          bindr_0.1          
## [61] haven_1.1.0