Sets in R

# Two small numeric vectors treated as sets.
setA <- c(1, 2, 3)
setB <- c(2, 3, 4, 5)

# Elements that appear in either vector (duplicates dropped).
union(setA, setB)
## [1] 1 2 3 4 5

# Elements that appear in both vectors.
intersect(setA, setB)
## [1] 2 3

Probability distributions

Steph Curry example

# Binomial probabilities of 0 through 4 successes in 4 attempts,
# with a per-attempt success probability of 0.91.
pFreeThrows <- dbinom(x = 0:4, size = 4, prob = 0.91)

# One row per possible number of successes, paired with its probability.
curryDf <- data.frame(
  numSuccesses = 0:4,
  probability = pFreeThrows
)
curryDf

# Probability of 2 or fewer successes (rows 1-3 hold 0, 1 and 2 successes).
sum(curryDf[["probability"]][1:3])
## [1] 0.04296483

Plot probability distribution

# Bar chart of the probability of each number of successes in curryDf.
# The x-axis label states 4 attempts so that it agrees with the data, which
# only covers 0-4 successes (the previous label incorrectly said 10).
ggplot(curryDf, aes(x = numSuccesses, y = probability)) +
  geom_bar(stat = "identity") +
  scale_x_continuous(breaks = seq(0, 4)) +
  xlab("Number of successes out of 4 free throws") +
  ylab("Probability")

# Same bar chart as above, with the bars for 0, 1 and 2 successes
# (rows 1-3 of curryDf) drawn a second time in green to highlight the
# lower tail. The x-axis label states 4 attempts to agree with the data
# (the previous label incorrectly said 10).
ggplot(curryDf, aes(x = numSuccesses, y = probability)) +
  geom_bar(stat = "identity") +
  scale_x_continuous(breaks = seq(0, 4)) +
  xlab("Number of successes out of 4 free throws") +
  ylab("Probability") +
  geom_bar(
    data = curryDf[1:3, ],
    aes(x = numSuccesses, y = probability),
    stat = "identity",
    fill = "green"
  )

Color in the tail probability for 2 or fewer successes, using the cumulative distribution

# Running total of the probabilities: row k holds P(successes <= k - 1).
curryDf$cumulative <- cumsum(curryDf$probability)

# Bar chart of the cumulative distribution, with the bars for 0-2 successes
# highlighted in green. The dashed line marks the cumulative probability of
# 2 or fewer successes; it is taken from the cumulative column (not the
# point probability of exactly 2) so it lines up with the top of the
# highlighted bar on this cumulative scale. The x-axis label states
# 4 attempts to agree with the data (the previous label said 10).
ggplot(curryDf, aes(x = numSuccesses, y = cumulative)) +
  geom_bar(stat = "identity") +
  scale_x_continuous(breaks = seq(0, 4)) +
  xlab("Number of successes out of 4 free throws") +
  ylab("Cumulative Probability") +
  geom_bar(
    data = curryDf[1:3, ],
    aes(x = numSuccesses, y = cumulative),
    stat = "identity",
    fill = "green"
  ) +
  geom_hline(
    yintercept = curryDf$cumulative[3],
    color = "blue",
    linetype = "dashed"
  )

Election results

Here we analyze the election results as they came in during the special election for US Senator in Alabama on Dec 12, 2017 (transcribed from https://www.ajc.com/news/national/alabama-senate-race-live-updates-roy-moore-doug-jones/KPRfkdaweoiXICW3FHjXqI/).

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
# Read the comma-separated returns file and reshape it to long format:
# every column except pctResp becomes a (candidate, pctVotes) pair.
electionReturns <- gather(
  read.table(file = "alabama_election_returns.csv", header = TRUE, sep = ","),
  key = candidate,
  value = pctVotes,
  -pctResp
)

# One line per candidate: vote percentage against precincts reporting.
ggplot(
  data = electionReturns,
  mapping = aes(x = pctResp, y = pctVotes, color = candidate)
) +
  geom_line(size = 1) +
  scale_color_manual(values = c("#9999CC", "#CC6666")) +
  labs(
    x = "Percentage of precincts reporting",
    y = "Percentage of votes"
  )

Law of large numbers

Let’s run 30,000 trials of 10 coin flips, and look at how the average empirical probability changes as a function of sample size.

# Fix the RNG seed so the simulated draws are reproducible.
set.seed(12345)
nsamples <- 30000

# Each trial draws a binomial count out of 10 with probability 0.5 and
# divides it by 10 to get a proportion; `mean` is the running average of
# those proportions up to and including each trial.
sampDf <- data.frame(
  x = seq_len(nsamples),
  samples = rbinom(n = nsamples, size = 10, prob = 0.5) / 10
) %>%
  mutate(mean = cumsum(samples) / seq_along(samples))

# Running mean against trial number, starting from row 10 of sampDf,
# drawn over a dashed reference line at 0.5.
ggplot(data = sampDf[10:nsamples, ], mapping = aes(x = x, y = mean)) +
  geom_hline(yintercept = 0.5, color = "blue", linetype = "dashed") +
  geom_line() +
  labs(
    x = "Number of trials",
    y = "Estimated probability of heads"
  )