- Better understand particular details of popular ML algorithms and techniques
- Less code, more insight
- Familiarity with basic statistics and linear algebra concepts assumed
Ilan Man
Strategy Operations @ Squarespace
\[cost(h_{\theta}(x)) = \left\{ \begin{array}{lr} -\log(1-h_{\theta}(x)) & : y = 0\\ -\log(h_{\theta}(x)) & : y = 1 \end{array} \right. \]
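A minimal R sketch of this piecewise cost (here h stands for the predicted probability \(h_{\theta}(x)\) and y for the 0/1 label; both names are illustrative):

cost <- function(h, y) ifelse(y == 1, -log(h), -log(1 - h))
cost(0.9, 1) ## small penalty: confident and correct
cost(0.9, 0) ## large penalty: confident and wrong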
\(f(x) = x^{4} - 3\log(x)\)
fn   <- function(x) x^4 - 3*log(x)   ## objective function
dfn  <- function(x) 4*x^3 - 3/x      ## first derivative
d2fn <- function(x) 12*x^2 + 3/x^2   ## second derivative
newton <- function(num.its, dfn, d2fn){
  theta <- rep(0, num.its)
  theta[1] <- round(runif(1, 0, 100), 0)        ## random starting point
  for (i in 2:num.its) {
    h <- -dfn(theta[i-1]) / d2fn(theta[i-1])    ## Newton-Raphson step
    theta[i] <- theta[i-1] + h
  }
  out <- cbind(1:num.its, theta)
  dimnames(out)[[2]] <- c("iteration", "estimate")
  return(out)
}
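The tables below show the first and last five iterations of a run along these lines (the start is random, so the early values will differ between runs):

out <- newton(20, dfn, d2fn) ## 20 Newton iterations from a random start
head(out, 5); tail(out, 5)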
iteration estimate
[1,] 1 47.000
[2,] 2 31.333
[3,] 3 20.889
[4,] 4 13.926
[5,] 5 9.284
iteration estimate
[16,] 16 0.9306
[17,] 17 0.9306
[18,] 18 0.9306
[19,] 19 0.9306
[20,] 20 0.9306
fn(0.9306) ## value of f(x) at the minimum
## [1] 0.9658
optimize(fn,c(-100,100)) ## built-in R optimization function
$minimum
[1] 0.9306
$objective
[1] 0.9658
An eigenvector \(x\) of a square matrix \(\bf{A}\) satisfies \(\bf{A}x = \lambda x\), where the scalar \(\lambda\) is its eigenvalue.
\(\bf{A}x - \lambda Ix = 0\)
\((\bf{A} - \lambda I)x = 0\)
For a non-trivial solution \(x \neq 0\), we need \(\det(\bf{A} - \lambda I) = 0\)
\[A = \begin{bmatrix} 5 & 2\\ 2 & 5 \end{bmatrix}, I= \begin{bmatrix} 1 & 0\\ 0 & 1 \end{bmatrix}, X = \begin{bmatrix} x_{1}\\ x_{2} \end{bmatrix}\] \[\begin{bmatrix} 5 & 2\\ 2 & 5 \end{bmatrix}X = \lambda X\] \[\begin{bmatrix} 5 & 2\\ 2 & 5 \end{bmatrix}X - \lambda X = 0\] \[(\begin{bmatrix} 5 & 2\\ 2 & 5 \end{bmatrix} - \lambda I)X = 0\]
\[\left | \begin{bmatrix} 5 & 2\\ 2 & 5 \end{bmatrix} - \lambda I \right |= 0\] \[\left|\begin{bmatrix} 5 & 2\\ 2 & 5 \end{bmatrix} - \lambda \begin{bmatrix} 1 & 0\\ 0 & 1 \end{bmatrix} \right| = 0\] \[\left|\begin{bmatrix} 5-\lambda & 2\\ 2 & 5-\lambda \end{bmatrix}\right| = 0\]
\((5-\lambda)\times(5-\lambda) - 4 = 0\)
\(\lambda^{2} - 10\lambda + 21 = 0\)
\(\lambda = ?\)
A <- matrix(c(5,2,2,5), nrow=2)
roots <- Re(polyroot(c(21,-10,1))) ## real roots of 21 - 10*lambda + lambda^2
roots
## [1] 3 7
\(\lambda = 3, 7\)
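As a quick sanity check, the determinant does vanish at each root:

det(A - 3*diag(2)) ## 0
det(A - 7*diag(2)) ## 0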
Substituting each \(\lambda\) back into \((\bf{A} - \lambda I)X = 0\) gives the eigenvectors.
For \(\lambda = 3\): \[Eigenvector = \begin{bmatrix} 1\\ -1 \end{bmatrix}\]
For \(\lambda = 7\): \[Eigenvector = \begin{bmatrix} 1\\ 1 \end{bmatrix}\]
Check that \(\bf{Ax} = \lambda \bf{x}\) holds for each pair:
x1 <- c(1,-1) ## eigenvector for lambda = 3
x2 <- c(1,1)  ## eigenvector for lambda = 7
A %*% x1 == 3 * x1
[,1]
[1,] TRUE
[2,] TRUE
A %*% x2 == 7 * x2
[,1]
[1,] TRUE
[2,] TRUE
Because \(\bf{A}\) is symmetric, it admits the spectral decomposition \(\bf{A} = \bf{PDP^{T}}\), where the columns of \(\bf{P}\) are the normalized eigenvectors and \(\bf{D}\) is the diagonal matrix of eigenvalues.
m <- matrix(c(x1,x2), ncol=2) ## columns are the eigenvectors x1, x2
m <- m/sqrt(norm(m)) ## scale columns to unit length (each has norm sqrt(2))
as.matrix(m %*% diag(roots) %*% t(m)) ## P D P^T recovers A
## [,1] [,2]
## [1,] 5 2
## [2,] 2 5
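For comparison, R's built-in eigen() returns the same decomposition directly; its unit-length eigenvectors match the columns of m up to sign and ordering:

eigen(A) ## $values: 7 3; $vectors: normalized eigenvectors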
Let \(\bf{X}\) be the centered data matrix (rows are variables) and \(\bf{P}\) a change of basis, so that \(\bf{PX} = \bf{Y}\). Then:
\(\bf{C_{Y}} = \frac{1}{(n-1)}\bf{YY^{T}}\)
\(=\frac{1}{(n-1)}\bf{PX(PX)^{T}}\)
\(=\frac{1}{(n-1)}\bf{P(XX^{T})P^{T}}\), because \((AB)^{T} = B^{T}A^{T}\)
\(=\frac{1}{(n-1)}\bf{PAP^{T}}\), where \(\bf{A} = \bf{XX^{T}}\)
Choosing the rows of \(\bf{P}\) to be the eigenvectors of the symmetric matrix \(\bf{A}\) makes \(\bf{C_{Y}}\) diagonal: the principal components are uncorrelated.
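A small synthetic check of this result (a sketch with made-up data; rows of X are variables, columns are observations):

set.seed(42)
X <- matrix(rnorm(3*200), nrow=3) ## 3 variables, 200 observations
X <- X - rowMeans(X)              ## center each variable
A <- X %*% t(X) / (ncol(X)-1)     ## covariance matrix of X
P <- t(eigen(A)$vectors)          ## rows of P are eigenvectors of A
round(P %*% A %*% t(P), 10)       ## C_Y comes out diagonal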
data <- read.csv('tennis_data_2013.csv')
data$Player1 <- as.character(data$Player1)
data$Player2 <- as.character(data$Player2)
tennis <- data
m <- ncol(data) ## columns 10:m hold the match statistics
for (i in 10:m){
  tennis[,i] <- ifelse(is.na(data[,i]), 0, data[,i]) ## replace NAs with 0
}
features <- tennis[,10:m]
dim(features)
## [1] 943 26
str(features)
## 'data.frame': 943 obs. of 26 variables:
## $ FSP.1: int 61 61 52 53 76 65 68 47 64 77 ...
## $ FSW.1: int 35 31 53 39 63 51 73 18 26 76 ...
## $ SSP.1: int 39 39 48 47 24 35 32 53 36 23 ...
## $ SSW.1: int 18 13 20 24 12 22 24 15 12 11 ...
## $ ACE.1: num 5 13 8 8 0 9 5 3 3 6 ...
## $ DBF.1: num 1 1 4 6 4 3 3 4 0 4 ...
## $ WNR.1: num 17 13 37 8 16 35 41 21 20 6 ...
## $ UFE.1: num 29 1 50 6 35 41 50 31 39 4 ...
## $ BPC.1: num 1 7 1 6 3 2 9 6 3 7 ...
## $ BPW.1: num 3 14 9 9 12 7 17 20 7 24 ...
## $ NPA.1: num 8 0 16 0 9 6 14 6 5 0 ...
## $ NPW.1: num 11 0 23 0 13 12 30 9 14 0 ...
## $ TPW.1: num 70 80 106 104 128 108 173 78 67 162 ...
## $ FSP.2: int 68 60 77 50 53 63 60 54 67 60 ...
## $ FSW.2: int 45 23 57 24 59 60 66 26 42 68 ...
## $ SSP.2: int 32 40 23 50 47 37 40 46 33 40 ...
## $ SSW.2: int 17 9 15 19 32 22 34 13 14 25 ...
## $ ACE.2: num 10 1 9 1 17 24 2 0 12 8 ...
## $ DBF.2: num 0 4 1 8 11 4 6 11 0 12 ...
## $ WNR.2: num 40 1 41 1 59 47 57 11 32 8 ...
## $ UFE.2: num 30 4 41 8 79 45 72 46 20 12 ...
## $ BPC.2: num 4 0 4 1 3 4 10 2 7 6 ...
## $ BPW.2: num 8 0 13 7 5 7 17 6 10 14 ...
## $ NPA.2: num 8 0 12 0 16 14 25 8 8 0 ...
## $ NPW.2: num 9 0 16 0 28 17 36 12 11 0 ...
## $ TPW.2: num 101 42 126 79 127 122 173 61 94 141 ...
## Manually Calculated PCs
scaled_features <- as.matrix(scale(features)) ## center and scale to unit variance
Cx <- cov(scaled_features)                    ## covariance matrix of the data
e <- eigen(Cx)
eigenvalues <- e$values
eigenvectors <- e$vectors
PC <- scaled_features %*% eigenvectors        ## project data onto the eigenvectors
Cy <- cov(PC)                                 ## covariance of the PCs
sum_diff <- (sum(diag(Cy) - eigenvalues))^2   ## diagonal of Cy vs eigenvalues
round(sum_diff,6)
## [1] 0
off_diag <- upper.tri(Cy)|lower.tri(Cy) ## remove diagonal elements
round(sum(Cy[off_diag]),6) ## off diagonals are 0 since PC's are orthogonal
## [1] 0
pca.df <- prcomp(scaled_features) ## Built in R function
## Eigenvalues of Cx = Variance Explained by PCs
round(eigenvalues,10) == round((pca.df$sdev)^2,10)
[1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[15] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
round(eigenvectors[,1],10) == round(pca.df$rotation[,1],10) ## Eigenvectors of Cx = PCs
FSP.1 FSW.1 SSP.1 SSW.1 ACE.1 DBF.1 WNR.1 UFE.1 BPC.1 BPW.1 NPA.1 NPW.1
TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
TPW.1 FSP.2 FSW.2 SSP.2 SSW.2 ACE.2 DBF.2 WNR.2 UFE.2 BPC.2 BPW.2 NPA.2
TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
NPW.2 TPW.2
TRUE TRUE
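One common follow-up (a sketch; output not shown) is the share of total variance each PC explains:

var_explained <- eigenvalues / sum(eigenvalues) ## each eigenvalue's share
round(cumsum(var_explained)[1:5], 3)            ## cumulative share of the first 5 PCs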
PC1 <- pca.df$x[,1] ## scores on the first principal component
mean_PC1 <- mean(PC1)
gen <- ifelse(PC1 > mean_PC1, "F", "M") ## split matches at the mean PC1 score
rows <- nrow(tennis) ## number of matches
sum(diag(table(gen, as.character(data$Gender))))/rows ## classification accuracy
[1] 0.7646
wine <- read.csv('http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data',
                 header=FALSE) ## the file has no header row
names(wine) <- c("class",'Alcohol','Malic','Ash','Alcalinity','Magnesium','Total_phenols',
                 'Flavanoids','NFphenols','Proanthocyanins','Color','Hue','Diluted','Proline')
str(wine[,1:7])
## 'data.frame': 177 obs. of 7 variables:
## $ class : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Alcohol : num 13.2 13.2 14.4 13.2 14.2 ...
## $ Malic : num 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 2.16 ...
## $ Ash : num 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 2.3 ...
## $ Alcalinity : num 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 18 ...
## $ Magnesium : int 100 101 113 118 112 96 121 97 98 105 ...
## $ Total_phenols: num 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 2.95 ...
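Mirroring the tennis example, the same PCA machinery applies to the wine features (a sketch; output not shown):

wine_scaled <- scale(wine[,-1]) ## drop the class label, then center and scale
wine_pca <- prcomp(wine_scaled)
summary(wine_pca) ## variance explained by each component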