Code from Chapter 4, 'Niezbędnik statystyka' ('The statistician's toolkit'), of 'Przewodnik po programie R', 4th edition.

To install and load the Przewodnik package, run the two lines below.

devtools::install_github("pbiecek/PrzewodnikPakiet")
library("Przewodnik")

4.1. Random number generators

4.1.1. Introduction to pseudorandom number generators

(RNGkind("Super", "Box"))
## [1] "Mersenne-Twister" "Inversion"
(RNGkind("Super", "Ahr"))
## [1] "Super-Duper" "Box-Muller"
.Random.seed[1:6]
## [1]         102 -1448306788   981750765          NA          NA          NA
save.seed <- .Random.seed
rnorm(9)
## [1] -0.7360222 -1.1023027  0.3359402  0.4308841  0.3730457  1.0314422
## [7]  0.1541606 -0.1940883 -2.4327144
.Random.seed <- save.seed
rnorm(9)
## [1] -0.7360222 -1.1023027  0.3359402  0.4308841  0.3730457  1.0314422
## [7]  0.1541606 -0.1940883 -2.4327144
set.seed(1313)
runif(10)
##  [1] 0.27578588 0.06637362 0.82379757 0.52979504 0.91424061 0.51159113
##  [7] 0.94690240 0.56341309 0.61462777 0.54047574
set.seed(1313)
runif(10)
##  [1] 0.27578588 0.06637362 0.82379757 0.52979504 0.91424061 0.51159113
##  [7] 0.94690240 0.56341309 0.61462777 0.54047574

4.1.2. Popular distributions of random variables

runif(5)
## [1] 0.7338347 0.1627882 0.1364178 0.4895255 0.6499082
punif(0.5)
## [1] 0.5
dunif(0.5)
## [1] 1
qunif(0.1)
## [1] 0.1
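
A quick aside, not part of the original listing: for each distribution the q* (quantile) function is the inverse of the p* (distribution) function, which is easy to check directly.

qunif(punif(0.7))   # returns 0.7: qunif() inverts punif()
qnorm(pnorm(1.3))   # the same holds for the normal distribution, returning 1.3
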
x <- seq(-4,4,by=0.01)
plot(x, dnorm(x), type="l", lwd=3, cex.axis=1.5, cex.lab=1.5)
par(usr=c(-4,4,-0.04,1.04))

lines(x, pnorm(x), lty=2, lwd=3, cex.axis=1.5, cex.lab=1.5)
axis(side=4, cex.axis=1.5, cex.lab=1.5)
mtext(side=4, "pnorm()", line=2.5, cex.axis=1.5, cex=1.5)

pnorm(-3) + (1 - pnorm(3))
## [1] 0.002699796
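
Equivalently (an aside, not from the original listing), by the symmetry of the normal distribution the same two-sided tail probability can be obtained from a single call.

2 * pnorm(3, lower.tail = FALSE)   # also 0.002699796
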
dnorm(-1:1, mean=0, sd=1)
## [1] 0.2419707 0.3989423 0.2419707
qnorm(c(0.001, 0.025, 0.05, 0.5, 0.95, 0.975, 0.999))
## [1] -3.090232 -1.959964 -1.644854  0.000000  1.644854  1.959964  3.090232
rnorm(10, mean = 2, sd = 1)
##  [1]  1.36302924  1.63030216 -0.02528017  3.19469911  0.91553561
##  [6]  2.07589592  1.68723109  2.52672495  1.45562343  3.24331153
rnorm(10, mean = 1:10, sd=1:10)
##  [1]  1.0029785  0.2818232  2.5886568  5.0915514  1.5614802  9.7741237
##  [7] 12.2775071 -7.7713310 13.8131033  5.7068155

4.1.3. Selected methods for generating random variables

4.1.3.1. The Poisson process

rPoisProcess <- function (T=1, lambda=1) {
  # the number of events in [0, T] is Poisson(T*lambda);
  # given that number, the event times are i.i.d. uniform on [0, T]
  N <- rpois(1, T*lambda)
  sort(runif(N,0,T))
}

rPoisProcess(T=10, lambda=0.2)
## [1] 3.692069
rRandomField <- function (T1=1, T2=1, lambda=1) {
  # a homogeneous Poisson random field on the rectangle [0,T1] x [0,T2]
  N <- rpois(1, T1*T2*lambda)
  data.frame(x=runif(N,0,T1), y=runif(N,0,T2))
}
rRandomField(T1=10, T2=10, lambda=0.02)
## [1] x y
## <0 rows> (or 0-length row.names)

4.1.3.2. The multivariate normal distribution

set.seed(1313)
n <- 100; d <- 100; N <- 10000
Sigma <- cov(matrix(rnorm(n*d),n,d))
X <- matrix(rnorm(n*d),n,d)
system.time(replicate(N, {
  Q <- chol(Sigma)
  X %*% Q
} ))
##    user  system elapsed 
##  10.317   0.919  11.734
system.time(replicate(N, {
  tmp <- svd(Sigma)
  X %*% (tmp$u %*% diag(sqrt(tmp$d)) %*% t(tmp$v))
}))
##    user  system elapsed 
##  82.807   5.601  89.890
system.time(replicate(N, {
  tmp <- eigen(Sigma, symmetric=T)
  X %*% (tmp$vectors %*% diag(sqrt(abs(tmp$values))) %*% t(tmp$vectors))
}))
##    user  system elapsed 
##  50.521   2.410  53.160
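
As a sanity check not present in the original listing, the factors computed above reproduce Sigma, which can be verified directly on the objects already in the workspace (the differences are only rounding error).

Q <- chol(Sigma)
max(abs(crossprod(Q) - Sigma))   # crossprod(Q) = t(Q) %*% Q reproduces Sigma
tmp <- eigen(Sigma, symmetric=TRUE)
S <- tmp$vectors %*% diag(sqrt(abs(tmp$values))) %*% t(tmp$vectors)
max(abs(S %*% S - Sigma))        # the symmetric square root also reproduces Sigma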

4.1.3.3. Copulas (coupling functions)

library(copula)
(norm.cop <- normalCopula(0.5))
## Normal copula, dim. d = 2 
## Dimension:  2 
## Parameters:
##   rho.1   = 0.5
str(norm.cop)
## Formal class 'normalCopula' [package "copula"] with 8 slots
##   ..@ dispstr     : chr "ex"
##   ..@ getRho      :function (obj)  
##   ..@ dimension   : int 2
##   ..@ parameters  : num 0.5
##   ..@ param.names : chr "rho.1"
##   ..@ param.lowbnd: num -1
##   ..@ param.upbnd : num 1
##   ..@ fullname    : chr "<deprecated slot>"
x <- rCopula(1000, norm.cop)
head(x)
##           [,1]       [,2]
## [1,] 0.5499854 0.69734644
## [2,] 0.8834138 0.50708612
## [3,] 0.8137926 0.03605611
## [4,] 0.7966665 0.66943528
## [5,] 0.7677999 0.61536497
## [6,] 0.6369891 0.99347898
N <- 40
clayton.cop <- claytonCopula(1, dim = 2)
x <- rCopula(N, clayton.cop)
y <- cbind(qexp(x[,1],rate=2), qexp(x[,2],rate=2))
head(y)
##           [,1]      [,2]
## [1,] 0.6404904 0.3348808
## [2,] 0.1270878 0.4246461
## [3,] 0.3249810 0.8117413
## [4,] 0.2593331 0.4624904
## [5,] 0.6669418 0.3084084
## [6,] 0.3301213 0.1718270
N <- 40
pv <- matrix(0, 2, 1000)
clayton.cop <- claytonCopula(1, dim = 2)
rownames(pv) <- c("Pearson", "Spearman")

for (i in 1:1000) {
  x <- rCopula(N, clayton.cop)
  y <- cbind(qexp(x[,1],2), qexp(x[,2],2))
  pv[1, i] <- cor.test(y[,1], y[,2], method="pearson")$p.value
  pv[2, i] <- cor.test(y[,1], y[,2], method="spearman")$p.value
}

rowMeans(pv < 0.05)
##  Pearson Spearman 
##    0.471    0.851

4.1.4. Estimation of distribution parameters

library(MASS)
wek <- rlnorm(100)
fitdistr(wek, "normal")
##      mean         sd    
##   1.3491925   1.4312316 
##  (0.1431232) (0.1012034)
fitdistr(wek, "gamma", list(shape=3, rate=3))
##      shape       rate   
##   1.4143092   1.0482635 
##  (0.1812709) (0.1606885)

4.2. Data preprocessing

4.2.1. Missing observations

library(dprep)
data(hepatitis)
dim(hepatitis)
## [1] 155  20
sum(is.na(hepatitis))
## [1] 167
sum(apply(is.na(hepatitis),1,max))
## [1] 75
indeksyKompletnych = complete.cases(hepatitis)
dim(hepatitis[indeksyKompletnych,])
## [1] 80 20
summary(hepatitis[,19])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00   46.00   61.00   61.85   76.25  100.00      67
summary(na.omit(hepatitis[,19]))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   46.00   61.00   61.85   76.25  100.00
library(Hmisc)
wektor <- c(1,5,NA,2,8,8,3,NA,10)
impute(wektor, 2.5)
##     1     2     3     4     5     6     7     8     9 
##   1.0   5.0  2.5*   2.0   8.0   8.0   3.0  2.5*  10.0
impute(wektor, mean)
##          1          2          3          4          5          6 
##   1.000000   5.000000  5.285714*   2.000000   8.000000   8.000000 
##          7          8          9 
##   3.000000  5.285714*  10.000000
impute(wektor, "random")
##   1   2   3   4   5   6   7   8   9 
##   1   5  1*   2   8   8   3 10*  10
imputacjemedian <- e1071::impute(hepatitis, "median")
imputacjemean <- ce.mimp(hepatitis, "mean", atr=1:20)
## 
## Summary of imputations using substitution of  mean (mode for nominal features):
##        Row Column Class Imput.value
##   [1,]   1     19     2   66.571429
##   [2,]   2     19     2   66.571429
##   [3,]   3     19     2   66.571429
##   [4,]   4      4     2    1.540984
##   [5,]   5     16     2  101.313725
##   [6,]   5     19     2   66.571429
##   [7,]   7     15     1    2.543333
##   [8,]   7     16     1  122.375000
##   [9,]   7     17     1   99.833333
##  [10,]   7     18     1    3.151852
##  [11,]   7     19     1   43.500000
##  [12,]   8     16     2  101.313725
##  [13,]   8     17     2   82.438017
##  [14,]   8     18     2    3.977679
##  [15,]   8     19     2   66.571429
##  [16,]   9     16     2  101.313725
##  [17,]   9     19     2   66.571429
##  [18,]  10     16     2  101.313725
##  [19,]  10     19     2   66.571429
##  [20,]  15     15     2    1.146218
##  [21,]  15     16     2  101.313725
##  [22,]  15     18     2    3.977679
##  [23,]  15     19     2   66.571429
##  [24,]  17     19     2   66.571429
##  [25,]  27     19     2   66.571429
##  [26,]  32      9     1    1.888889
##  [27,]  32     10     1    1.518519
##  [28,]  32     16     1  122.375000
##  [29,]  32     18     1    3.151852
##  [30,]  32     19     1   43.500000
##  [31,]  36     19     1   43.500000
##  [32,]  38     19     2   66.571429
##  [33,]  42      9     2    1.813559
##  [34,]  42     10     2    1.598291
##  [35,]  42     11     2    1.848739
##  [36,]  42     12     2    1.756303
##  [37,]  42     13     2    1.949580
##  [38,]  42     14     2    1.941176
##  [39,]  42     19     2   66.571429
##  [40,]  45     15     2    1.146218
##  [41,]  45     16     2  101.313725
##  [42,]  45     18     2    3.977679
##  [43,]  45     19     2   66.571429
##  [44,]  46     19     2   66.571429
##  [45,]  47     19     2   66.571429
##  [46,]  51     19     2   66.571429
##  [47,]  52     19     2   66.571429
##  [48,]  56     18     2    3.977679
##  [49,]  56     19     2   66.571429
##  [50,]  57      6     2    1.426230
##  [51,]  57      7     2    1.688525
##  [52,]  57      8     2    1.819672
##  [53,]  57      9     2    1.813559
##  [54,]  57     10     2    1.598291
##  [55,]  57     11     2    1.848739
##  [56,]  57     12     2    1.756303
##  [57,]  57     13     2    1.949580
##  [58,]  57     14     2    1.941176
##  [59,]  57     15     2    1.146218
##  [60,]  57     16     2  101.313725
##  [61,]  57     17     2   82.438017
##  [62,]  57     18     2    3.977679
##  [63,]  57     19     2   66.571429
##  [64,]  60     18     2    3.977679
##  [65,]  60     19     2   66.571429
##  [66,]  66     16     2  101.313725
##  [67,]  66     19     2   66.571429
##  [68,]  67     19     2   66.571429
##  [69,]  68     16     1  122.375000
##  [70,]  70     19     2   66.571429
##  [71,]  71     19     2   66.571429
##  [72,]  72     18     1    3.151852
##  [73,]  72     19     1   43.500000
##  [74,]  73      9     2    1.813559
##  [75,]  73     10     2    1.598291
##  [76,]  73     11     2    1.848739
##  [77,]  73     12     2    1.756303
##  [78,]  73     13     2    1.949580
##  [79,]  73     14     2    1.941176
##  [80,]  73     19     2   66.571429
##  [81,]  74     16     2  101.313725
##  [82,]  75     19     2   66.571429
##  [83,]  77     19     1   43.500000
##  [84,]  80     19     2   66.571429
##  [85,]  81     16     2  101.313725
##  [86,]  81     19     2   66.571429
##  [87,]  84     11     2    1.848739
##  [88,]  84     12     2    1.756303
##  [89,]  84     13     2    1.949580
##  [90,]  84     14     2    1.941176
##  [91,]  84     19     2   66.571429
##  [92,]  87     18     1    3.151852
##  [93,]  88     19     1   43.500000
##  [94,]  89     19     1   43.500000
##  [95,]  92     16     1  122.375000
##  [96,]  92     19     1   43.500000
##  [97,]  93      9     2    1.813559
##  [98,]  93     10     2    1.598291
##  [99,]  93     16     2  101.313725
## [100,]  93     19     2   66.571429
## [101,]  94     16     2  101.313725
## [102,]  94     19     2   66.571429
## [103,]  98     19     2   66.571429
## [104,] 100     15     2    1.146218
## [105,] 100     16     2  101.313725
## [106,] 100     18     2    3.977679
## [107,] 100     19     2   66.571429
## [108,] 102     16     2  101.313725
## [109,] 102     18     2    3.977679
## [110,] 102     19     2   66.571429
## [111,] 106     16     2  101.313725
## [112,] 106     19     2   66.571429
## [113,] 107      9     1    1.888889
## [114,] 107     10     1    1.518519
## [115,] 107     19     1   43.500000
## [116,] 108     16     2  101.313725
## [117,] 108     18     2    3.977679
## [118,] 108     19     2   66.571429
## [119,] 111     19     2   66.571429
## [120,] 113     19     2   66.571429
## [121,] 114     19     2   66.571429
## [122,] 115     19     2   66.571429
## [123,] 116     18     2    3.977679
## [124,] 116     19     2   66.571429
## [125,] 117     16     2  101.313725
## [126,] 117     19     2   66.571429
## [127,] 119      9     1    1.888889
## [128,] 119     10     1    1.518519
## [129,] 119     15     1    2.543333
## [130,] 119     16     1  122.375000
## [131,] 119     17     1   99.833333
## [132,] 119     18     1    3.151852
## [133,] 119     19     1   43.500000
## [134,] 120     19     2   66.571429
## [135,] 121     19     1   43.500000
## [136,] 123     18     2    3.977679
## [137,] 123     19     2   66.571429
## [138,] 124     16     2  101.313725
## [139,] 124     19     2   66.571429
## [140,] 127      9     2    1.813559
## [141,] 127     10     2    1.598291
## [142,] 127     16     2  101.313725
## [143,] 127     19     2   66.571429
## [144,] 132     16     1  122.375000
## [145,] 132     19     1   43.500000
## [146,] 133     19     2   66.571429
## [147,] 137     16     2  101.313725
## [148,] 137     19     2   66.571429
## [149,] 141     19     2   66.571429
## [150,] 142      9     1    1.888889
## [151,] 142     10     1    1.518519
## [152,] 143     16     2  101.313725
## [153,] 145     16     1  122.375000
## [154,] 145     19     1   43.500000
## [155,] 147     19     1   43.500000
## [156,] 148      9     1    1.888889
## [157,] 148     10     1    1.518519
## [158,] 148     11     1    1.612903
## [159,] 148     12     1    1.290323
## [160,] 148     13     1    1.548387
## [161,] 148     14     1    1.645161
## [162,] 149     10     2    1.598291
## [163,] 149     19     2   66.571429
## [164,] 150     19     2   66.571429
## [165,] 151     16     1  122.375000
## [166,] 152     19     2   66.571429
## [167,] 153     19     2   66.571429
## 
## Total number of imputations per class: 
## 
##   1   2 
##  45 122 
## 
## Total number of imputations:  167
imputacjeknn <- ec.knnimp(hepatitis, k=10)

library(rms)
replikacje <- aregImpute(~V18+V12+V6, n.impute=100, data=hepatitis, pr=FALSE)
modelRepl <- fit.mult.impute(V18~V6+V12, ols, replikacje, data=hepatitis)
## 
## Variance Inflation Factors Due to Imputation:
## 
## Intercept        V6       V12 
##      1.12      1.08      1.15 
## 
## Rate of Missing Information:
## 
## Intercept        V6       V12 
##      0.11      0.08      0.13 
## 
## d.f. for t-distribution for Tests of Single Coefficients:
## 
## Intercept        V6       V12 
##   8927.47  17226.98   5657.04 
## 
## The following fit components were averaged over the 100 model fits:
## 
##   fitted.values stats linear.predictors
summary.lm(modelRepl)
## 
## Call:
## fit.mult.impute(formula = V18 ~ V6 + V12, fitter = ols, xtrans = replikacje, 
##     data = hepatitis)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7375 -0.2562  0.0438  0.3624  2.2438 
## 
## Coefficients:
##           Estimate Std. Error t value Pr(>|t|)    
## Intercept   2.9095     0.1974  14.737  < 2e-16 ***
## V6          0.3046     0.1118   2.726  0.00717 ** 
## V12         0.3061     0.1133   2.703  0.00766 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6147 on 152 degrees of freedom
## Multiple R-squared:  0.1342, Adjusted R-squared:  0.1228 
## F-statistic: 11.78 on 2 and 152 DF,  p-value: 1.747e-05
modelBezRepl <- lm(V18~V6+V12, data=hepatitis)
summary(modelBezRepl)
## 
## Call:
## lm(formula = V18 ~ V6 + V12, data = hepatitis)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.72163 -0.27160 -0.00778  0.37837  2.27147 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.8870     0.2047  14.104  < 2e-16 ***
## V6            0.3069     0.1167   2.630  0.00956 ** 
## V12           0.3139     0.1173   2.677  0.00838 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6003 on 132 degrees of freedom
##   (20 observations deleted due to missingness)
## Multiple R-squared:  0.1469, Adjusted R-squared:  0.1339 
## F-statistic: 11.36 on 2 and 132 DF,  p-value: 2.8e-05

4.2.2. Normalization, scaling and nonlinear transformations

library(Przewodnik)
cov(daneSoc[,c(1,6,7)])
##                             wiek cisnienie.skurczowe cisnienie.rozkurczowe
## wiek                  191.742176           -6.016662             -8.941249
## cisnienie.skurczowe    -6.016662          246.903989             82.809186
## cisnienie.rozkurczowe  -8.941249           82.809186             60.324616
cov(scale(daneSoc[,c(1,6,7)]))
##                              wiek cisnienie.skurczowe
## wiek                   1.00000000         -0.02765239
## cisnienie.skurczowe   -0.02765239          1.00000000
## cisnienie.rozkurczowe -0.08313656          0.67852707
##                       cisnienie.rozkurczowe
## wiek                            -0.08313656
## cisnienie.skurczowe              0.67852707
## cisnienie.rozkurczowe            1.00000000
wektor.sd <- apply(daneSoc[,c(1,6,7)], 2, sd)
head( sweep(daneSoc[,c(1,6,7)], 2, FUN= "/", wektor.sd) )
##       wiek cisnienie.skurczowe cisnienie.rozkurczowe
## 1 5.055210            9.100641             10.686385
## 2 4.766341            7.827824             10.300130
## 3 5.127427           10.628021             10.300130
## 4 4.116385            9.546127             11.201391
## 5 3.249778            8.273310             10.686385
## 6 3.466430            8.782437              9.656372
head( apply(daneSoc[,c(1,6,7)], 2, function(x) x/sd(x)) )
##          wiek cisnienie.skurczowe cisnienie.rozkurczowe
## [1,] 5.055210            9.100641             10.686385
## [2,] 4.766341            7.827824             10.300130
## [3,] 5.127427           10.628021             10.300130
## [4,] 4.116385            9.546127             11.201391
## [5,] 3.249778            8.273310             10.686385
## [6,] 3.466430            8.782437              9.656372
library("Przewodnik")
Lbox <- boxcox(daneO[,"VEGF"]~daneO[,"Rozmiar.guza"])

Lbox$x[which.max(Lbox$y)]
## [1] -0.02020202
Lbox <- boxcox(daneO[,"Wiek"]~daneO[,"Rozmiar.guza"])

Lbox$x[which.max(Lbox$y)]
## [1] 2
zmOryginalna <- c(0.1,0.8,0.5,0.2,0.9,0.7)
(zmZmieniona <- cut(zmOryginalna, c(0,0.33,0.66,1), c("niski", "sredni", "wysoki")))
## [1] niski  wysoki sredni niski  wysoki wysoki
## Levels: niski sredni wysoki
levels(zmZmieniona) <- c("niewysoki", "niewysoki", "wysoki")
zmZmieniona
## [1] niewysoki wysoki    niewysoki niewysoki wysoki    wysoki   
## Levels: niewysoki wysoki
tDaneO <- transform(daneO, logVEGF = log(VEGF),
                    fNowo4wor = factor(Nowotwor), cutWiek = cut(Wiek,4))
head(tDaneO, 3)
##   Wiek Rozmiar.guza Wezly.chlonne Nowotwor Receptory.estrogenowe
## 1   29            1             0        2                   (-)
## 2   29            1             0        2                  (++)
## 3   30            1             1        2                   (-)
##   Receptory.progesteronowe Niepowodzenia Okres.bez.wznowy VEGF  logVEGF
## 1                     (++)          brak               22  914 6.817831
## 2                     (++)          brak               53 1118 7.019297
## 3                      (+)          brak               38  630 6.445720
##   fNowo4wor cutWiek
## 1         2 (29,36]
## 2         2 (29,36]
## 3         2 (29,36]

4.3. Analysis of variance, linear and logistic regression

4.3.2. One-way analysis

library("Przewodnik")
summary(mieszkania)
##       cena            pokoi       powierzchnia         dzielnica 
##  Min.   : 83280   Min.   :1.00   Min.   :17.00   Biskupin   :65  
##  1st Qu.:143304   1st Qu.:2.00   1st Qu.:31.15   Krzyki     :79  
##  Median :174935   Median :3.00   Median :43.70   Srodmiescie:56  
##  Mean   :175934   Mean   :2.55   Mean   :46.20                   
##  3rd Qu.:208741   3rd Qu.:3.00   3rd Qu.:61.40                   
##  Max.   :295762   Max.   :4.00   Max.   :87.70                   
##      typ.budynku
##  kamienica :61  
##  niski blok:63  
##  wiezowiec :76  
##                 
##                 
## 
(a1 <- anova(lm(cena~dzielnica, data = mieszkania)))
## Analysis of Variance Table
## 
## Response: cena
##            Df     Sum Sq    Mean Sq F value   Pr(>F)   
## dzielnica   2 1.7995e+10 8997691613  5.0456 0.007294 **
## Residuals 197 3.5130e+11 1783263361                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(a2 <- anova(lm(cena~typ.budynku, data = mieszkania)))
## Analysis of Variance Table
## 
## Response: cena
##              Df     Sum Sq    Mean Sq F value   Pr(>F)   
## typ.budynku   2 2.2770e+10 1.1385e+10  6.4725 0.001895 **
## Residuals   197 3.4653e+11 1.7590e+09                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
a1[1,5]
## [1] 0.007294371
a2[1,5]
## [1] 0.001894708
summary(model1 <- aov(cena~dzielnica, data = mieszkania))
##              Df    Sum Sq   Mean Sq F value  Pr(>F)   
## dzielnica     2 1.800e+10 8.998e+09   5.046 0.00729 **
## Residuals   197 3.513e+11 1.783e+09                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(model1)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = cena ~ dzielnica, data = mieszkania)
## 
## $dzielnica
##                            diff       lwr        upr     p adj
## Krzyki-Biskupin      -21321.019 -38021.10 -4620.9333 0.0081457
## Srodmiescie-Biskupin -18350.541 -36532.88  -168.2053 0.0473579
## Srodmiescie-Krzyki     2970.478 -14450.28 20391.2340 0.9145465
plot(TukeyHSD(model1))

plot(cena ~ dzielnica, data = mieszkania)

library("agricolae")
df <- df.residual(model1)
MSerror <- deviance(model1)/df
HSD.test(mieszkania$cena, mieszkania$dzielnica, df, MSerror)

t(contr.treatment(5))
##   1 2 3 4 5
## 2 0 1 0 0 0
## 3 0 0 1 0 0
## 4 0 0 0 1 0
## 5 0 0 0 0 1
t(contr.helmert(5))
##       1  2  3  4 5
## [1,] -1  1  0  0 0
## [2,] -1 -1  2  0 0
## [3,] -1 -1 -1  3 0
## [4,] -1 -1 -1 -1 4
kontr <- contr.helmert(3)
model <- lm(cena~dzielnica, data=mieszkania,
            contrasts = list(dzielnica=kontr))
summary(model)
## 
## Call:
## lm(formula = cena ~ dzielnica, data = mieszkania, contrasts = list(dzielnica = kontr))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -84893 -31892   -880  29611 106268 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   176270       3016  58.450  < 2e-16 ***
## dzielnica1    -10660       3536  -3.015  0.00291 ** 
## dzielnica2     -2563       2220  -1.155  0.24958    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 42230 on 197 degrees of freedom
## Multiple R-squared:  0.04873,    Adjusted R-squared:  0.03907 
## F-statistic: 5.046 on 2 and 197 DF,  p-value: 0.007294
kontr <- cbind(c(2, -1, -1), c(0, 1, -1))
model <- lm(cena~dzielnica, data=mieszkania,
            contrasts = list(dzielnica=kontr))
summary(model)$coef
##               Estimate Std. Error    t value      Pr(>|t|)
## (Intercept) 176270.141   3015.732 58.4501978 2.070588e-126
## dzielnica1    6611.927   2135.391  3.0963539  2.244773e-03
## dzielnica2   -1485.239   3688.392 -0.4026793  6.876208e-01

4.3.3. Multi-way analysis

(a3 <- anova(lm(cena~dzielnica+typ.budynku, data = mieszkania)))
## Analysis of Variance Table
## 
## Response: cena
##              Df     Sum Sq    Mean Sq F value   Pr(>F)   
## dzielnica     2 1.7995e+10 8.9977e+09  5.3397 0.005524 **
## typ.budynku   2 2.2719e+10 1.1359e+10  6.7413 0.001476 **
## Residuals   195 3.2858e+11 1.6850e+09                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
a3[1:2,5]
## [1] 0.005524071 0.001475884
plot.design(mieszkania[,c("dzielnica", "typ.budynku", "cena")])

interaction.plot(mieszkania$dzielnica, mieszkania$typ.budynku, mieszkania$cena, lwd=3)

a4 <- anova(lm(cena~dzielnica*typ.budynku, data = mieszkania))
a4 <- anova(lm(cena~(dzielnica+typ.budynku)^2, data = mieszkania))
(a4 <- anova(lm(cena~dzielnica+typ.budynku+dzielnica:typ.budynku, data = mieszkania)))
## Analysis of Variance Table
## 
## Response: cena
##                        Df     Sum Sq    Mean Sq F value   Pr(>F)   
## dzielnica               2 1.7995e+10 8.9977e+09  5.2461 0.006054 **
## typ.budynku             2 2.2719e+10 1.1359e+10  6.6231 0.001656 **
## dzielnica:typ.budynku   4 9.9528e+08 2.4882e+08  0.1451 0.964995   
## Residuals             191 3.2759e+11 1.7151e+09                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(lm(cena~dzielnica+typ.budynku, data = mieszkania))[1:2,5]
## [1] 0.005524071 0.001475884
anova(lm(cena~dzielnica, data = mieszkania))[1,5]
## [1] 0.007294371
model <- manova(cbind(cena,powierzchnia)~dzielnica+typ.budynku, data=mieszkania)
summary(model, test="Hotelling-Lawley")
##              Df Hotelling-Lawley approx F num Df den Df    Pr(>F)    
## dzielnica     2          0.86337   41.658      4    386 < 2.2e-16 ***
## typ.budynku   2          0.32249   15.560      4    386 8.198e-12 ***
## Residuals   195                                                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

4.3.4. Regression

modelPP <- lm(cena~powierzchnia+pokoi, data = mieszkania)
modelPP$coefficients
##  (Intercept) powierzchnia        pokoi 
##   82407.0883    2070.8966    -840.1008
modelPP$coeff
##  (Intercept) powierzchnia        pokoi 
##   82407.0883    2070.8966    -840.1008
summary(modelPP)
## 
## Call:
## lm(formula = cena ~ powierzchnia + pokoi, data = mieszkania)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -39705  -9386   -863   9454  35097 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   82407.1     2569.9  32.066   <2e-16 ***
## powierzchnia   2070.9      149.2  13.883   <2e-16 ***
## pokoi          -840.1     2765.1  -0.304    0.762    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14110 on 197 degrees of freedom
## Multiple R-squared:  0.8937, Adjusted R-squared:  0.8926 
## F-statistic: 828.3 on 2 and 197 DF,  p-value: < 2.2e-16
podsumowanieModeluPP <- summary(modelPP)
podsumowanieModeluPP$coef[2:3,4]
## powierzchnia        pokoi 
## 5.241555e-31 7.615836e-01
podsumowanieModeluPP$coef[,1]
##  (Intercept) powierzchnia        pokoi 
##   82407.0883    2070.8966    -840.1008
c(podsumowanieModeluPP$r.squared, podsumowanieModeluPP$adj.r.squared)
## [1] 0.8937227 0.8926437
summary(lm(cena~powierzchnia+pokoi+dzielnica, data = mieszkania))
## 
## Call:
## lm(formula = cena ~ powierzchnia + pokoi + dzielnica, data = mieszkania)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -30501  -8480   -144   7346  35730 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           94222.02    2320.36  40.607  < 2e-16 ***
## powierzchnia           2022.99     116.31  17.393  < 2e-16 ***
## pokoi                    34.36    2157.52   0.016    0.987    
## dzielnicaKrzyki      -20934.86    1842.79 -11.360  < 2e-16 ***
## dzielnicaSrodmiescie -12722.60    2008.03  -6.336  1.6e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11000 on 195 degrees of freedom
## Multiple R-squared:  0.9362, Adjusted R-squared:  0.9348 
## F-statistic: 714.8 on 4 and 195 DF,  p-value: < 2.2e-16
modelDP <- lm(cena~powierzchnia+dzielnica, data = mieszkania)
ndane <- data.frame(powierzchnia=c(48,68), dzielnica=c("Biskupin", "Krzyki"))
predict(modelDP, newdata = ndane)
##        1        2 
## 191415.9 210976.9
vif(lm(cena~powierzchnia+pokoi, data=mieszkania))
## powierzchnia        pokoi 
##     8.961022     8.961022
vif(lm(rnorm(200)~powierzchnia+pokoi+cena, data=mieszkania))
## powierzchnia        pokoi         cena 
##    17.727848     8.965221     9.409346

4.3.5. Introduction to logistic regression

library("Przewodnik")
modelNV <- glm(Niepowodzenia~Nowotwor+log(VEGF), data=daneO, family="binomial")
summary(modelNV)
## 
## Call:
## glm(formula = Niepowodzenia ~ Nowotwor + log(VEGF), family = "binomial", 
##     data = daneO)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3475  -0.4597  -0.2263  -0.1132   2.3991  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -17.3914     4.3757  -3.975 7.05e-05 ***
## Nowotwor      2.2586     0.7657   2.950  0.00318 ** 
## log(VEGF)     1.3293     0.4250   3.128  0.00176 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 73.052  on 85  degrees of freedom
## Residual deviance: 47.152  on 83  degrees of freedom
##   (11 observations deleted due to missingness)
## AIC: 53.152
## 
## Number of Fisher Scoring iterations: 6
modelN <- glm(Niepowodzenia~Nowotwor, daneO, family="binomial")
ndaneO <- data.frame(Nowotwor=c(1, 2, 3))
predict(modelN, ndaneO, type = "response")
##          1          2          3 
## 0.01125905 0.07249761 0.34918513
cdplot(daneO$VEGF, daneO$Niepowodzenia, bw=3000)

cdplot(mieszkania$cena, mieszkania$typ.budynku)

library(pscl)
modelN <- glm(Niepowodzenia~Nowotwor, daneO, family="binomial")
pR2(modelN)
##         llh     llhNull          G2    McFadden        r2ML        r2CU 
## -31.0351909 -38.2140122  14.3576426   0.1878584   0.1537575   0.2611330
x <- runif(100)*3
y <- x^2.5-5+rnorm(100,0,3)
model <- nls(y ~ x^a - b, start = list(a = 2, b = 2))
summary(model)
## 
## Formula: y ~ x^a - b
## 
## Parameters:
##   Estimate Std. Error t value Pr(>|t|)    
## a  2.57256    0.05526   46.55   <2e-16 ***
## b  5.67171    0.35765   15.86   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.867 on 98 degrees of freedom
## 
## Number of iterations to convergence: 4 
## Achieved convergence tolerance: 1.33e-06
mojModel <- function(a,b,x) sin(a+b*x)
x <- runif(100)*6
y <- mojModel(pi/2,2,x)+rnorm(100,0,1)

model <- nls(y~mojModel(a,b,x), start=list(a=2,b=2), algorithm="plinear")
summary(model)
## 
## Formula: y ~ mojModel(a, b, x)
## 
## Parameters:
##      Estimate Std. Error t value Pr(>|t|)    
## a     1.48396    0.31354   4.733 7.53e-06 ***
## b     2.03427    0.09304  21.864  < 2e-16 ***
## .lin  0.97579    0.16210   6.020 3.11e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.134 on 97 degrees of freedom
## 
## Number of iterations to convergence: 4 
## Achieved convergence tolerance: 2.721e-06

4.4. Hypothesis testing

library(goftest)
x <- rnorm(100, 10, 1.4)
cvm.test(x)
## 
##  Cramer-von Mises test of goodness-of-fit
##  Null hypothesis: uniform distribution
## 
## data:  x
## omega2 = 33.333, p-value < 2.2e-16
(pwynik <- cvm.test(x)$p.value)
## [1] 0
library(car)
y <- rnorm(100)
qqnorm(y)
qqline(y, col = "red")

qqPlot(y)

4.4.1. Goodness-of-fit tests for the uniform distribution

przedzialy <- seq(0,1,0.2)
(x <- table(cut(runif(100), przedzialy)))
## 
##   (0,0.2] (0.2,0.4] (0.4,0.6] (0.6,0.8]   (0.8,1] 
##        19        23        15        23        20
chisq.test(x)
## 
##  Chi-squared test for given probabilities
## 
## data:  x
## X-squared = 2.2, df = 4, p-value = 0.699
(x <- table(cut(rbeta(100,3,2), przedzialy)))
## 
##   (0,0.2] (0.2,0.4] (0.4,0.6] (0.6,0.8]   (0.8,1] 
##         4        18        28        31        19
chisq.test(x)
## 
##  Chi-squared test for given probabilities
## 
## data:  x
## X-squared = 22.3, df = 4, p-value = 0.0001747
x <- runif(100)
ks.test(x, "punif")
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  x
## D = 0.10216, p-value = 0.2475
## alternative hypothesis: two-sided
ks.test(x, "pnorm", 1, 1)
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  x
## D = 0.50278, p-value < 2.2e-16
## alternative hypothesis: two-sided
x <- runif(100); y <- rnorm(100)
ks.test(x, y)
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.46, p-value = 1.292e-09
## alternative hypothesis: two-sided
x <- rexp(100)
y <- rnorm(500)
qqplot(x, y)

4.4.2. Testing the equality of location parameters

x <- rnorm(50)
y <- x + 0.3 + rnorm(50,0,0.01)
t.test(y, mu=0)
## 
##  One Sample t-test
## 
## data:  y
## t = 2.4908, df = 49, p-value = 0.01618
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  0.07394866 0.69158208
## sample estimates:
## mean of x 
## 0.3827654
t.test(x, y)$p.value
## [1] 0.1717698
t.test(x, y, paired=TRUE)
## 
##  Paired t-test
## 
## data:  x and y
## t = -199.33, df = 49, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.3023476 -0.2963122
## sample estimates:
## mean of the differences 
##              -0.2993299
t.test(x, y, paired=T)$p.value
## [1] 5.921943e-73
wilcox.test(y, mu=0)$p.value
## [1] 0.02389165
wilcox.test(x, y)$p.value
## [1] 0.1337684

4.4.3. Testing the equality of scale parameters

x <- rnorm(20,2,1)
y <- rnorm(20,2,2)
z <- rnorm(20,2,3)

var.test(x,y)
## 
##  F test to compare two variances
## 
## data:  x and y
## F = 0.23729, num df = 19, denom df = 19, p-value = 0.002921
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.0939210 0.5994935
## sample estimates:
## ratio of variances 
##          0.2372868
ansari.test(x,y)$p.value
## [1] 0.0008723042
bartlett.test(list(x,y,z))$p.value
## [1] 1.746499e-05
mood.test(x,y)$p.value
## [1] 0.001481204
grupy <- gl(3,20); wart <- c(x,y,z)
fligner.test(wart~grupy, data.frame(wart, grupy))$p.value
## [1] 8.487498e-06
library(car)
leveneTest(wart,grupy)[1,3]
## [1] 2.91234e-06

4.4.4. Testing hypotheses about proportions

(wtestu <- prop.test(594, 1234, p=0.5))
## 
##  1-sample proportions test with continuity correction
## 
## data:  594 out of 1234, null probability 0.5
## X-squared = 1.641, df = 1, p-value = 0.2002
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.4531816 0.5096586
## sample estimates:
##         p 
## 0.4813614
wtestu$conf.int
## [1] 0.4531816 0.5096586
## attr(,"conf.level")
## [1] 0.95
x <- c(11,120,1300,14000,150000)
n <- c(100,1000,10000,100000,1000000)
x/n
## [1] 0.11 0.12 0.13 0.14 0.15
pairwise.prop.test(x, n, p.adjust.method = "bonferroni")
## 
##  Pairwise comparisons using Pairwise comparison of proportions 
## 
## data:  x out of n 
## 
##   1     2     3       4      
## 2 1.000 -     -       -      
## 3 1.000 1.000 -       -      
## 4 1.000 0.769 0.061   -      
## 5 1.000 0.090 2.7e-07 2.7e-16
## 
## P value adjustment method: bonferroni

4.4.5. Significance tests for the dependence between two variables

set.seed(227)
x <- rlnorm(50)
y <- x + rlnorm(50)
cor.test(x, y, method="pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = 11.924, df = 48, p-value = 5.876e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7721256 0.9212688
## sample estimates:
##       cor 
## 0.8646445
cor.test(x, y, method="spearman")
## 
##  Spearman's rank correlation rho
## 
## data:  x and y
## S = 5348, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.7431933
# Fisher's z-transformation of a correlation coefficient
get.z <- function(rho) log((1+rho)/(1-rho))/2

# p-value of the test of H0: rho = rho0, based on the Fisher z-transform
rho0.test <- function(rho, n, rho0=0, alternative="two.sided") {
  p.raw <- pnorm((get.z(rho) - get.z(rho0))/sqrt(1/(n-3)))
  if (alternative=="two.sided") return(1 - 2*abs(0.5-p.raw))
  if (alternative=="less") return(p.raw)
  if (alternative=="greater") return(1-p.raw)
}

# p-value of the test of H0: rho1 = rho2 for estimates from two independent samples
rho.rho.test <- function(rho1,n1, rho2,n2, alternative="two.sided"){
  p.raw <- pnorm((get.z(rho1) - get.z(rho2))/sqrt(1/(n1-3)+1/(n2-3)))
  if (alternative=="two.sided") return(1 - 2*abs(0.5 - p.raw))
  if (alternative=="less") return(p.raw)
  if (alternative=="greater") return(1-p.raw)
}
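
A usage sketch (these calls are not in the original listing), plugging in the Pearson estimate obtained above (about 0.8646 from n = 50 observations) and, for the two-sample version, two purely illustrative values.

rho0.test(0.8646, n=50)             # H0: rho = 0 is firmly rejected (p-value near 0)
rho0.test(0.8646, n=50, rho0=0.8)   # H0: rho = 0.8 is not rejected (p-value about 0.14)
rho.rho.test(0.86, 50, 0.74, 80)    # two hypothetical independent samples, p-value about 0.06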

tabela <- table(daneSoc$plec, daneSoc$praca)
tabela
##            
##             nie pracuje uczen lub pracuje
##   kobieta             7                48
##   mezczyzna          45               104
chiwynik <- chisq.test(tabela)
chiwynik
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  tabela
## X-squared = 5.5711, df = 1, p-value = 0.01826
chiwynik$expected
##            
##             nie pracuje uczen lub pracuje
##   kobieta      14.01961          40.98039
##   mezczyzna    37.98039         111.01961
fisher.test(tabela)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  tabela
## p-value = 0.0112
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.1199709 0.8308932
## sample estimates:
## odds ratio 
##  0.3386347
mcnemar.test(tabela)
## 
##  McNemar's Chi-squared test with continuity correction
## 
## data:  tabela
## McNemar's chi-squared = 0.043011, df = 1, p-value = 0.8357

4.4.5.4. The kappa coefficient of agreement

library(irr)
ocena1 <- factor(2 + trunc(runif(100)*4))
ocena2 <- factor(2 + trunc(runif(100)*4))
table(ocena1, ocena2)
##       ocena2
## ocena1  2  3  4  5
##      2 11  4 10  7
##      3  8  5  4  9
##      4  5  5  6  8
##      5  2  7  5  4
kappa2(cbind(ocena1, ocena2))
##  Cohen's Kappa for 2 Raters (Weights: unweighted)
## 
##  Subjects = 100 
##    Raters = 2 
##     Kappa = 0.0157 
## 
##         z = 0.276 
##   p-value = 0.783
table(ocena1, ocena2)
##       ocena2
## ocena1  2  3  4  5
##      2 11  4 10  7
##      3  8  5  4  9
##      4  5  5  6  8
##      5  2  7  5  4
kappa2(cbind(ocena1, ocena2), weight = "unweighted")
##  Cohen's Kappa for 2 Raters (Weights: unweighted)
## 
##  Subjects = 100 
##    Raters = 2 
##     Kappa = 0.0157 
## 
##         z = 0.276 
##   p-value = 0.783
kappa2(cbind(ocena1, ocena2), weight = "squared")
##  Cohen's Kappa for 2 Raters (Weights: squared)
## 
##  Subjects = 100 
##    Raters = 2 
##     Kappa = 0.0815 
## 
##         z = 0.839 
##   p-value = 0.401

4.4.6. Testing a set of hypotheses

pwart <- numeric(6)
for (i in 1:6)
  pwart[i] <- t.test(rnorm(20),rnorm(20,1))$p.value
p.adjust(pwart, method=c("BH"))
## [1] 0.0063051452 0.0306423393 0.0148567296 0.0438430956 0.0004657299
## [6] 0.0252220850

4.4.7. The distribution of the test statistic

N <- 10^5
getStat <- function(n=30, p=0.2) {
  x <- rbinom(n,1,p)
  y <- rbinom(n,1,p)
  tab <- table(x,y)
  chisq.test(tab)$statistic
}

st.testowa <- replicate(N, getStat())
plot(ecdf(st.testowa), xlim=c(0,5))
lines(sort(st.testowa), pchisq(sort(st.testowa),1), col="red", lwd=3)

x <- kidney[kidney$recipient.age < 40, "MDRD30"]
y <- kidney[kidney$recipient.age >= 40, "MDRD30"]
n <- length(x); m <- length(y); B <- 999

(theta <- ks.test(x,y)$statistic)
##        D 
## 0.219922
thetastar <- replicate(B, {
  zstar <- sample(c(x,y))
  xstar <- zstar[1:n]
  ystar <- zstar[(1:m)+n]
  ks.test(xstar, ystar)$statistic
})

(sum(thetastar >= theta) + 1) / (B+1)
## [1] 0.001

4.5. Bootstrap

library(boot)
# with stype="w" the statistic receives resampling weights that sum to 1,
# so sum(x*w) is the mean of the bootstrap sample
srednia <- function(x,w) sum(x*w)
wynikBoot <- boot(daneO$VEGF, srednia, R=999, stype="w")
quantile(wynikBoot$t, c(0.05/2, 1 - 0.05/2))
##     2.5%    97.5% 
## 1958.080 3389.398
plot(wynikBoot)

mean(daneO$VEGF)
## [1] 2626.608
boot.ci(wynikBoot, conf = 0.95)
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 999 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = wynikBoot, conf = 0.95)
## 
## Intervals : 
## Level      Normal              Basic         
## 95%   (1922, 3324 )   (1861, 3300 )  
## 
## Level     Percentile            BCa          
## 95%   (1953, 3392 )   (2039, 3491 )  
## Calculations and Intervals on Original Scale
# statistic of interest: the mean of the generated sample
VEGF.stat <- function(data) {
  mean(data)
}
# parametric generator: normal distribution with the estimated mean and sd
VEGF.sim <- function(data, mle) {
  rnorm(length(data), mean = mle[1], sd = mle[2])
}
VEGF.mle <- c(mean(daneO$VEGF), sd(daneO$VEGF))
bootParametryczny <- boot(daneO$VEGF, statistic = VEGF.stat, R = 999,
                          sim = "parametric", ran.gen = VEGF.sim, mle = VEGF.mle)

plot(bootParametryczny)

boot.ci(bootParametryczny, type = "perc", conf = 0.95)
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 999 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = bootParametryczny, conf = 0.95, type = "perc")
## 
## Intervals : 
## Level     Percentile     
## 95%   (1937, 3327 )  
## Calculations and Intervals on Original Scale

4.5.2. Bootstrap tests

statystykaT <- function(x,i) (mean(x[i]) - mean(x))/sd(x[i])
boostrapowyTestT <- function(wektor, mu0=0) {
  # draw 999 bootstrap replications and compute the statistic for each
  wartosciStatystykiT <- boot(wektor,statystykaT,R=999,stype="i")
  # empirical distribution function of the bootstrap statistics
  dystrybuantaStatystykiT <- ecdf(wartosciStatystykiT$t)
  T <- mean(wektor - mu0)/sd(wektor)
  # quantile corresponding to the observed statistic T in the bootstrap distribution of the test statistic
  kwantyl <- dystrybuantaStatystykiT(T)
  # p-value of the two-sided test
  1 - 2*abs(kwantyl-0.5)
}
boostrapowyTestT(daneO$VEGF, 3000)
## [1] 0.4304304
boostrapowyTestT(daneO$VEGF, 4000)
## [1] 0.02602603

4.6. Survival analysis

library("Przewodnik")
czasy <- with(daneO, Surv(Okres.bez.wznowy,Niepowodzenia=="wznowa"))
czasy[1:10]
##  [1] 22+ 53+ 38+ 26+ 19+ 36  33+ 38+ 38  37+

4.6.1. The survival curve

library(survival)
krzywaKM <- survfit(czasy~1)
summary(krzywaKM)
## Call: survfit(formula = czasy ~ 1)
## 
## 1 observation deleted due to missingness 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##    10     96       1    0.990  0.0104        0.969        1.000
##    16     95       1    0.979  0.0146        0.951        1.000
##    19     93       1    0.969  0.0178        0.934        1.000
##    21     90       1    0.958  0.0206        0.918        0.999
##    29     80       1    0.946  0.0236        0.901        0.993
##    30     77       1    0.934  0.0263        0.884        0.987
##    36     60       1    0.918  0.0301        0.861        0.979
##    38     51       1    0.900  0.0345        0.835        0.970
##    40     40       1    0.878  0.0403        0.802        0.960
##    41     39       1    0.855  0.0451        0.771        0.948
##    43     30       1    0.827  0.0518        0.731        0.935
##    48     15       1    0.771  0.0719        0.643        0.926
##    50      9       1    0.686  0.1030        0.511        0.921
plot(krzywaKM)

plot(survfit(czasy~daneO$Nowotwor))

survdiff(czasy~daneO$Nowotwor)
## Call:
## survdiff(formula = czasy ~ daneO$Nowotwor)
## 
## n=86, 11 observations deleted due to missingness.
## 
##                   N Observed Expected (O-E)^2/E (O-E)^2/V
## daneO$Nowotwor=1  7        0    0.913     0.913     0.991
## daneO$Nowotwor=2 53        4    8.532     2.408     7.091
## daneO$Nowotwor=3 26        9    3.554     8.344    11.552
## 
##  Chisq= 11.7  on 2 degrees of freedom, p= 0.00285

4.6.2. The Cox model

fit <- coxph(czasy~Nowotwor+Wiek, data=daneO)
(fit2 <- cox.zph(fit))
##             rho chisq     p
## Nowotwor 0.2266 0.692 0.406
## Wiek     0.0862 0.102 0.750
## GLOBAL       NA 0.821 0.663
summary(fit)
## Call:
## coxph(formula = czasy ~ Nowotwor + Wiek, data = daneO)
## 
##   n= 86, number of events= 13 
##    (11 observations deleted due to missingness)
## 
##              coef exp(coef) se(coef)      z Pr(>|z|)   
## Nowotwor  1.75372   5.77607  0.59566  2.944  0.00324 **
## Wiek     -0.05632   0.94524  0.04340 -1.297  0.19447   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##          exp(coef) exp(-coef) lower .95 upper .95
## Nowotwor    5.7761     0.1731    1.7973    18.563
## Wiek        0.9452     1.0579    0.8682     1.029
## 
## Concordance= 0.747  (se = 0.091 )
## Rsquare= 0.132   (max possible= 0.683 )
## Likelihood ratio test= 12.17  on 2 df,   p=0.002282
## Wald test            = 10  on 2 df,   p=0.006722
## Score (logrank) test = 11.62  on 2 df,   p=0.003004
library("partykit")
library("ipred")
library("TH.data")
data("GBSG2", package = "ipred")
GBSG2ct <- ctree(Surv(time, cens) ~ .,data = GBSG2)
plot(GBSG2ct)

4.7. Selected mathematical functions

4.7.1. Operations on sets

x <- 1:10
y <- 5:15
setdiff(union(x,y), intersect(x,y))
## [1]  1  2  3  4 11 12 13 14 15
setequal(x, setdiff(x, setdiff(y,x)))
## [1] TRUE
is.element(3, x)
## [1] TRUE

4.7.2. Arithmetic operations

1-exp(0.1^15)
## [1] -1.110223e-15
expm1(0.1^15)
## [1] 1e-15
log(1+0.1^20)
## [1] 0
log1p(0.1^20)
## [1] 1e-20

4.7.3. Polynomials

library(orthopolynom)
(p1 <- polynomial(c(2,0,1)))
## 2 + x^2
(p2 <- polynomial(c(2,2,1,1)))
## 2 + 2*x + x^2 + x^3
p1 + p2
## 4 + 2*x + 2*x^2 + x^3
p1 * p2
## 4 + 4*x + 4*x^2 + 4*x^3 + x^4 + x^5
integral(p1,c(0,1))
## [1] 2.333333
deriv(p2)
## 2 + 2*x + 3*x^2
poly.calc(c(-1,1))
## -1 + x^2
poly.calc(c(0,2,4), c(3,2,3))
## 3 - x + 0.25*x^2
solve(p2)
## [1] -1+0.000000i  0-1.414214i  0+1.414214i
LCM(p1,p2)
## 2 + 2*x + x^2 + x^3
GCD(p1,p2)
## 2 + x^2

4.7.4. Bases of orthogonal polynomials

library(orthopolynom)
Nmax <- 5
(wielomiany <- slegendre.polynomials(Nmax, normalized=TRUE))
## [[1]]
## 1 
## 
## [[2]]
## -1.732051 + 3.464102*x 
## 
## [[3]]
## 2.236068 - 13.41641*x + 13.41641*x^2 
## 
## [[4]]
## -2.645751 + 31.74902*x - 79.37254*x^2 + 52.91503*x^3 
## 
## [[5]]
## 3 - 60*x + 270*x^2 - 420*x^3 + 210*x^4 
## 
## [[6]]
## -3.316625 + 99.49874*x - 696.4912*x^2 + 1857.31*x^3 - 2089.474*x^4 +  
## 835.7894*x^5
wielomian4 <- as.function(wielomiany[[4]])
wielomian4(c(0, 0.5, 1))
## [1] -2.645751e+00  1.776357e-15  2.645751e+00
curve(wielomian4, 0, 1, lwd=3, lty=4, ylab="")

for (i in 1:Nmax) {
  wielomian <- as.function(wielomiany[[i]])
  curve(wielomian, 0, 1, add=TRUE, lwd=3, lty=i)
}

4.7.5. Finding the maximum/minimum/zeros of a function

f <- function(x, wyznacznik) {(x - 7)^wyznacznik - x}
optimize(f, interval=c(-1,10), wyznacznik = 2)
## $minimum
## [1] 7.5
## 
## $objective
## [1] -7.25
uniroot(f, interval=c(-1,10), wyznacznik = 2)
## $root
## [1] 4.807418
## 
## $f.root
## [1] -4.69523e-06
## 
## $iter
## [1] 8
## 
## $init.it
## [1] NA
## 
## $estim.prec
## [1] 6.103516e-05
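
One more note, not in the original: for wyznacznik = 2 the function (x - 7)^wyznacznik - x is a quadratic with two roots, and uniroot() returns only the root bracketed by the supplied interval; the second root lies above 10 and matches the closed-form solution.

uniroot(f, interval=c(10, 15), wyznacznik = 2)$root   # about 10.19
(15 + sqrt(29))/2                                     # analytic value of the same root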

4.7.6. Differential and integral calculus

D(expression(3*(x-2)^2-15), "x")
## 3 * (2 * (x - 2))
(wyraz <- deriv(~(x-2)^2+15,"x")) 
## expression({
##     .expr1 <- x - 2
##     .value <- .expr1^2 + 15
##     .grad <- array(0, c(length(.value), 1L), list(NULL, c("x")))
##     .grad[, "x"] <- 2 * .expr1
##     attr(.value, "gradient") <- .grad
##     .value
## })
x <- 1:3
eval(wyraz)
## [1] 16 15 16
## attr(,"gradient")
##       x
## [1,] -2
## [2,]  0
## [3,]  2
integrate(dnorm, 0, Inf)
## 0.5 with absolute error < 4.7e-05
integrate(function(x) sin(x)^2, 0, 600)
## 300.0221 with absolute error < 0.0087
# integrate(function(x) sin(x)^2, 0, Inf) # diverges: sin(x)^2 does not tend to 0, so the integral over (0, Inf) is infinite