Chapter 4.1.1

import scipy.stats as stats

u = stats.uniform.rvs(loc= 0, scale= 1, size= 10, random_state= 1313)
print(u)
## [ 0.99250241  0.03101313  0.85090992  0.98288816  0.20383509  0.49741385
##   0.27774076  0.88546349  0.65512953  0.83098086]

Chapter 4.1.2

import scipy.stats as stats

stats.uniform.rvs(loc= 0, scale= 1, size= 5)
stats.uniform.cdf(loc= 0, scale= 1, x= 0.5)
stats.uniform.pdf(loc= 0, scale= 1, x= 0.5)
stats.uniform.ppf(loc= 0, scale= 1, q= 0.1)
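
The four calls above return, in turn, a random sample, the value of the distribution function, the density, and the quantile function. As a quick sanity check (a minimal sketch), ppf is the inverse of cdf:

import scipy.stats as stats

print(stats.uniform.cdf(x= 0.75, loc= 0, scale= 1))
print(stats.uniform.ppf(q= 0.75, loc= 0, scale= 1))
## 0.75
## 0.75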
from scipy import *
import scipy.stats as stats
import matplotlib.pyplot as plt
import matplotlib.pylab as mpl

mpl.rcParams["font.size"] = 15

x = arange(-4, 4, 0.01)

fig = plt.figure(figsize=(8,6))
ax1 = fig.add_subplot(1, 1, 1)
ax2 = ax1.twinx()
ax1.plot(x,stats.norm.pdf(x), color='C0', linestyle='-', linewidth=3)
ax2.plot(x,stats.norm.cdf(x), color='C1', linestyle='--', linewidth=3)
ax1.set_ylabel('stats.norm.pdf(x)', color='C0')
ax2.set_ylabel('stats.norm.cdf(x)', color='C1')
ax1.set_xlabel('x')
plt.tight_layout()
plt.savefig('rys_4.1.png')

import scipy.stats as stats

print(stats.norm.cdf(x= -3)+(1-stats.norm.cdf(x= 3)))
## 0.00269979606326
import scipy.stats as stats

print(stats.norm.pdf(x= [-1,0,1]))
## [ 0.24197072  0.39894228  0.24197072]
import scipy.stats as stats

print(stats.norm.ppf(q= [0.001, 0.025, 0.05, 0.5, 0.95, 0.975, 0.999]))
## [-3.09023231 -1.95996398 -1.64485363  0.          1.64485363  1.95996398
##   3.09023231]
import scipy.stats as stats

print(stats.norm.rvs(loc= 2, scale= 1, size= 10))
## [ 2.15484945  1.38551231  2.24869944  3.07372318  2.15051313  2.65334518
##   1.5418569   0.72848281  0.83493084  2.468362  ]
from scipy import *
import scipy.stats as stats

print(stats.norm.rvs(loc= linspace(1,10,10), scale= linspace(1,10,10), size= 10))
## [  1.82127172   1.1863091    0.4459603   -0.75203839  -6.23573767
##   12.07365432   6.22924021  -7.25209158  20.29651775  -6.48766261]

Chapter 4.1.3.1

from scipy import *
import scipy.stats as stats

def rPoisProcess(T=1, lambd=1):
    # the number of events is Poisson(T*lambd); event times are uniform on [0, T]
    N = stats.poisson.rvs(size= 1, mu= T*lambd, random_state= 2305)
    return sorted(stats.uniform.rvs(size= N, loc= 0, scale= T, random_state= 4101))

print(rPoisProcess(T=10, lambd=0.2))
## [3.8504656015352578, 4.0567308020438064]
from scipy import *
import scipy.stats as stats
import pandas as pd

def rRandomField(T1=1, T2=1, lambd=1):
    # the number of points is Poisson(T1*T2*lambd); locations are uniform on [0,T1]x[0,T2]
    N = stats.poisson.rvs(size= 1, mu= T1*T2*lambd, random_state= 2305)
    d = {'x' : stats.uniform.rvs(size= N, loc= 0, scale= T1, random_state= 4101),\
         'y' : stats.uniform.rvs(size= N, loc= 0, scale= T2, random_state= 4026)}
    return pd.DataFrame(d)

print(rRandomField(T1=10, T2=10, lambd=0.02))
##           x         y
## 0  3.850466  2.046772
## 1  4.056731  6.943746

Chapter 4.1.3.2

from scipy import *
import scipy.linalg as la

# three ways of drawing from N(0, Sigma): transform the i.i.d. normal
# rows of X by a matrix square root of Sigma
n = 100; d = 100; N = 10000
random.seed(120)
Sigma = cov(1 * random.randn(n, d) + 0)
random.seed(122)
X = 1 * random.randn(n, d) + 0

# (1) Cholesky factorization: Sigma = Q'Q with Q upper triangular
Q = la.cholesky(Sigma)
y = dot(X,Q)

# (2) singular value decomposition: Sigma = u diag(s) v
u, s, v = la.svd(Sigma, full_matrices=True)
y = dot(X,dot(dot(u,diag(sqrt(s))),v))

# (3) spectral decomposition; eigh returns eigenvalues in ascending order,
# so reverse them together with the corresponding eigenvectors
val, vec = la.eigh(Sigma)
values = val[::-1]
vectors = fliplr(vec)
y = dot(X,dot(vectors,dot(diag(sqrt(values)),transpose(vectors))))
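
All three factorizations should reproduce Sigma. The matrices above are large and numerically borderline (with as many observations as variables the estimated Sigma is rank-deficient), so here is a minimal, self-contained sketch of the Cholesky approach with a small covariance matrix assumed just for illustration:

from scipy import *
import scipy.linalg as la

# a small, explicitly positive-definite covariance matrix (illustrative)
S = array([[2.0, 0.6], [0.6, 1.0]])
R = la.cholesky(S)              # upper triangular, R'R = S
random.seed(123)
Z = random.randn(100000, 2)     # i.i.d. standard normal rows
Y = dot(Z, R)                   # rows of Y have covariance approximately S
print(around(cov(Y, rowvar=False), 2))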

Chapter 4.1.3.3*

from scipy import *
from copulalib.copulalib import Copula 

x = random.normal(size=40)
y = x+random.normal(size=40)
frank = Copula(x,y,family='frank')
uf,vf = frank.generate_uv(1000)

clayton = Copula(x,y,family='clayton')
uc,vc = clayton.generate_uv(1000)

gumbel = Copula(x,y,family='gumbel')
ug,vg = gumbel.generate_uv(1000)

Chapter 4.1.4

from scipy import *
import scipy.stats as stats

wek = stats.lognorm.rvs(s= 1, loc= 0, scale= 1, size= 100, random_state= 2305)
fitN = stats.norm.fit(wek)
fitG = stats.gamma.fit(wek)

logLik = sum(stats.norm.logpdf(wek, loc= fitN[0], scale= fitN[1]))
print('Rozkład normalny:')
print('szukane parametry: loc= %.6f, scale= %.6f' % (fitN[0], fitN[1]))
print('logarytm wiarygodności: %.6f' % logLik, '\n')

logLik = sum(stats.gamma.logpdf(wek, a=fitG[0], loc=fitG[1], scale=fitG[2]))
print('Rozkład gamma:')
print('szukane parametry: shape= %.6f, loc= %.6f, scale= %.6f' % (fitG[0], fitG[1], fitG[2]))
print('logarytm wiarygodności: %.6f' % logLik)
## Rozkład normalny:
## szukane parametry: loc= 1.682725, scale= 2.958832
## logarytm wiarygodności: -250.373306 
## 
## Rozkład gamma:
## szukane parametry: shape= 0.101070, loc= 0.113678, scale= 1.568297
## logarytm wiarygodności: -274.995035

Chapter 4.2.1

import pandas as pd

df = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/hepatitis.csv")
print(df.shape)
## (155, 20)
import pandas as pd
from scipy import *

df = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/hepatitis.csv")
print(sum(df.isnull().values))
## 167
import pandas as pd
from scipy import *

df = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/hepatitis.csv")
print(pd.isnull(df).max(axis=1).sum())
## 75
import pandas as pd

df = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/hepatitis.csv")
print(df.dropna().shape)
## (80, 20)
import pandas as pd

df = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/hepatitis.csv")
print(df['V19'].describe())
## count     88.000000
## mean      61.852273
## std       22.875244
## min        0.000000
## 25%       46.000000
## 50%       61.000000
## 75%       76.250000
## max      100.000000
## Name: V19, dtype: float64
import pandas as pd
from scipy import *

df = pd.DataFrame()
df['wektor'] = [1, 5, nan, 2, 8, 8, 3, nan, 10]

liczba = df['wektor'].fillna(value=2.5)
df['liczba'] = liczba
print(df)
##    wektor  liczba
## 0     1.0     1.0
## 1     5.0     5.0
## 2     NaN     2.5
## 3     2.0     2.0
## 4     8.0     8.0
## 5     8.0     8.0
## 6     3.0     3.0
## 7     NaN     2.5
## 8    10.0    10.0
import pandas as pd
from scipy import *

df = pd.DataFrame()
df['wektor'] = [1, 5, nan, 2, 8, 8, 3, nan, 10]

srednia = df['wektor'].fillna(df['wektor'].mean())
df['srednia'] = srednia
print(df)
##    wektor    srednia
## 0     1.0   1.000000
## 1     5.0   5.000000
## 2     NaN   5.285714
## 3     2.0   2.000000
## 4     8.0   8.000000
## 5     8.0   8.000000
## 6     3.0   3.000000
## 7     NaN   5.285714
## 8    10.0  10.000000
import pandas as pd
from scipy import *

df = pd.DataFrame()
df['wektor'] = [1, 5, nan, 2, 8, 8, 3, nan, 10]

imput = df['wektor'].fillna(random.choice(df['wektor'].dropna()))
df['imput'] = imput
print(df)
##    wektor  imput
## 0     1.0    1.0
## 1     5.0    5.0
## 2     NaN    8.0
## 3     2.0    2.0
## 4     8.0    8.0
## 5     8.0    8.0
## 6     3.0    3.0
## 7     NaN    8.0
## 8    10.0   10.0
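
Note that random.choice above draws a single value, so both missing cells receive the same number. A sketch that instead draws independently for every missing entry (the column name imput2 is ours):

from scipy import *
import pandas as pd

df = pd.DataFrame()
df['wektor'] = [1, 5, nan, 2, 8, 8, 3, nan, 10]

df['imput2'] = df['wektor']
braki = df['imput2'].isnull()
# one independent draw from the observed values per missing cell
df.loc[braki, 'imput2'] = random.choice(df['wektor'].dropna().values, size=int(braki.sum()))
print(df)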
import pandas as pd
import statsmodels.api as sm
from statsmodels.imputation import mice

df = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/hepatitis.csv")
df = df[['V18','V6','V12']]

imp = mice.MICEData(df)
fml = 'V18 ~ V6 + V12'
model = mice.MICE(fml, sm.OLS, imp)
results = model.fit(3, 3)
print(results.summary())
##                         Results: MICE
## =============================================================
## Method:                  MICE      Sample size:          154 
## Model:                   OLS       Scale                 0.36
## Dependent variable:      V18       Num. imputations      3   
## -------------------------------------------------------------
##           Coef.  Std.Err.    t    P>|t|  [0.025 0.975]  FMI  
## -------------------------------------------------------------
## Intercept 2.8393   0.2038 13.9326 0.0000 2.4399 3.2387 0.1135
## V6        0.3442   0.1140  3.0186 0.0025 0.1207 0.5677 0.0735
## V12       0.3062   0.1124  2.7249 0.0064 0.0860 0.5264 0.0399
## =============================================================

A description of this method for the R package is available at: https://www.jstatsoft.org/article/view/v045i03

import pandas as pd
import statsmodels.formula.api as smf

df = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/hepatitis.csv")
df = df[['V18','V6','V12']]

results = smf.ols('V18 ~ V6 + V12', data=df, missing='drop').fit()
print(results.summary())
##                             OLS Regression Results                            
## ==============================================================================
## Dep. Variable:                    V18   R-squared:                       0.147
## Model:                            OLS   Adj. R-squared:                  0.134
## Method:                 Least Squares   F-statistic:                     11.36
## Date:                Sat, 28 Jan 2017   Prob (F-statistic):           2.80e-05
## Time:                        15:08:39   Log-Likelihood:                -121.14
## No. Observations:                 135   AIC:                             248.3
## Df Residuals:                     132   BIC:                             257.0
## Df Model:                           2                                         
## Covariance Type:            nonrobust                                         
## ==============================================================================
##                  coef    std err          t      P>|t|      [0.025      0.975]
## ------------------------------------------------------------------------------
## Intercept      2.8870      0.205     14.104      0.000       2.482       3.292
## V6             0.3069      0.117      2.630      0.010       0.076       0.538
## V12            0.3139      0.117      2.677      0.008       0.082       0.546
## ==============================================================================
## Omnibus:                        5.940   Durbin-Watson:                   2.135
## Prob(Omnibus):                  0.051   Jarque-Bera (JB):                9.116
## Skew:                          -0.094   Prob(JB):                       0.0105
## Kurtosis:                       4.259   Cond. No.                         10.5
## ==============================================================================
## 
## Warnings:
## [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Chapter 4.2.2*

from scipy import *
import pandas as pd

daneSoc = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/daneSoc.csv")

print(daneSoc.iloc[:,[0,5,6]].cov())
##                              wiek  cisnienie.skurczowe  cisnienie.rozkurczowe
## wiek                   191.742176            -6.016662              -8.941249
## cisnienie.skurczowe     -6.016662           246.903989              82.809186
## cisnienie.rozkurczowe   -8.941249            82.809186              60.324616
from scipy import *
import pandas as pd

daneSoc = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/daneSoc.csv")

print(daneSoc.iloc[:,[0,5,6]].apply(lambda x: (x-mean(x))/std(x,ddof=1)).cov())
##                            wiek  cisnienie.skurczowe  cisnienie.rozkurczowe
## wiek                   1.000000            -0.027652              -0.083137
## cisnienie.skurczowe   -0.027652             1.000000               0.678527
## cisnienie.rozkurczowe -0.083137             0.678527               1.000000
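
Standardizing the columns and taking the covariance is precisely the Pearson correlation, so the same matrix is available directly:

from scipy import *
import pandas as pd

daneSoc = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/daneSoc.csv")

print(daneSoc.iloc[:,[0,5,6]].corr())
##                            wiek  cisnienie.skurczowe  cisnienie.rozkurczowe
## wiek                   1.000000            -0.027652              -0.083137
## cisnienie.skurczowe   -0.027652             1.000000               0.678527
## cisnienie.rozkurczowe -0.083137             0.678527               1.000000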
from scipy import *
import pandas as pd

daneSoc = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/daneSoc.csv")

daneSoc.iloc[:,[0,5,6]].apply(lambda x: x/std(x,ddof=1))
from scipy import *
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

daneO = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/daneO.csv")

fig = plt.figure(figsize=(14,6))
ax1 = fig.add_subplot(1,2,1)
ax2 = fig.add_subplot(1,2,2)
sm.qqplot((daneO['VEGF'])**0.5,line='q',ax=ax1)
ax1.set_title('Transformacja pierwiastkowa - zmienna VEGF')
sm.qqplot(log(daneO['VEGF']),line='q',ax=ax2)
ax2.set_title('Transformacja logarytmiczna - zmienna VEGF')
fig.tight_layout()
fig.savefig('rys_4.2.png')

from scipy import *
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt

daneO = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/daneO.csv")

fig = plt.figure(figsize=(14,6))
ax1 = fig.add_subplot(1,2,1)
ax2 = fig.add_subplot(1,2,2)
prob = stats.probplot((daneO['VEGF'])**0.5, dist=stats.norm, plot=ax1)
ax1.set_title('Transformacja pierwiastkowa - zmienna VEGF')
prob = stats.probplot(log(daneO['VEGF']), dist=stats.norm, plot=ax2)
ax2.set_title('Transformacja logarytmiczna - zmienna VEGF')
fig.tight_layout()
fig.savefig('rys_4.3.png')

from scipy import *
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt

daneO = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/daneO.csv")

lmbdas = linspace(-2, 2)
llfVEGF = zeros(lmbdas.shape, dtype=float)
for ii, lmbda in enumerate(lmbdas):
    llfVEGF[ii] = stats.boxcox_llf(lmbda, daneO['VEGF'])

llfWiek = zeros(lmbdas.shape, dtype=float)
for ii, lmbda in enumerate(lmbdas):
    llfWiek[ii] = stats.boxcox_llf(lmbda, daneO['Wiek'])

fig = plt.figure(figsize=(14,6))
ax1 = fig.add_subplot(1,2,1)
ax2 = fig.add_subplot(1,2,2)
ax1.plot(lmbdas, llfVEGF)
ax1.plot(lmbdas[llfVEGF.argmax()],llfVEGF.max(), 'o',color='C1')
ax1.set_title('Zmienna VEGF')
ax1.set_xlabel('lmbda parameter')
ax1.set_ylabel('Box-Cox log-likelihood')
ax2.plot(lmbdas, llfWiek)
ax2.plot(lmbdas[llfWiek.argmax()],llfWiek.max(), 'o',color='C1')
ax2.set_title('Zmienna Wiek')
ax2.set_xlabel('lmbda parameter')
ax2.set_ylabel('Box-Cox log-likelihood')
fig.tight_layout()
fig.savefig('rys_4.4.png')
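
Instead of scanning a grid of lambdas, the optimal value can be obtained directly: with the default lmbda=None, stats.boxcox maximizes the log-likelihood itself and returns the transformed data together with the fitted lambda (a sketch; like the log transform above, it assumes all values are positive):

from scipy import *
from scipy import stats
import pandas as pd

daneO = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/daneO.csv")

# transformed data and the lambda maximizing the Box-Cox log-likelihood
transVEGF, lmbdaVEGF = stats.boxcox(daneO['VEGF'])
print('optimal lambda for VEGF: %.6f' % lmbdaVEGF)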

from scipy import *
import pandas as pd

zmOryginalna = array([0.1, 0.8, 0.5, 0.2, 0.9, 0.7])
zmZmieniona = pd.cut(zmOryginalna,[0,0.33,0.66,1],labels=['niski','sredni','wysoki'],precision=0)
print(zmZmieniona)
## [niski, wysoki, sredni, niski, wysoki, wysoki]
## Categories (3, object): [niski < sredni < wysoki]
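
pd.cut splits the range into equal-width intervals; its quantile-based counterpart pd.qcut produces groups of (roughly) equal size instead (a sketch):

from scipy import *
import pandas as pd

zmOryginalna = array([0.1, 0.8, 0.5, 0.2, 0.9, 0.7])
# terciles of the data, so each label covers about a third of the observations
print(pd.qcut(zmOryginalna, 3, labels=['niski','sredni','wysoki']))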
from scipy import *
import pandas as pd

daneO = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/daneO.csv")

tDaneO = daneO.copy()
tDaneO['logVEGF'] = log(tDaneO['VEGF'])
tDaneO["fNowotwor"] = daneO["Nowotwor"].astype('category')
tDaneO['cutWiek'] = pd.cut(daneO['Wiek'],4,precision=0)
print(tDaneO.head(4))
##    Wiek  Rozmiar.guza  Wezly.chlonne  Nowotwor Receptory.estrogenowe  \
## 0    29             1              0       2.0                   (-)   
## 1    29             1              0       2.0                  (++)   
## 2    30             1              1       2.0                   (-)   
## 3    32             1              0       3.0                  (++)   
## 
##   Receptory.progesteronowe Niepowodzenia  Okres.bez.wznowy  VEGF   logVEGF  \
## 0                     (++)          brak              22.0   914  6.817831   
## 1                     (++)          brak              53.0  1118  7.019297   
## 2                      (+)          brak              38.0   630  6.445720   
## 3                     (++)          brak              26.0  1793  7.491645   
## 
##   fNowotwor   cutWiek  
## 0       2.0  (29, 36]  
## 1       2.0  (29, 36]  
## 2       2.0  (29, 36]  
## 3       3.0  (29, 36]

Chapter 4.3.2

from scipy import *
import pandas as pd

mieszkania = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/mieszkania.csv")
print(mieszkania.describe(include='all'))
##                  cena       pokoi  powierzchnia dzielnica typ.budynku
## count      200.000000  200.000000    200.000000       200         200
## unique            NaN         NaN           NaN         3           3
## top               NaN         NaN           NaN    Krzyki   wiezowiec
## freq              NaN         NaN           NaN        79          76
## mean    175934.039900    2.550000     46.197000       NaN         NaN
## std      43078.651064    1.083221     20.079142       NaN         NaN
## min      83279.870000    1.000000     17.000000       NaN         NaN
## 25%     143304.052500    2.000000     31.150000       NaN         NaN
## 50%     174935.040000    3.000000     43.700000       NaN         NaN
## 75%     208740.710000    3.000000     61.400000       NaN         NaN
## max     295761.800000    4.000000     87.700000       NaN         NaN
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

mieszkania = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/mieszkania.csv")

a1 = ols('cena ~ dzielnica',
              data=mieszkania).fit()
                
aov_table = sm.stats.anova_lm(a1, typ=2)
esq_sm = aov_table['sum_sq'].iloc[0]/(aov_table['sum_sq'].iloc[0]+aov_table['sum_sq'].iloc[1])
print(aov_table)
print('\n')
print('Eta = %.6f' % esq_sm)
print('p-value = %.6f' % aov_table['PR(>F)'].iloc[0])
##                  sum_sq     df         F    PR(>F)
## dzielnica  1.799538e+10    2.0  5.045633  0.007294
## Residual   3.513029e+11  197.0       NaN       NaN
## 
## 
## Eta = 0.048729
## p-value = 0.007294
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

mieszkania = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/mieszkania.csv")
mieszkania = mieszkania.rename(columns={"typ.budynku": "typ_budynku"})

a2 = ols('cena ~ typ_budynku',
              data=mieszkania).fit()
                
aov_table = sm.stats.anova_lm(a2, typ=2)
esq_sm = aov_table['sum_sq'].iloc[0]/(aov_table['sum_sq'].iloc[0]+aov_table['sum_sq'].iloc[1])
print(aov_table)
print('\n')
print('Eta = %.6f' % esq_sm)
print('p-value = %.6f' % aov_table['PR(>F)'].iloc[0])
##                    sum_sq     df         F    PR(>F)
## typ_budynku  2.277045e+10    2.0  6.472465  0.001895
## Residual     3.465278e+11  197.0       NaN       NaN
## 
## 
## Eta = 0.061659
## p-value = 0.001895
import pandas as pd
from statsmodels.stats.multicomp import *

mieszkania = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/mieszkania.csv")

res2 = pairwise_tukeyhsd(mieszkania['cena'], mieszkania['dzielnica'])
print(res2)
##       Multiple Comparison of Means - Tukey HSD,FWER=0.05      
## ==============================================================
##  group1     group2     meandiff     lower      upper    reject
## --------------------------------------------------------------
## Biskupin    Krzyki   -21321.0188 -38022.2368 -4619.8008  True 
## Biskupin Srodmiescie -18350.5407 -36534.1091 -166.9722   True 
##  Krzyki  Srodmiescie  2970.4781  -14451.4593 20392.4155 False 
## --------------------------------------------------------------
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

mieszkania = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/mieszkania.csv")

fig = plt.figure(figsize=(14,6))
ax1 = fig.add_subplot(1,2,1)
ax2 = fig.add_subplot(1,2,2)
sns.boxplot(x="cena", y="dzielnica", data=mieszkania, color="C0", ax=ax1, saturation=0.2)
sns.barplot(x="cena", y="dzielnica", data=mieszkania, color="C0", ax=ax2, saturation=0.2)
plt.tight_layout()
plt.savefig('rys_4.5.png')

Chapter 4.3.2.1

from patsy.contrasts import Treatment

contrast = Treatment(reference=0).code_without_intercept([1,2,3,4,5])
print(contrast.matrix.T)
## [[ 0.  1.  0.  0.  0.]
##  [ 0.  0.  1.  0.  0.]
##  [ 0.  0.  0.  1.  0.]
##  [ 0.  0.  0.  0.  1.]]
from patsy.contrasts import Helmert

contrast = Helmert().code_without_intercept([1,2,3,4,5])
print(contrast.matrix.T)
## [[-1.  1.  0.  0.  0.]
##  [-1. -1.  2.  0.  0.]
##  [-1. -1. -1.  3.  0.]
##  [-1. -1. -1. -1.  4.]]
from statsmodels.formula.api import ols
from patsy.contrasts import Helmert
import pandas as pd

mieszkania = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/mieszkania.csv")

contrast = Helmert().code_without_intercept([1,2,3])
model = ols("cena ~ C(dzielnica, contrast)", data=mieszkania).fit()
print(model.summary())
##                             OLS Regression Results                            
## ==============================================================================
## Dep. Variable:                   cena   R-squared:                       0.049
## Model:                            OLS   Adj. R-squared:                  0.039
## Method:                 Least Squares   F-statistic:                     5.046
## Date:                Sat, 28 Jan 2017   Prob (F-statistic):            0.00729
## Time:                        15:08:47   Log-Likelihood:                -2412.4
## No. Observations:                 200   AIC:                             4831.
## Df Residuals:                     197   BIC:                             4841.
## Df Model:                           2                                         
## Covariance Type:            nonrobust                                         
## ===============================================================================================
##                                   coef    std err          t      P>|t|      [0.025      0.975]
## -----------------------------------------------------------------------------------------------
## Intercept                    1.763e+05   3015.732     58.450      0.000     1.7e+05    1.82e+05
## C(dzielnica, contrast)[H.2] -1.066e+04   3535.809     -3.015      0.003   -1.76e+04   -3687.615
## C(dzielnica, contrast)[H.3] -2563.3438   2219.758     -1.155      0.250   -6940.882    1814.195
## ==============================================================================
## Omnibus:                       10.736   Durbin-Watson:                   2.157
## Prob(Omnibus):                  0.005   Jarque-Bera (JB):                5.258
## Skew:                           0.160   Prob(JB):                       0.0721
## Kurtosis:                       2.273   Cond. No.                         1.63
## ==============================================================================
## 
## Warnings:
## [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
from statsmodels.formula.api import ols
import pandas as pd

mieszkania = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/mieszkania.csv")

contrast = [[2,0], [-1,1],[-1,-1]]
model = ols("cena ~ C(dzielnica, contrast)", data=mieszkania).fit()
print(model.summary())
##                             OLS Regression Results                            
## ==============================================================================
## Dep. Variable:                   cena   R-squared:                       0.049
## Model:                            OLS   Adj. R-squared:                  0.039
## Method:                 Least Squares   F-statistic:                     5.046
## Date:                Sat, 28 Jan 2017   Prob (F-statistic):            0.00729
## Time:                        15:08:47   Log-Likelihood:                -2412.4
## No. Observations:                 200   AIC:                             4831.
## Df Residuals:                     197   BIC:                             4841.
## Df Model:                           2                                         
## Covariance Type:            nonrobust                                         
## ===================================================================================================
##                                       coef    std err          t      P>|t|      [0.025      0.975]
## ---------------------------------------------------------------------------------------------------
## Intercept                        1.763e+05   3015.732     58.450      0.000     1.7e+05    1.82e+05
## C(dzielnica, contrast)[custom0]  6611.9266   2135.391      3.096      0.002    2400.767    1.08e+04
## C(dzielnica, contrast)[custom1] -1485.2390   3688.392     -0.403      0.688   -8759.040    5788.562
## ==============================================================================
## Omnibus:                       10.736   Durbin-Watson:                   2.157
## Prob(Omnibus):                  0.005   Jarque-Bera (JB):                5.258
## Skew:                           0.160   Prob(JB):                       0.0721
## Kurtosis:                       2.273   Cond. No.                         1.77
## ==============================================================================
## 
## Warnings:
## [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Chapter 4.3.3.1

import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

mieszkania = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/mieszkania.csv")
mieszkania = mieszkania.rename(columns={"typ.budynku": "typ_budynku"})

a3 = ols('cena ~ dzielnica + typ_budynku',
              data=mieszkania).fit()
                
aov_table = sm.stats.anova_lm(a3, typ=2)
print(aov_table)
print('\n')
print('dzielnica: p-value = %.6f, typ_budynku: p-value = %.6f' % \
      (aov_table['PR(>F)'].iloc[0], aov_table['PR(>F)'].iloc[1]))
##                    sum_sq     df         F    PR(>F)
## dzielnica    1.794386e+10    2.0  5.324443  0.005605
## typ_budynku  2.271893e+10    2.0  6.741338  0.001476
## Residual     3.285840e+11  195.0       NaN       NaN
## 
## 
## dzielnica: p-value = 0.005605, typ_budynku: p-value = 0.001476

Chapter 4.3.3.2

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.graphics.factorplots import interaction_plot

mieszkania = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/mieszkania.csv")
mieszkania = mieszkania.rename(columns={"typ.budynku": "typ_budynku"})

fig = plt.figure(figsize=(8,6))
ax1 = fig.add_subplot(1,1,1)
interaction_plot(mieszkania['dzielnica'], mieszkania['typ_budynku'] ,mieszkania['cena'],\
                 colors=['red','blue','green'], ax=ax1)
plt.savefig('rys_4.6.png')

import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

mieszkania = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/mieszkania.csv")
mieszkania = mieszkania.rename(columns={"typ.budynku": "typ_budynku"})

# three equivalent ways of specifying the model with interaction (the last fit is used)
a4 = ols('cena ~ dzielnica * typ_budynku', data=mieszkania).fit()
a4 = ols('cena ~ (dzielnica + typ_budynku)**2', data=mieszkania).fit()
a4 = ols('cena ~ dzielnica + typ_budynku + dzielnica:typ_budynku', data=mieszkania).fit()
                
aov_table = sm.stats.anova_lm(a4, typ=2)
print(aov_table)
##                              sum_sq     df         F    PR(>F)
## dzielnica              1.794386e+10    2.0  5.231069  0.006141
## typ_budynku            2.271893e+10    2.0  6.623115  0.001656
## dzielnica:typ_budynku  9.952797e+08    4.0  0.145074  0.964995
## Residual               3.275887e+11  191.0       NaN       NaN

Chapter 4.3.4.1*

import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

mieszkania = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/mieszkania.csv")
mieszkania = mieszkania.rename(columns={"typ.budynku": "typ_budynku"})

modelPP = ols('cena ~ powierzchnia + pokoi', data=mieszkania).fit()
print(modelPP.params)
print('\n')
print('R2: %.6f, R2adj: %.6f' % (modelPP.rsquared ,modelPP.rsquared_adj))
print('\n')
print(modelPP.summary2())
## Intercept       82407.088281
## powierzchnia     2070.896564
## pokoi            -840.100768
## dtype: float64
## 
## 
## R2: 0.893723, R2adj: 0.892644
## 
## 
##                    Results: Ordinary least squares
## ======================================================================
## Model:               OLS               Adj. R-squared:      0.893     
## Dependent Variable:  cena              AIC:                 4392.5453 
## Date:                2017-01-28 15:08  BIC:                 4402.4403 
## No. Observations:    200               Log-Likelihood:      -2193.3   
## Df Model:            2                 F-statistic:         828.3     
## Df Residuals:        197               Prob (F-statistic):  1.27e-96  
## R-squared:           0.894             Scale:               1.9923e+08
## ----------------------------------------------------------------------
##                Coef.     Std.Err.    t    P>|t|    [0.025     0.975]  
## ----------------------------------------------------------------------
## Intercept    82407.0883 2569.9057 32.0662 0.0000 77339.0312 87475.1454
## powierzchnia  2070.8966  149.1705 13.8828 0.0000  1776.7206  2365.0725
## pokoi         -840.1008 2765.1018 -0.3038 0.7616 -6293.1001  4612.8986
## ----------------------------------------------------------------------
## Omnibus:                 0.368         Durbin-Watson:            1.764
## Prob(Omnibus):           0.832         Jarque-Bera (JB):         0.425
## Skew:                    0.101         Prob(JB):                 0.809
## Kurtosis:                2.898         Condition No.:            150  
## ======================================================================
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

mieszkania = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/mieszkania.csv")
mieszkania = mieszkania.rename(columns={"typ.budynku": "typ_budynku"})

modelPP = ols('cena ~ powierzchnia + pokoi + dzielnica', data=mieszkania).fit()
print(modelPP.summary2())
##                            Results: Ordinary least squares
## ======================================================================================
## Model:                     OLS                    Adj. R-squared:           0.935     
## Dependent Variable:        cena                   AIC:                      4294.6332 
## Date:                      2017-01-28 15:08       BIC:                      4311.1248 
## No. Observations:          200                    Log-Likelihood:           -2142.3   
## Df Model:                  4                      F-statistic:              714.8     
## Df Residuals:              195                    Prob (F-statistic):       2.93e-115 
## R-squared:                 0.936                  Scale:                    1.2092e+08
## --------------------------------------------------------------------------------------
##                             Coef.     Std.Err.    t     P>|t|     [0.025      0.975]  
## --------------------------------------------------------------------------------------
## Intercept                 94222.0155 2320.3617  40.6066 0.0000  89645.7889  98798.2422
## dzielnica[T.Krzyki]      -20934.8600 1842.7863 -11.3604 0.0000 -24569.2107 -17300.5094
## dzielnica[T.Srodmiescie] -12722.6045 2008.0264  -6.3359 0.0000 -16682.8422  -8762.3667
## powierzchnia               2022.9886  116.3125  17.3927 0.0000   1793.5967   2252.3806
## pokoi                        34.3596 2157.5248   0.0159 0.9873  -4220.7195   4289.4386
## --------------------------------------------------------------------------------------
## Omnibus:                     1.237               Durbin-Watson:                  2.067
## Prob(Omnibus):               0.539               Jarque-Bera (JB):               0.955
## Skew:                        0.156               Prob(JB):                       0.620
## Kurtosis:                    3.131               Condition No.:                  188  
## ======================================================================================

The fitted values and residuals are available as model attributes: modelPP.fittedvalues, modelPP.resid and modelPP.resid_pearson.

from scipy import *
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.formula.api import ols
from statsmodels.graphics.regressionplots import *

mieszkania = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/mieszkania.csv")
mieszkania = mieszkania.rename(columns={"typ.budynku": "typ_budynku"})

modelPP = ols('cena ~ powierzchnia + pokoi + dzielnica', data=mieszkania).fit()

fig = plt.figure(figsize=(14,12))
ax1 = fig.add_subplot(2,2,1)
ax2 = fig.add_subplot(2,2,2)
ax3 = fig.add_subplot(2,2,3)
ax4 = fig.add_subplot(2,2,4)
plot_leverage_resid2(modelPP, ax=ax1)
influence_plot(modelPP, size=12, ax=ax2)
ax3.plot(arange(1,len(modelPP.resid)+1),modelPP.resid)
ax3.set_title('Reszty modelu')
ax3.set_xlabel('numer')
ax3.set_ylabel('reszty')
ax4.plot(modelPP.fittedvalues,modelPP.resid,'o')
ax4.set_title('Reszty vs wartości wyrównane modelu')
ax4.set_xlabel('wyrównane')
ax4.set_ylabel('reszty')
fig.tight_layout()
fig.savefig('rys_4.7.png')

import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

mieszkania = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/mieszkania.csv")
mieszkania = mieszkania.rename(columns={"typ.budynku": "typ_budynku"})

modelDP = ols('cena ~ powierzchnia + dzielnica', data=mieszkania).fit()
ndane = pd.DataFrame()
ndane['powierzchnia'] = [48,68]
ndane['dzielnica'] = ['Biskupin','Krzyki']
print(modelDP.predict(ndane))
## 0    191415.933843
## 1    210976.884413
## dtype: float64
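
If interval estimates are needed rather than point predictions, newer statsmodels versions expose get_prediction, which returns confidence and prediction intervals for new data (a sketch, assuming a statsmodels version that provides it):

import pandas as pd
from statsmodels.formula.api import ols

mieszkania = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/mieszkania.csv")

modelDP = ols('cena ~ powierzchnia + dzielnica', data=mieszkania).fit()
ndane = pd.DataFrame({'powierzchnia': [48, 68], 'dzielnica': ['Biskupin', 'Krzyki']})
pred = modelDP.get_prediction(ndane)
# mean_ci_* are confidence intervals, obs_ci_* are prediction intervals
print(pred.summary_frame(alpha=0.05))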

Chapter 4.3.4.3

import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import variance_inflation_factor

mieszkania = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/mieszkania.csv")

# cena appears on the right-hand side only so that the design matrix contains
# all three variables of interest for the VIF computation
rlt = ols('cena ~ powierzchnia + pokoi + cena', data=mieszkania).fit()
VIF = pd.DataFrame([variance_inflation_factor(rlt.model.exog, i)
                    for i in range(1, rlt.model.exog.shape[1])],
                   index=rlt.model.exog_names[1:], columns=['VIF'])
print(VIF)
##                     VIF
## powierzchnia  17.727848
## pokoi          8.965221
## cena           9.409346

Chapter 4.3.5*

from scipy import *
import pandas as pd
import statsmodels.api as sm

df = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/daneO.csv")

df['Niepowodzenia'] = df.Niepowodzenia.replace(dict(brak=0, wznowa=1))
df['logVEGF'] = log(df['VEGF'])
df['const'] = 1
y_name = 'Niepowodzenia'
x_name = ['const', 'Nowotwor', 'logVEGF']
modelNV = sm.GLM(df[y_name], df[x_name],missing='drop',family=sm.families.Binomial()).fit()
print(modelNV.summary())
##                  Generalized Linear Model Regression Results                  
## ==============================================================================
## Dep. Variable:          Niepowodzenia   No. Observations:                   86
## Model:                            GLM   Df Residuals:                       83
## Model Family:                Binomial   Df Model:                            2
## Link Function:                  logit   Scale:                             1.0
## Method:                          IRLS   Log-Likelihood:                -23.576
## Date:                Sat, 28 Jan 2017   Deviance:                       47.152
## Time:                        15:08:53   Pearson chi2:                     65.9
## No. Iterations:                     6                                         
## ==============================================================================
##                  coef    std err          z      P>|z|      [0.025      0.975]
## ------------------------------------------------------------------------------
## const        -17.3914      4.376     -3.975      0.000     -25.968      -8.815
## Nowotwor       2.2586      0.766      2.950      0.003       0.758       3.759
## logVEGF        1.3293      0.425      3.128      0.002       0.496       2.162
## ==============================================================================
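
The same model can be fitted through the formula interface, which adds the intercept and handles missing data for us; the coefficients should match the summary above (a minimal sketch):

from scipy import *
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

df = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/daneO.csv")
df['Niepowodzenia'] = df.Niepowodzenia.replace(dict(brak=0, wznowa=1))
df['logVEGF'] = log(df['VEGF'])

modelNV = smf.glm('Niepowodzenia ~ Nowotwor + logVEGF', data=df,
                  family=sm.families.Binomial(), missing='drop').fit()
print(modelNV.params)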
from scipy import *
import pandas as pd
import statsmodels.api as sm

df = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/daneO.csv")

df['Niepowodzenia'] = df.Niepowodzenia.replace(dict(brak=0, wznowa=1))
df['logVEGF'] = log(df['VEGF'])
df['const'] = 1
y_name = 'Niepowodzenia'
x_name = ['const', 'Nowotwor']
modelN = sm.GLM(df[y_name], df[x_name],missing='drop',family=sm.families.Binomial()).fit()
daneO = pd.DataFrame()
daneO['const'] = [1,1,1]
daneO['Nowotwor'] = [1,2,3]
print(modelN.predict(daneO))
## 0    0.011259
## 1    0.072498
## 2    0.349185
## dtype: float64

Chapter 4.3.5.2

from scipy import *
from scipy.optimize import *
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/tab01.csv")
d = df.dropna()

def f(x,a,b):
    return x**a-b

sol, pcov = curve_fit(f, d['x'], d['y'], p0=(2, 2))
print ("a =", sol[0], "+/-", pcov[0,0]**0.5, \
" " , round((pcov[0,0]**0.5/sol[0])*100,3),"%")
print ("b =", sol[1], "+/-", pcov[1,1]**0.5, \
" " , round((pcov[1,1]**0.5/sol[1])*100,3),"%" '\n')
df = len(d['x']) - len(sol)
chi_squared = sum(((f(d['x'], *sol) - d['y']) / 1)**2)
reduced_chi_squared = chi_squared / df
print ('degrees of freedom: ', df)
print ('chi-square value: ', ("%.3f" % chi_squared))
print ('reduced chi-square value: ', ("%.3f" % reduced_chi_squared))

fig = plt.figure(figsize=(8,6))
ax1 = fig.add_subplot(1,1,1)
ax1.plot(d['x'],d['y'],'o')
X = arange(d['x'].min(),d['x'].max(),0.1)
Y = f(X,*sol)
ax1.plot(X, Y, linewidth=2, color="orange")
ax1.plot(X, X**2.5-5, '--', linewidth=2, color="C2")
ax1.set_title('$y = x^{2.5}-5+\\varepsilon$')
ax1.set_xlabel('x')
ax1.set_ylabel('y')
fig.tight_layout()
fig.savefig('rys_4.8.png')
## a = 2.51425786701 +/- 0.0234084769392   0.931 %
## b = 4.69439487063 +/- 0.612348759614   13.044 %
## 
## degrees of freedom:  40
## chi-square value:  414.627
## reduced chi-square value:  10.366

Chapter 4.4.1.1

from scipy import *
from scipy.stats import *
from skgof import cvm_test

x = norm(10, 1.4).rvs(random_state=1, size=100)
print(cvm_test(x, norm(10, 1.4)))
print('\nWyciągamy p-wartość: pwynik = %.6f' % cvm_test(x, norm(10, 1.4)).pvalue)
## GofResult(statistic=0.10543122652629035, pvalue=0.55976362337633079)
## 
## Wyciągamy p-wartość: pwynik = 0.559764
from scipy import *
import scipy.stats as stats
import statsmodels.api as sm
import matplotlib.pyplot as plt

y = random.normal(size= 100)
fig = plt.figure(figsize=(14,6))
ax1 = fig.add_subplot(1,2,1)
ax2 = fig.add_subplot(1,2,2)
sm.qqplot(y,line='q',ax=ax1)
ax1.set_title('Normal Q-Q Plot')
sm.qqplot(y,line='q',dist=stats.expon,ax=ax2)
ax2.set_title('Exponential Q-Q Plot')
fig.tight_layout()
fig.savefig('rys_4.9.png')

Chapter 4.4.1.2

from scipy import *
import pandas as pd
import scipy.stats as stats

random.seed(4101)
x = random.uniform(size = 100)

df = pd.DataFrame()
df['x'] = x
df['cutx'] = pd.cut(df['x'],5,precision=0)
df = df.groupby(['cutx'], as_index=False)['x'].count().rename(columns={'x': 'zm'})
print(df,'\n')
chi2, p = stats.chisquare(df['zm'])
print('X-squared = %.6f, df = %.0f, p-value = %.6f' % (chi2,df.shape[0]-1,p))
##            cutx  zm
## 0  (0.002, 0.2]  22
## 1    (0.2, 0.4]  20
## 2    (0.4, 0.6]  13
## 3    (0.6, 0.8]  28
## 4      (0.8, 1]  17 
## 
## X-squared = 6.300000, df = 4, p-value = 0.177836
from scipy import *
import pandas as pd
import scipy.stats as stats

random.seed(4026)
x = random.beta(a=2, b=3, size = 100)

df = pd.DataFrame()
df['x'] = x
df['cutx'] = pd.cut(df['x'],5,precision=0)
df = df.groupby(['cutx'], as_index=False)['x'].count().rename(columns={'x': 'zm'})
print(df,'\n')
chi2, p = stats.chisquare(df['zm'])
print('X-squared = %.6f, df = %.0f, p-value = %.6f' % (chi2,df.shape[0]-1,p))
##           cutx  zm
## 0  (0.03, 0.2]  24
## 1   (0.2, 0.4]  31
## 2   (0.4, 0.5]  16
## 3   (0.5, 0.7]  20
## 4   (0.7, 0.9]   9 
## 
## X-squared = 13.700000, df = 4, p-value = 0.008317
from scipy import *
import scipy.stats as stats

random.seed(2305)
x = random.uniform(size = 100)

chi2, p = stats.kstest(x,'uniform')
print('D = %.6f, p-value = %.6f' % (chi2,p))
## D = 0.063369, p-value = 0.816786
from scipy import *
import scipy.stats as stats

random.seed(2305)
x = random.uniform(size = 100)

chi2, p = stats.kstest(x,'norm', args=(1,1))
print('D = %.6f, p-value = %.6f' % (chi2,p))
## D = 0.501903, p-value = 0.000000

Chapter 4.4.1.3

from scipy import *
import scipy.stats as stats

random.seed(2305)
x = random.uniform(size = 100)
y = random.normal(size = 100)

print(stats.ks_2samp(x,y))
## Ks_2sampResult(statistic=0.47999999999999998, pvalue=8.0827828929192009e-11)
from scipy import *
import matplotlib.pyplot as plt

x = random.exponential(size=100)
y = random.normal(size=100)

fig = plt.figure(figsize=(8,6))
ax1 = fig.add_subplot(1, 1, 1)
ax1.plot(sort(x), sort(y), 'o')
plt.tight_layout()
plt.savefig('rys_4.10.png')

Chapter 4.4.2

from scipy import *
import scipy.stats as stats

random.seed(2305)
x = random.normal(size=50)
y = x + 0.3 + random.normal(loc=0,scale=0.01,size=50)
print(stats.ttest_1samp(y, 0.0))
print('mean = %.6f' % mean(y))
## Ttest_1sampResult(statistic=2.1054568276626191, pvalue=0.040400430666069305)
## mean = 0.301538
from scipy import *
import scipy.stats as stats

random.seed(2305)
x = random.normal(size=50)
y = x + 0.3 + random.normal(loc=0,scale=0.01,size=50)
print(stats.ttest_rel(x,y))
## Ttest_relResult(statistic=-226.90837947553518, pvalue=1.0419125460882572e-75)
from scipy import *
import scipy.stats as stats

random.seed(2305)
x = random.normal(size=50)
y = x + 0.3 + random.normal(loc=0,scale=0.01,size=50)
print(stats.wilcoxon(y))
print(stats.ranksums(x,y))
print(stats.mannwhitneyu(x,y))
## WilcoxonResult(statistic=432.0, pvalue=0.047283854040010578)
## RanksumsResult(statistic=-1.5648971117287647, pvalue=0.11760703603911446)
## MannwhitneyuResult(statistic=1023.0, pvalue=0.05920878241374343)

Chapter 4.4.3

from scipy import *
import scipy.stats as stats

random.seed(2305)
x = random.normal(size=20,loc=2,scale=1)
y = random.normal(size=20,loc=2,scale=2)
z = random.normal(size=20,loc=2,scale=3)
F = var(x,ddof=1) / var(y,ddof=1)
df1 = len(x) - 1; df2 = len(y) - 1
pF = stats.f.cdf(F, df1, df2)

print('F = %.6f, df1 = %.0f, df2 = %.0f, p-value = %.6f' % (F,df1,df2,pF))
print('Z = %.6f, p-value = %.6f' % (stats.mood(x,y)[0],stats.mood(x,y)[1]))
print(stats.ansari(x,y))
# unlike R's bartlett.test(values, groups), the scipy variance tests take
# each group's sample as a separate argument
print(stats.bartlett(x, y, z))
print(stats.fligner(x, y, z))
print(stats.levene(x, y, z))
## F = 0.082671, df1 = 19, df2 = 19, p-value = 0.000001
## Z = -3.288482, p-value = 0.001007
## AnsariResult(statistic=267.0, pvalue=0.0016590662842894473)

Chapter 4.4.4

from statsmodels.stats.proportion import *

wtestu = proportions_chisquare(594, 1234, value=0.5)
print('Test chi-square (without continuity correction) = %.6f, p-value = %.6f' % (wtestu[0],wtestu[1]))
## Test chi-square (without continuity correction) = 1.714749, p-value = 0.190370
from statsmodels.stats.proportion import *

print(proportion_confint(594,1234,method='beta'))
## (0.4531511917599369, 0.50966066183703784)
from scipy import *

x = [11,120,1300,14000,150000]
n = [100,1000,10000,100000,1000000]

print([ x[i]/n[i] for i in range(5) ])
## [0.11, 0.12, 0.13, 0.14, 0.15]
from scipy import *
from statsmodels.stats.proportion import *

x = [11,120,1300,14000,150000]
n = [100,1000,10000,100000,1000000]

print(proportions_chisquare_allpairs(array(x),array(n),multitest_method='b'))
print('\np-value correction without continuity correction')
## Corrected p-values using Bonferroni p-value correction
## 
## Pairs  p-values
## (0, 1)       1
## (0, 2)       1
## (0, 3)       1
## (0, 4)       1
## (1, 2)       1
## (1, 3)  0.6956
## (1, 4)  0.07914
## (2, 3)  0.05863
## (2, 4)  2.456e-07
## (3, 4)  2.557e-16
## 
## p-value correction without continuity correction

Chapter 4.4.5.1

from scipy import *
import scipy.stats as stats

random.seed(227)
x = random.lognormal(size=50)
y = add(x, random.lognormal(size=50))

sol, pval = stats.pearsonr(x,y)
res, pwal = stats.spearmanr(x,y)
print('Correlation Pearson = %.6f, p-value = %.6f' % (sol,pval))
print('Correlation Spearman = %.6f, p-value = %.6f' % (res,pwal))
## Correlation Pearson = 0.560403, p-value = 0.000023
## Correlation Spearman = 0.587899, p-value = 0.000007

Chapter 4.4.5.2

from scipy import *
import scipy.stats as stats

def get_z(rho):
    return log((1+rho)/(1-rho))/2

def rho0_test(rho, n, rho0=0, alternative="two.sided"):
    p_raw = stats.norm.cdf((get_z(rho)-get_z(rho0))/sqrt(1/(n-3)))
    if   alternative=="two.sided": return 1-2*abs(0.5-p_raw)
    elif alternative=="less": return p_raw
    elif alternative=="greater": return 1-p_raw

print(rho0_test(rho=0.23,n=60))
## 0.0770455747032
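
The helper get_z is Fisher's z-transform, z(rho) = log((1+rho)/(1-rho))/2. Under the null hypothesis rho = rho0 the statistic (z(r) - z(rho0)) * sqrt(n-3) is approximately standard normal, which is exactly what rho0_test evaluates before converting the tail area into a one- or two-sided p-value.
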
from scipy import *
import scipy.stats as stats

def get_z(rho):
    return log((1+rho)/(1-rho))/2

def rho_rho_test(rho1, n1, rho2, n2, alternative="two.sided"):
    p_raw = stats.norm.cdf((get_z(rho1)-get_z(rho2))/sqrt(1/(n1-3)+1/(n2-3)))
    if   alternative=="two.sided": return 1-2*abs(0.5-p_raw)
    elif alternative=="less": return p_raw
    elif alternative=="greater": return 1-p_raw

print(rho_rho_test(rho1=0.23,n1=60,rho2=0.33,n2=60))
## 0.5619332178

Chapter 4.4.5.3

import pandas as pd
import scipy.stats as stats

df = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/daneSoc.csv")

plec = df['plec'].values
praca = df['praca'].values
print(pd.crosstab(plec,praca))
s, p, dof, expctd = stats.chi2_contingency(pd.crosstab(plec,praca))
print('\nX-squared = %.6f, df = %.0f, p-value = %.6f\n' % (s, dof, p))
print(expctd)
oddsratio, pvalue = stats.fisher_exact(pd.crosstab(plec,praca))
print('\nodds ratio = %.6f, p-value = %.6f' % (oddsratio,pvalue))
## col_0      nie pracuje  uczen lub pracuje
## row_0                                    
## kobieta              7                 48
## mezczyzna           45                104
## 
## X-squared = 5.571056, df = 1, p-value = 0.018260
## 
## [[  14.01960784   40.98039216]
##  [  37.98039216  111.01960784]]
## 
## odds ratio = 0.337037, p-value = 0.011203

Chapter 4.4.5.3.1

import pandas as pd
from statsmodels.sandbox.stats.runs import *

df = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/daneSoc.csv")

plec = df['plec'].values
praca = df['praca'].values

sol, pval = mcnemar(pd.crosstab(plec,praca), exact=False)
print("McNemar's Chi-squared test\n\nX-squared = %.6f, p-value = %.6f" % (sol,pval))
## McNemar's Chi-squared test
## 
## X-squared = 0.043011, p-value = 0.835705

Chapter 4.4.5.3.2

In this subsection we show how to run R code from Python using the pyper package.

import pandas as pd
import pyper as pr

tab = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/tabela.csv")

r = pr.R(use_pandas = True)
r.assign("rdata", tab)

r("tab <- xtabs(~wzrost+charakter+plec,data=rdata)")
print(r("tab"))

r("laczna <- apply(tab,c(1,2),sum)")
print(r("laczna"))

print(r("chisq.test(laczna)"))

print(r("mantelhaen.test(xtabs(~wzrost+charakter+plec,data=rdata))"))
## try({tab})
## , , plec = kobieta
## 
##         charakter
## wzrost   agresywny lagodny
##   niski         10     100
##   wysoki         1      10
## 
## , , plec = mezczyzna
## 
##         charakter
## wzrost   agresywny lagodny
##   niski         10       1
##   wysoki       100      10
## 
## 
## try({laczna})
##         charakter
## wzrost   agresywny lagodny
##   niski         20     101
##   wysoki       101      20
## 
## try({chisq.test(laczna)})
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  laczna
## X-squared = 105.79, df = 1, p-value < 2.2e-16
## 
## 
## try({mantelhaen.test(xtabs(~wzrost+charakter+plec,data=rdata))})
## 
##  Mantel-Haenszel chi-squared test without continuity correction
## 
## data:  xtabs(~wzrost + charakter + plec, data = rdata)
## Mantel-Haenszel X-squared = 0, df = 1, p-value = 1
## alternative hypothesis: true common odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.2177312 4.5928200
## sample estimates:
## common odds ratio 
##                 1

Chapter 4.4.5.4

from scipy import *
import pandas as pd
from statsmodels.stats.inter_rater import *

random.seed(227)
ocena1 = random.choice([2,3,4,5], size=100)
ocena2 = random.choice([2,3,4,5], size=100)

print(pd.crosstab(ocena1,ocena2, rownames=['ocena1'],colnames=['ocena2']),'\n')
print(cohens_kappa(pd.crosstab(ocena1,ocena2)))
## ocena2  2  3   4  5
## ocena1             
## 2       5  7  11  5
## 3       4  7   5  9
## 4       7  9   1  7
## 5       6  5   6  6 
## 
##                   Simple Kappa Coefficient
##               --------------------------------
##               Kappa                     -0.0784
##               ASE                       0.0516
##               95% Lower Conf Limit      -0.1796
##               95% Upper Conf Limit      0.0227
## 
##                  Test of H0: Simple Kappa = 0
## 
##               ASE under H0              0.0574
##               Z                         -1.3659
##               One-sided Pr >  Z         0.9140
##               Two-sided Pr > |Z|        0.1720
from scipy import *
import pandas as pd
from statsmodels.stats.inter_rater import *

df = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/oceniajacy.csv")

print(pd.crosstab(df['oceniajacy1'],df['oceniajacy2']),'\n')
print('Czynniki traktujemy jako zmienne nominalne:\n')
print(cohens_kappa(pd.crosstab(df['oceniajacy1'],df['oceniajacy2'])),'\n')
print('Czynniki traktujemy jako zmienne porządkowe:\n')
print(cohens_kappa(pd.crosstab(df['oceniajacy1'],df['oceniajacy2']),wt='quadratic'))
## oceniajacy2     aSlabo  bSrednio  cDobrze  dBardzo Dobrze
## oceniajacy1                                              
## aSlabo               3         2        0               0
## bSrednio             0         1        0               0
## cDobrze              0         3        0               3
## dBardzo Dobrze       0         0        5               3 
## 
## Czynniki traktujemy jako zmienne nominalne:
## 
##                   Simple Kappa Coefficient
##               --------------------------------
##               Kappa                     0.1362
##               ASE                       0.1436
##               95% Lower Conf Limit      -0.1452
##               95% Upper Conf Limit      0.4176
## 
##                  Test of H0: Simple Kappa = 0
## 
##               ASE under H0              0.1201
##               Z                         1.1345
##               One-sided Pr >  Z         0.1283
##               Two-sided Pr > |Z|        0.2566
##  
## 
## Czynniki traktujemy jako zmienne porządkowe:
## 
##                   Weighted Kappa Coefficient
##               --------------------------------
##               Kappa                     0.7461
##               ASE                       0.0738
##               95% Lower Conf Limit      0.6014
##               95% Upper Conf Limit      0.8908
## 
##                  Test of H0: Weighted Kappa = 0
## 
##               ASE under H0              0.2199
##               Z                         3.3929
##               One-sided Pr >  Z         0.0003
##               Two-sided Pr > |Z|        0.0007

Chapter 4.4.6

from scipy import *
import scipy.stats as stats
import pandas as pd
from statsmodels.sandbox.stats.multicomp import *

rawp = [ stats.ttest_ind(random.normal(size=20),random.normal(size=20,loc=1))[1] \
       for i in range(6) ]

_,Bonferroni,_,_ = multipletests(array(rawp),method='bonferroni')
_,BH,_,_ = multipletests(array(rawp),method='fdr_bh')
t = pd.DataFrame()
t['rawp'] = rawp
t['Bonferroni'] = Bonferroni
t['BH'] = BH
print(t)
##        rawp  Bonferroni        BH
## 0  0.000344    0.002063  0.002063
## 1  0.124588    0.747527  0.124588
## 2  0.000719    0.004316  0.002158
## 3  0.099282    0.595694  0.119139
## 4  0.002370    0.014219  0.003555
## 5  0.002131    0.012784  0.003555
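
Continuing the snippet above, a quick check (a sketch): the Bonferroni column is simply the raw p-values multiplied by the number of tests and clipped at 1.

from scipy import *

# Bonferroni adjustment by hand: p * m, truncated at 1
print(minimum(array(rawp)*len(rawp), 1.0))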

Chapter 4.4.7

from scipy import *
import scipy.stats as stats
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.distributions.empirical_distribution import *

N = 10**3
def getStat(size=30, p=0.2):
    x = random.binomial(1, p, size)
    y = random.binomial(1, p, size)
    tab = pd.crosstab(x,y)
    return stats.chi2_contingency(tab)[0]
    
sol = [ getStat() for i in range(N) ]

plt.figure(figsize=(8,6))
plt.plot(sol,ECDF(sol)(sol),'.',linewidth=2, color='C1')
plt.plot(sort(sol), stats.chi2.cdf(sort(sol),df=1), color='C0')
plt.axhline(y=0.95,color='k',linestyle='--',linewidth=1)
plt.axvline(x=percentile(sol, q=95),ymax=0.91,color='k',linestyle='--',linewidth=1)
plt.text(percentile(sol, q=95)-0.4, 0.6,'empiryczny kwantyl 0.95 = %.2f' % percentile(sol, q=95), rotation=90)
plt.axvline(x=stats.chi2.ppf(0.95,df=1),ymax=0.91,color='k',linestyle='--',linewidth=1)
plt.text(stats.chi2.ppf(0.95,df=1)-0.4, 0.6,'asymptotyczny kwantyl 0.95 = %.2f' % stats.chi2.ppf(0.95,df=1), rotation=90)
plt.xlabel('kwantyle')
plt.ylabel('p-value')
plt.title('Rozkład empiryczny i testowy statystyki testowej')
plt.tight_layout()
plt.savefig('rys_4.11.png')

import scipy.stats as stats
from scipy import *
import pandas as pd

kidney = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/kidney.csv")

x = kidney[kidney['recipient.age'] < 40]['MDRD30']
y = kidney[kidney['recipient.age'] >= 40]['MDRD30']
n = len(x); m = len(y); B = 999

theta = stats.ks_2samp(x,y).statistic
print('Statystyka testu K-S: theta = %.2f' % theta,'\n')

def thetastar(x,y):
    # permute the pooled sample and split it back into two pseudo-samples
    zstar = random.permutation(concatenate((x,y)))
    xstar = zstar[:n]
    ystar = zstar[n:n+m]
    return stats.ks_2samp(xstar,ystar).statistic

res = array([ thetastar(x,y) for i in range(B) ])

print('Permutacyjna p-wartość testu K-S:')
print((sum(res >= theta) + 1) / (B+1))
## Statystyka testu K-S: theta = 0.22 
## 
## Permutacyjna p-wartość testu K-S:
## 0.001

Chapter 4.5.1

from scipy import *
import scipy.stats as stats
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

daneO = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/daneO.csv")

n = daneO.shape[0]; B = 9999
wynikBoot = [ mean(random.choice(daneO['VEGF'],n)) for i in range(B)]
bootParametryczny = [ mean(random.normal(mean(daneO['VEGF']),std(daneO['VEGF'],ddof=1),n)) for i in range(B)]
print('95% przedział ufności dla oceny średniej - bootstrap nieparametryczny:')
print(percentile(wynikBoot,q=[5/2, 100-5/2]))
print('\n95% przedział ufności dla oceny średniej - bootstrap parametryczny:')
print(percentile(bootParametryczny,q=[5/2, 100-5/2]))

fig = plt.figure(figsize=(14,6))
ax1 = fig.add_subplot(1,2,1)
ax2 = fig.add_subplot(1,2,2)
X = linspace(min(wynikBoot), max(wynikBoot), 100)
N, bins, patches = ax1.hist(wynikBoot, density=True, facecolor='C2', edgecolor='C2',
histtype='stepfilled', alpha=0.25, label='rozkład empiryczny')
ax1.plot(X, stats.norm.pdf(X, loc=mean(wynikBoot), scale=std(wynikBoot,ddof=1)), '--', color='C0',
linewidth=1,label='loc=%.2f, scale=%.2f' % (mean(wynikBoot),std(wynikBoot,ddof=1)))
ax1.set_title("Rozkład empiryczny vs. normalny\nbootstrap nieparametryczny")
ax1.legend()
X = linspace(min(bootParametryczny), max(bootParametryczny), 100)
N, bins, patches = ax2.hist(bootParametryczny, density=True, facecolor='C2', edgecolor='C2',
histtype='stepfilled', alpha=0.25, label='rozkład empiryczny')
ax2.plot(X, stats.norm.pdf(X, loc=mean(bootParametryczny), scale=std(bootParametryczny,ddof=1)), '--', color='C0',
linewidth=1,label='loc=%.2f, scale=%.2f' % (mean(bootParametryczny),std(bootParametryczny,ddof=1)))
ax2.set_title("Rozkład empiryczny vs. normalny\nbootstrap parametryczny")
ax2.legend()
fig.tight_layout()
fig.savefig('rys_4.12.png')
## 95% przedział ufności dla oceny średniej - bootstrap nieparametryczny:
## [ 1976.21597938  3393.44123711]
## 
## 95% przedział ufności dla oceny średniej - bootstrap parametryczny:
## [ 1921.28524356  3336.2346008 ]

Chapter 4.5.2

from scipy import *
import pandas as pd
from statsmodels.distributions.empirical_distribution import *

daneO = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/daneO.csv")

def bootstrapowyTestT(wektor,mu,B):
    # each evaluation draws one bootstrap resample for the numerator and,
    # independently, another for the denominator
    statystykaT = lambda x: \
        (mean(random.choice(x,len(x)))-mean(x))/std(random.choice(x,len(x)),ddof=1)
    wartosciStatystykiT = [ statystykaT(wektor) for i in range(B)]
    T = (mean(wektor)-mu)/std(wektor,ddof=1)
    kwantyl = ECDF(wartosciStatystykiT)(T)
    return 1-2*abs(kwantyl-0.5)

print(bootstrapowyTestT(daneO['VEGF'],3000,999))
print(bootstrapowyTestT(daneO['VEGF'],4000,999))
## 0.26026026026
## 0.0
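
For comparison (a sketch, assuming VEGF has no missing values), the classical one-sample t-test on the same data; the bootstrap and classical p-values need not agree exactly:

from scipy import *
import scipy.stats as stats
import pandas as pd

daneO = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/daneO.csv")

print(stats.ttest_1samp(daneO['VEGF'], 3000))
print(stats.ttest_1samp(daneO['VEGF'], 4000))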

Chapter 4.6.1

from scipy import *
import pandas as pd
import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter
from lifelines.plotting import plot_lifetimes
from lifelines.statistics import logrank_test

daneO = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/daneO.csv")

daneO['Niepowodzenia'] = daneO.Niepowodzenia.replace(dict(brak=0, wznowa=1))
daneO = daneO[isfinite(daneO['Nowotwor'])]
T = daneO['Okres.bez.wznowy']
E = daneO['Niepowodzenia']
daneO.Nowotwor[daneO["Nowotwor"]==1]   = 'typ nowotworu 1'
daneO.Nowotwor[daneO["Nowotwor"]==2.0] = 'typ nowotworu 2'
daneO.Nowotwor[daneO["Nowotwor"]==3.0] = 'typ nowotworu 3'

now1 = (daneO["Nowotwor"] == "typ nowotworu 1")
now2 = (daneO["Nowotwor"] == "typ nowotworu 2")
now3 = (daneO["Nowotwor"] == "typ nowotworu 3")
# each test compares one tumour type against the two remaining ones
results1 = logrank_test(T[now1], T[~now1], E[now1], E[~now1], alpha=.95 )
results1.print_summary()
results2 = logrank_test(T[now2], T[~now2], E[now2], E[~now2], alpha=.95 )
results2.print_summary()
results3 = logrank_test(T[now3], T[~now3], E[now3], E[~now3], alpha=.95 )
results3.print_summary()

kmf = KaplanMeierFitter()
fig = plt.figure(figsize=(14,6))
ax1 = fig.add_subplot(1,2,1)
ax2 = fig.add_subplot(1,2,2)
kmf.fit(T,E, label="Nowotwór")
kmf.plot(ci_force_lines=True, ax=ax1)
for i in daneO['Nowotwor'].unique():
    ix = daneO['Nowotwor'] == i
    kmf.fit(T.loc[ix], E.loc[ix], label= i)
    ax = kmf.plot(ax=ax2)
fig.tight_layout()
fig.savefig('rys_4.13.png')
## Results
##    alpha: 0.95
##    t 0: -1
##    null distribution: chi squared
##    test: logrank
##    df: 1
## 
##    __ p-value ___|__ test statistic __|____ test result ____|__ is significant __
##          0.31958 |              0.991 |  Cannot Reject Null |       False       
## Results
##    alpha: 0.95
##    t 0: -1
##    null distribution: chi squared
##    test: logrank
##    df: 1
## 
##    __ p-value ___|__ test statistic __|____ test result ____|__ is significant __
##          0.00775 |              7.091 |      Reject Null    |        True       
## Results
##    alpha: 0.95
##    t 0: -1
##    null distribution: chi squared
##    test: logrank
##    df: 1
## 
##    __ p-value ___|__ test statistic __|____ test result ____|__ is significant __
##          0.00068 |             11.552 |      Reject Null    |        True

Chapter 4.6.2*

from scipy import *
import pandas as pd
import matplotlib.pyplot as plt
from lifelines import CoxPHFitter
from lifelines.utils import k_fold_cross_validation

daneO = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/daneO.csv")

daneO['Niepowodzenia'] = daneO.Niepowodzenia.replace(dict(brak=0, wznowa=1))
daneO = daneO[isfinite(daneO['Nowotwor'])]

cf = CoxPHFitter(normalize=False)
cf.fit(daneO[['Nowotwor','Wiek','Okres.bez.wznowy','Niepowodzenia']],\
      duration_col='Okres.bez.wznowy', event_col='Niepowodzenia',\
      show_progress=True,include_likelihood=True)
cf.print_summary()

scores = k_fold_cross_validation(cf, daneO[['Nowotwor','Wiek','Okres.bez.wznowy','Niepowodzenia']], \
         duration_col='Okres.bez.wznowy', event_col='Niepowodzenia', k=5)
print('\nCross walidacja modelu:')
print(scores)
print('\n  średnia: %.6f' % mean(scores))
print('odch.std.: %.6f' % std(scores))
## Convergence completed after 5 iterations.
## n=86, number of events=13
## 
##                coef  exp(coef)  se(coef)          z         p  lower 0.95  upper 0.95    
## Nowotwor  1.754e+00  5.776e+00 5.957e-01  2.944e+00 3.238e-03   5.860e-01   2.921e+00  **
## Wiek     -5.632e-02  9.452e-01 4.340e-02 -1.297e+00 1.945e-01  -1.414e-01   2.877e-02    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Concordance = 0.747
## 
## Cross walidacja modelu:
## [0.64634146341463417, 0.76190476190476186, 0.75, 0.8214285714285714, 0.80303030303030298]
## 
##   średnia: 0.756541
## odch.std.: 0.060971

Chapter 4.7.1

x = {1,2,3,4,5,6,7,8,9,10}
y = {5,6,7,8,9,10,11,12,13,14,15}

print((x | y)-(x & y),'\n')
print(x == (x - (y - x)),'\n')
A = {3}
print(A.issubset(x))
## {1, 2, 3, 4, 11, 12, 13, 14, 15} 
## 
## True 
## 
## True
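
The first expression computes the symmetric difference, which also has a dedicated operator and method; both return the same set as above:

x = {1,2,3,4,5,6,7,8,9,10}
y = {5,6,7,8,9,10,11,12,13,14,15}

print(x ^ y)
print(x.symmetric_difference(y))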

Chapter 4.7.2

from scipy import *

print(exp(0.1**15)-1,'\n')
print(expm1(0.1**15),'\n')
print(log(1+0.1**20),'\n')
print(log1p(0.1**20))
## 1.11022302463e-15 
## 
## 1e-15 
## 
## 0.0 
## 
## 1e-20

Chapter 4.7.3

from scipy import *

p1 = poly1d([1,0,2])
p2 = poly1d([1,1,2,2])
print(p1,'\n')
print(p2,'\n')
print(p1 + p1,'\n')
print(p1 * p2,'\n')
print(polyint(p1)(1)-polyint(p1)(0),'\n')
print(p2.deriv(),'\n')
print(roots(p2),'\n')
##    2
## 1 x + 2 
## 
##    3     2
## 1 x + 1 x + 2 x + 2 
## 
##    2
## 2 x + 4 
## 
##    5     4     3     2
## 1 x + 1 x + 4 x + 4 x + 4 x + 4 
## 
## 2.33333333333 
## 
##    2
## 3 x + 2 x + 2 
## 
## [ -4.85722573e-16+1.41421356j  -4.85722573e-16-1.41421356j
##   -1.00000000e+00+0.j        ]

Chapter 4.7.4

from scipy import *
from scipy.special import sh_legendre
import matplotlib.pyplot as plt

print(sh_legendre(5),'\n')
print(sh_legendre(4),'\n')
print(sh_legendre(3),'\n')
print(sh_legendre(2),'\n')
print(sh_legendre(1),'\n')
print(sh_legendre(0),'\n')

x = linspace(0, 1, 100)
plt.figure(figsize=(8,6))
for i in range(5): ax = plt.plot(x, sh_legendre(i)(x), lw=2,label="stopnia %d"%i)
plt.legend(loc='lower center',ncol=3)
plt.title('Wielomiany Legendre`a\nbez normalizacji')
plt.tight_layout()
plt.savefig('rys_4.15.png')
##      5       4       3       2
## 252 x - 630 x + 560 x - 210 x + 30 x - 1 
## 
##     4       3      2
## 70 x - 140 x + 90 x - 20 x + 1 
## 
##     3      2
## 20 x - 30 x + 12 x - 1 
## 
##    2
## 6 x - 6 x + 1 
## 
##  
## 2 x - 1 
## 
##  
## 1

Chapter 4.7.5

from scipy.optimize import *

f = lambda x: (x -7)**2 - x
res = fminbound(f, -1,10)
fun = f(res)
print('Dla argumentu x:', ("%.7f" % res), 'funkcja osiąga minimum:', ("%.7f" % fun))
## Dla argumentu x: 7.5000000 funkcja osiąga minimum: -7.2500000
from scipy.optimize import *

f = lambda x: (x -7)**2 - x
sol = brentq(f,-1,10)
fun = f(sol)
print('Dla argumentu x:', ("%.7f" % sol), 'funkcja ma wartość zero:', ("%.7f" % fun))
## Dla argumentu x: 4.8074176 funkcja ma wartość zero: 0.0000000

Chapter 4.7.6

from sympy import *

x = symbols('x')
print(diff(3*(x-2)**2-15, x, 1))
## 6*x - 12
from scipy import *
from scipy.misc import derivative

f = lambda x: (x-2)**2+15
wyraz = lambda x: derivative(f, x, dx=1e-6)
X = arange(1,4,1)
print(wyraz(X))
## [-2.  0.  2.]
import warnings
warnings.filterwarnings("ignore")

from scipy import *
import scipy.stats as stats
from scipy.integrate import *

f = lambda x: stats.norm.pdf(x)
print(quad(f, 0, Inf, limit=2))
g = lambda x: sin(x)**2
print(quad(g, 0, 600, limit=2))
## (0.5000005473812185, 0.03966696247407325)
## (299.16747101861347, 179.0735567361394)
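
The limit argument caps the number of subintervals the adaptive routine may use, which is why both estimates above come with large error bounds. With the default limit=50 the oscillatory integral is handled properly; the exact value is 300 - sin(1200)/4, approximately 300.022 (a sketch):

from scipy import *
from scipy.integrate import quad

g = lambda x: sin(x)**2
print(quad(g, 0, 600))        # default limit=50
print(300 - sin(1200)/4)      # exact value of the integral, for comparison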