import scipy.stats as stats
u = stats.uniform.rvs(loc= 0, scale= 1, size= 10, random_state= 1313)
print(u)
## [ 0.99250241 0.03101313 0.85090992 0.98288816 0.20383509 0.49741385
## 0.27774076 0.88546349 0.65512953 0.83098086]
import scipy.stats as stats
stats.uniform.rvs(loc= 0, scale= 1, size= 5)
stats.uniform.cdf(loc= 0, scale= 1, x= 0.5)
stats.uniform.pdf(loc= 0, scale= 1, x= 0.5)
stats.uniform.ppf(loc= 0, scale= 1, q= 0.1)
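For every distribution in scipy.stats the quantile function ppf is the inverse of the cumulative distribution function cdf; a minimal sketch of the round trip (the value of q is illustrative):
import scipy.stats as stats
q = 0.25
x = stats.uniform.ppf(q, loc=0, scale=1)     # quantile of order q
print(stats.uniform.cdf(x, loc=0, scale=1))  # recovers q (up to floating-point rounding)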
from scipy import *
import scipy.stats as stats
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams["font.size"] = 15
x = arange(-4, 4, 0.01)
fig = plt.figure(figsize=(8,6))
ax1 = fig.add_subplot(1, 1, 1)
ax2 = ax1.twinx()
ax1.plot(x,stats.norm.pdf(x), color='C0', linestyle='-', linewidth=3)
ax2.plot(x,stats.norm.cdf(x), color='C1', linestyle='--', linewidth=3)
ax1.set_ylabel('stats.norm.pdf(x)', color='C0')
ax2.set_ylabel('stats.norm.cdf(x)', color='C1')
ax1.set_xlabel('x')
plt.tight_layout()
plt.savefig('rys_4.1.png')
import scipy.stats as stats
print(stats.norm.cdf(x= -3)+(1-stats.norm.cdf(x= 3)))
## 0.00269979606326
import scipy.stats as stats
print(stats.norm.pdf(x= [-1,0,1]))
## [ 0.24197072 0.39894228 0.24197072]
import scipy.stats as stats
print(stats.norm.ppf(q= [0.001, 0.025, 0.05, 0.5, 0.95, 0.975, 0.999]))
## [-3.09023231 -1.95996398 -1.64485363 0. 1.64485363 1.95996398
## 3.09023231]
import scipy.stats as stats
print(stats.norm.rvs(loc= 2, scale= 1, size= 10))
## [ 2.15484945 1.38551231 2.24869944 3.07372318 2.15051313 2.65334518
## 1.5418569 0.72848281 0.83493084 2.468362 ]
from scipy import *
import scipy.stats as stats
print(stats.norm.rvs(loc= linspace(1,10,10), scale= linspace(1,10,10), size= 10))
## [ 1.82127172 1.1863091 0.4459603 -0.75203839 -6.23573767
## 12.07365432 6.22924021 -7.25209158 20.29651775 -6.48766261]
from scipy import *
import scipy.stats as stats
def rPoisProcess(T=1, lambd=1):
    # number of events of a homogeneous Poisson process on [0, T]
    N = stats.poisson.rvs(size=1, mu=T*lambd, random_state=2305)
    # given N, the event times are uniform on [0, T]
    return sorted(stats.uniform.rvs(size=N, loc=0, scale=T, random_state=4101))
print(rPoisProcess(T=10, lambd=0.2))
## [3.8504656015352578, 4.0567308020438064]
from scipy import *
import scipy.stats as stats
import pandas as pd
def rRandomField(T1=1, T2=1, lambd=1):
    # number of points of a homogeneous Poisson field on the rectangle [0, T1] x [0, T2]
    N = stats.poisson.rvs(size=1, mu=T1*T2*lambd, random_state=2305)
    # given N, the point coordinates are uniform on the rectangle
    d = {'x': stats.uniform.rvs(size=N, loc=0, scale=T1, random_state=4101),
         'y': stats.uniform.rvs(size=N, loc=0, scale=T2, random_state=4026)}
    return pd.DataFrame(d)
print(rRandomField(T1=10, T2=10, lambd=0.02))
## x y
## 0 3.850466 2.046772
## 1 4.056731 6.943746
from scipy import *
import scipy.linalg as la
n = 100; d = 100; N = 10000
random.seed(120)
Sigma = cov(1 * random.randn(n, d) + 0)
random.seed(122)
X = 1 * random.randn(n, d) + 0
# method 1 -- Cholesky: Sigma = Q'Q, so y = XQ has covariance Sigma (in expectation)
Q = la.cholesky(Sigma)
y = dot(X,Q)
# method 2 -- SVD: Sigma = u diag(d) v, so u diag(sqrt(d)) v is a square root of Sigma
u, d, v = la.svd(Sigma, full_matrices=True)
y = dot(X,dot(dot(u,diag(sqrt(d))),v))
# method 3 -- eigendecomposition: Sigma = V diag(values) V'
val, vec = la.eigh(Sigma)
values = val[::-1]
vectors = fliplr(vec)
y = dot(X,dot(vectors,dot(diag(sqrt(values)),transpose(vectors))))
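All three factorizations should reconstruct Sigma itself, which can be checked directly; a minimal sketch, assuming the arrays from the block above are still in scope:
from numpy import allclose
print(allclose(dot(transpose(Q), Q), Sigma))                                  # Cholesky
print(allclose(dot(u, dot(diag(d), v)), Sigma))                               # SVD
print(allclose(dot(vectors, dot(diag(values), transpose(vectors))), Sigma))   # eigendecomposition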
from scipy import *
from copulalib.copulalib import Copula
x = random.normal(size=40)
y = x+random.normal(size=40)
frank = Copula(x,y,family='frank')
uf,vf = frank.generate_uv(1000)
clayton = Copula(x,y,family='clayton')
uc,vc = clayton.generate_uv(1000)
gumbel = Copula(x,y,family='gumbel')
ug,vg = gumbel.generate_uv(1000)
from scipy import *
import scipy.stats as stats
wek = stats.lognorm.rvs(s= 1, loc= 0, scale= 1, size= 100, random_state= 2305)
fitN = stats.norm.fit(wek)
fitG = stats.gamma.fit(wek)
logLik = sum(stats.norm.logpdf(wek, loc= fitN[0], scale= fitN[1]))
print('Rozkład normalny:')
print('szukane parametry: loc= %.6f, scale= %.6f' % (fitN[0], fitN[1]))
print('logarytm wiarygodności: %.6f' % logLik, '\n')
logLik = sum(stats.gamma.logpdf(wek, a=fitG[0], loc=fitG[1], scale=fitG[2]))
print('Rozkład gamma:')
print('szukane parametry: shape= %.6f, loc= %.6f, scale= %.6f' % (fitG[0], fitG[1], fitG[2]))
print('logarytm wiarygodności: %.6f' % logLik)
## Rozkład normalny:
## szukane parametry: loc= 1.682725, scale= 2.958832
## logarytm wiarygodności: -250.373306
##
## Rozkład gamma:
## szukane parametry: shape= 0.101070, loc= 0.113678, scale= 1.568297
## logarytm wiarygodności: -274.995035
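Since both models are fitted to the same data, the log-likelihoods can also be compared through AIC = 2*k - 2*logLik, where k is the number of estimated parameters (2 for the normal fit, 3 for the gamma fit); a short sketch using the objects above:
logLikN = sum(stats.norm.logpdf(wek, loc=fitN[0], scale=fitN[1]))
logLikG = sum(stats.gamma.logpdf(wek, a=fitG[0], loc=fitG[1], scale=fitG[2]))
print('AIC (norm):  %.2f' % (2*2 - 2*logLikN))
print('AIC (gamma): %.2f' % (2*3 - 2*logLikG))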
import pandas as pd
df = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/hepatitis.csv")
print(df.shape)
## (155, 20)
import pandas as pd
from scipy import *
df = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/hepatitis.csv")
print(sum(df.isnull().values))
## 167
import pandas as pd
from scipy import *
df = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/hepatitis.csv")
print(pd.isnull(df).max(axis=1).sum())
## 75
import pandas as pd
df = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/hepatitis.csv")
print(df.dropna().shape)
## (80, 20)
import pandas as pd
df = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/hepatitis.csv")
print(df['V19'].describe())
## count 88.000000
## mean 61.852273
## std 22.875244
## min 0.000000
## 25% 46.000000
## 50% 61.000000
## 75% 76.250000
## max 100.000000
## Name: V19, dtype: float64
import pandas as pd
from scipy import *
df = pd.DataFrame()
df['wektor'] = [1, 5, nan, 2, 8, 8, 3, nan, 10]
liczba = df['wektor'].fillna(value=2.5)
df['liczba'] = liczba
print(df)
## wektor liczba
## 0 1.0 1.0
## 1 5.0 5.0
## 2 NaN 2.5
## 3 2.0 2.0
## 4 8.0 8.0
## 5 8.0 8.0
## 6 3.0 3.0
## 7 NaN 2.5
## 8 10.0 10.0
import pandas as pd
from scipy import *
df = pd.DataFrame()
df['wektor'] = [1, 5, nan, 2, 8, 8, 3, nan, 10]
srednia = df['wektor'].fillna(df['wektor'].mean())
df['srednia'] = srednia
print(df)
## wektor srednia
## 0 1.0 1.000000
## 1 5.0 5.000000
## 2 NaN 5.285714
## 3 2.0 2.000000
## 4 8.0 8.000000
## 5 8.0 8.000000
## 6 3.0 3.000000
## 7 NaN 5.285714
## 8 10.0 10.000000
import pandas as pd
from scipy import *
df = pd.DataFrame()
df['wektor'] = [1, 5, nan, 2, 8, 8, 3, nan, 10]
imput = df['wektor'].fillna(random.choice(df['wektor'].dropna()))
df['imput'] = imput
print(df)
## wektor imput
## 0 1.0 1.0
## 1 5.0 5.0
## 2 NaN 8.0
## 3 2.0 2.0
## 4 8.0 8.0
## 5 8.0 8.0
## 6 3.0 3.0
## 7 NaN 8.0
## 8 10.0 10.0
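Note that fillna above draws one donor value and inserts it into every missing cell (both NaNs became 8.0). To draw an independent donor for each missing entry, a hedged variant reusing the df defined above is:
imput2 = df['wektor'].copy()
braki = imput2.isnull()
# an independent draw (with replacement) from the observed values for every NaN
imput2[braki] = random.choice(df['wektor'].dropna().values, size=braki.sum())
df['imput2'] = imput2
print(df)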
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.imputation import mice
df = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/hepatitis.csv")
df = df[['V18','V6','V12']]
imp = mice.MICEData(df)
fml = 'V18 ~ V6 + V12'
model = mice.MICE(fml, sm.OLS, imp)
results = model.fit(3, 3)
print(results.summary())
## Results: MICE
## =============================================================
## Method: MICE Sample size: 154
## Model: OLS Scale 0.36
## Dependent variable: V18 Num. imputations 3
## -------------------------------------------------------------
## Coef. Std.Err. t P>|t| [0.025 0.975] FMI
## -------------------------------------------------------------
## Intercept 2.8393 0.2038 13.9326 0.0000 2.4399 3.2387 0.1135
## V6 0.3442 0.1140 3.0186 0.0025 0.1207 0.5677 0.0735
## V12 0.3062 0.1124 2.7249 0.0064 0.0860 0.5264 0.0399
## =============================================================
A description of this method for R is available at: https://www.jstatsoft.org/article/view/v045i03
import pandas as pd
import statsmodels.formula.api as smf
df = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/hepatitis.csv")
df = df[['V18','V6','V12']]
results = smf.ols('V18 ~ V6 +V12', data=df, missing='drop').fit()
print(results.summary())
## OLS Regression Results
## ==============================================================================
## Dep. Variable: V18 R-squared: 0.147
## Model: OLS Adj. R-squared: 0.134
## Method: Least Squares F-statistic: 11.36
## Date: Sat, 28 Jan 2017 Prob (F-statistic): 2.80e-05
## Time: 15:08:39 Log-Likelihood: -121.14
## No. Observations: 135 AIC: 248.3
## Df Residuals: 132 BIC: 257.0
## Df Model: 2
## Covariance Type: nonrobust
## ==============================================================================
## coef std err t P>|t| [0.025 0.975]
## ------------------------------------------------------------------------------
## Intercept 2.8870 0.205 14.104 0.000 2.482 3.292
## V6 0.3069 0.117 2.630 0.010 0.076 0.538
## V12 0.3139 0.117 2.677 0.008 0.082 0.546
## ==============================================================================
## Omnibus: 5.940 Durbin-Watson: 2.135
## Prob(Omnibus): 0.051 Jarque-Bera (JB): 9.116
## Skew: -0.094 Prob(JB): 0.0105
## Kurtosis: 4.259 Cond. No. 10.5
## ==============================================================================
##
## Warnings:
## [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
from scipy import *
import pandas as pd
daneSoc = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/daneSoc.csv")
print(daneSoc.iloc[:,[0,5,6]].cov())
## wiek cisnienie.skurczowe cisnienie.rozkurczowe
## wiek 191.742176 -6.016662 -8.941249
## cisnienie.skurczowe -6.016662 246.903989 82.809186
## cisnienie.rozkurczowe -8.941249 82.809186 60.324616
from scipy import *
import pandas as pd
daneSoc = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/daneSoc.csv")
print(daneSoc.iloc[:,[0,5,6]].apply(lambda x: (x-mean(x))/std(x,ddof=1)).cov())
## wiek cisnienie.skurczowe cisnienie.rozkurczowe
## wiek 1.000000 -0.027652 -0.083137
## cisnienie.skurczowe -0.027652 1.000000 0.678527
## cisnienie.rozkurczowe -0.083137 0.678527 1.000000
from scipy import *
import pandas as pd
daneSoc = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/daneSoc.csv")
przeskalowane = daneSoc.iloc[:,[0,5,6]].apply(lambda x: x/std(x,ddof=1))
from scipy import *
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
daneO = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/daneO.csv")
fig = plt.figure(figsize=(14,6))
ax1 = fig.add_subplot(1,2,1)
ax2 = fig.add_subplot(1,2,2)
sm.qqplot((daneO['VEGF'])**0.5,line='q',ax=ax1)
ax1.set_title('Transformacja pierwiastkowa - zmienna VEGF')
sm.qqplot(log(daneO['VEGF']),line='q',ax=ax2)
ax2.set_title('Transformacja logarytmiczna - zmienna VEGF')
fig.tight_layout()
fig.savefig('rys_4.2.png')
from scipy import *
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
daneO = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/daneO.csv")
fig = plt.figure(figsize=(14,6))
ax1 = fig.add_subplot(1,2,1)
ax2 = fig.add_subplot(1,2,2)
prob = stats.probplot((daneO['VEGF'])**0.5, dist=stats.norm, plot=ax1)
ax1.set_title('Transformacja pierwiastkowa - zmienna VEGF')
prob = stats.probplot(log(daneO['VEGF']), dist=stats.norm, plot=ax2)
ax2.set_title('Transformacja logarytmiczna - zmienna VEGF')
fig.tight_layout()
fig.savefig('rys_4.3.png')
from scipy import *
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
daneO = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/daneO.csv")
lmbdas = linspace(-2, 2)
llfVEGF = zeros(lmbdas.shape, dtype=float)
for ii, lmbda in enumerate(lmbdas):
    llfVEGF[ii] = stats.boxcox_llf(lmbda, daneO['VEGF'])
llfWiek = zeros(lmbdas.shape, dtype=float)
for ii, lmbda in enumerate(lmbdas):
    llfWiek[ii] = stats.boxcox_llf(lmbda, daneO['Wiek'])
fig = plt.figure(figsize=(14,6))
ax1 = fig.add_subplot(1,2,1)
ax2 = fig.add_subplot(1,2,2)
ax1.plot(lmbdas, llfVEGF)
ax1.plot(lmbdas[llfVEGF.argmax()],llfVEGF.max(), 'o',color='C1')
ax1.set_title('Zmienna VEGF')
ax1.set_xlabel('lambda parameter')
ax1.set_ylabel('Box-Cox log-likelihood')
ax2.plot(lmbdas, llfWiek)
ax2.plot(lmbdas[llfWiek.argmax()],llfWiek.max(), 'o',color='C1')
ax2.set_title('Zmienna Wiek')
ax2.set_xlabel('lambda parameter')
ax2.set_ylabel('Box-Cox log-likelihood')
fig.tight_layout()
fig.savefig('rys_4.4.png')
from scipy import *
import pandas as pd
zmOryginalna = array([0.1, 0.8, 0.5, 0.2, 0.9, 0.7])
zmZmieniona = pd.cut(zmOryginalna,[0,0.33,0.66,1],labels=['niski','sredni','wysoki'],precision=0)
print(zmZmieniona)
## [niski, wysoki, sredni, niski, wysoki, wysoki]
## Categories (3, object): [niski < sredni < wysoki]
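pd.cut divides the range of the variable into equal-width intervals; when groups of roughly equal size are preferred, pandas also provides quantile-based binning with pd.qcut. A quick sketch on the same vector:
zmKwantylowa = pd.qcut(zmOryginalna, 3, labels=['niski', 'sredni', 'wysoki'])
print(zmKwantylowa)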
from scipy import *
import pandas as pd
daneO = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/daneO.csv")
tDaneO = daneO.copy()
tDaneO['logVEGF'] = log(tDaneO['VEGF'])
tDaneO["fNowotwor"] = daneO["Nowotwor"].astype('category')
tDaneO['cutWiek'] = pd.cut(daneO['Wiek'],4,precision=0)
print(tDaneO.head(4))
## Wiek Rozmiar.guza Wezly.chlonne Nowotwor Receptory.estrogenowe \
## 0 29 1 0 2.0 (-)
## 1 29 1 0 2.0 (++)
## 2 30 1 1 2.0 (-)
## 3 32 1 0 3.0 (++)
##
## Receptory.progesteronowe Niepowodzenia Okres.bez.wznowy VEGF logVEGF \
## 0 (++) brak 22.0 914 6.817831
## 1 (++) brak 53.0 1118 7.019297
## 2 (+) brak 38.0 630 6.445720
## 3 (++) brak 26.0 1793 7.491645
##
## fNowotwor cutWiek
## 0 2.0 (29, 36]
## 1 2.0 (29, 36]
## 2 2.0 (29, 36]
## 3 3.0 (29, 36]
from scipy import *
import pandas as pd
mieszkania = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/mieszkania.csv")
print(mieszkania.describe(include='all'))
## cena pokoi powierzchnia dzielnica typ.budynku
## count 200.000000 200.000000 200.000000 200 200
## unique NaN NaN NaN 3 3
## top NaN NaN NaN Krzyki wiezowiec
## freq NaN NaN NaN 79 76
## mean 175934.039900 2.550000 46.197000 NaN NaN
## std 43078.651064 1.083221 20.079142 NaN NaN
## min 83279.870000 1.000000 17.000000 NaN NaN
## 25% 143304.052500 2.000000 31.150000 NaN NaN
## 50% 174935.040000 3.000000 43.700000 NaN NaN
## 75% 208740.710000 3.000000 61.400000 NaN NaN
## max 295761.800000 4.000000 87.700000 NaN NaN
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
mieszkania = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/mieszkania.csv")
a1 = ols('cena ~ dzielnica',
data=mieszkania).fit()
aov_table = sm.stats.anova_lm(a1, typ=2)
esq_sm = aov_table['sum_sq'][0]/(aov_table['sum_sq'][0]+aov_table['sum_sq'][1])
print(aov_table)
print('\n')
print('Eta = %.6f' % esq_sm)
print('p-value = %.6f' % aov_table['PR(>F)'][0])
## sum_sq df F PR(>F)
## dzielnica 1.799538e+10 2.0 5.045633 0.007294
## Residual 3.513029e+11 197.0 NaN NaN
##
##
## Eta = 0.048729
## p-value = 0.007294
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
mieszkania = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/mieszkania.csv")
mieszkania = mieszkania.rename(columns={"typ.budynku": "typ_budynku"})
a2 = ols('cena ~ typ_budynku',
data=mieszkania).fit()
aov_table = sm.stats.anova_lm(a2, typ=2)
esq_sm = aov_table['sum_sq'][0]/(aov_table['sum_sq'][0]+aov_table['sum_sq'][1])
print(aov_table)
print('\n')
print('Eta = %.6f' % esq_sm)
print('p-value = %.6f' % aov_table['PR(>F)'][0])
## sum_sq df F PR(>F)
## typ_budynku 2.277045e+10 2.0 6.472465 0.001895
## Residual 3.465278e+11 197.0 NaN NaN
##
##
## Eta = 0.061659
## p-value = 0.001895
import pandas as pd
from statsmodels.stats.multicomp import *
mieszkania = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/mieszkania.csv")
res2 = pairwise_tukeyhsd(mieszkania['cena'], mieszkania['dzielnica'])
print(res2)
## Multiple Comparison of Means - Tukey HSD,FWER=0.05
## ==============================================================
## group1 group2 meandiff lower upper reject
## --------------------------------------------------------------
## Biskupin Krzyki -21321.0188 -38022.2368 -4619.8008 True
## Biskupin Srodmiescie -18350.5407 -36534.1091 -166.9722 True
## Krzyki Srodmiescie 2970.4781 -14451.4593 20392.4155 False
## --------------------------------------------------------------
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
mieszkania = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/mieszkania.csv")
fig = plt.figure(figsize=(14,6))
ax1 = fig.add_subplot(1,2,1)
ax2 = fig.add_subplot(1,2,2)
sns.boxplot(x="cena", y="dzielnica", data=mieszkania, color="C0", ax=ax1, saturation=0.2)
sns.barplot(x="cena", y="dzielnica", data=mieszkania, color="C0", ax=ax2, saturation=0.2)
plt.tight_layout()
plt.savefig('rys_4.5.png')
from patsy.contrasts import Treatment
contrast = Treatment(reference=0).code_without_intercept([1,2,3,4,5])
print(contrast.matrix.T)
## [[ 0. 1. 0. 0. 0.]
## [ 0. 0. 1. 0. 0.]
## [ 0. 0. 0. 1. 0.]
## [ 0. 0. 0. 0. 1.]]
from patsy.contrasts import Helmert
contrast = Helmert().code_without_intercept([1,2,3,4,5])
print(contrast.matrix.T)
## [[-1. 1. 0. 0. 0.]
## [-1. -1. 2. 0. 0.]
## [-1. -1. -1. 3. 0.]
## [-1. -1. -1. -1. 4.]]
from statsmodels.formula.api import ols
from patsy.contrasts import Helmert
import pandas as pd
mieszkania = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/mieszkania.csv")
contrast = Helmert().code_without_intercept([1,2,3])
model = ols("cena ~ C(dzielnica, contrast)", data=mieszkania).fit()
print(model.summary())
## OLS Regression Results
## ==============================================================================
## Dep. Variable: cena R-squared: 0.049
## Model: OLS Adj. R-squared: 0.039
## Method: Least Squares F-statistic: 5.046
## Date: Sat, 28 Jan 2017 Prob (F-statistic): 0.00729
## Time: 15:08:47 Log-Likelihood: -2412.4
## No. Observations: 200 AIC: 4831.
## Df Residuals: 197 BIC: 4841.
## Df Model: 2
## Covariance Type: nonrobust
## ===============================================================================================
## coef std err t P>|t| [0.025 0.975]
## -----------------------------------------------------------------------------------------------
## Intercept 1.763e+05 3015.732 58.450 0.000 1.7e+05 1.82e+05
## C(dzielnica, contrast)[H.2] -1.066e+04 3535.809 -3.015 0.003 -1.76e+04 -3687.615
## C(dzielnica, contrast)[H.3] -2563.3438 2219.758 -1.155 0.250 -6940.882 1814.195
## ==============================================================================
## Omnibus: 10.736 Durbin-Watson: 2.157
## Prob(Omnibus): 0.005 Jarque-Bera (JB): 5.258
## Skew: 0.160 Prob(JB): 0.0721
## Kurtosis: 2.273 Cond. No. 1.63
## ==============================================================================
##
## Warnings:
## [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
from statsmodels.formula.api import ols
import pandas as pd
mieszkania = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/mieszkania.csv")
contrast = [[2,0], [-1,1],[-1,-1]]
model = ols("cena ~ C(dzielnica, contrast)", data=mieszkania).fit()
print(model.summary())
## OLS Regression Results
## ==============================================================================
## Dep. Variable: cena R-squared: 0.049
## Model: OLS Adj. R-squared: 0.039
## Method: Least Squares F-statistic: 5.046
## Date: Sat, 28 Jan 2017 Prob (F-statistic): 0.00729
## Time: 15:08:47 Log-Likelihood: -2412.4
## No. Observations: 200 AIC: 4831.
## Df Residuals: 197 BIC: 4841.
## Df Model: 2
## Covariance Type: nonrobust
## ===================================================================================================
## coef std err t P>|t| [0.025 0.975]
## ---------------------------------------------------------------------------------------------------
## Intercept 1.763e+05 3015.732 58.450 0.000 1.7e+05 1.82e+05
## C(dzielnica, contrast)[custom0] 6611.9266 2135.391 3.096 0.002 2400.767 1.08e+04
## C(dzielnica, contrast)[custom1] -1485.2390 3688.392 -0.403 0.688 -8759.040 5788.562
## ==============================================================================
## Omnibus: 10.736 Durbin-Watson: 2.157
## Prob(Omnibus): 0.005 Jarque-Bera (JB): 5.258
## Skew: 0.160 Prob(JB): 0.0721
## Kurtosis: 2.273 Cond. No. 1.77
## ==============================================================================
##
## Warnings:
## [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
mieszkania = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/mieszkania.csv")
mieszkania = mieszkania.rename(columns={"typ.budynku": "typ_budynku"})
a3 = ols('cena ~ dzielnica + typ_budynku',
data=mieszkania).fit()
aov_table = sm.stats.anova_lm(a3, typ=2)
print(aov_table)
print('\n')
print('dzielnica: p-value = %.6f, typ_budynku: p-value = %.6f' % \
(aov_table['PR(>F)'][0],aov_table['PR(>F)'][1]))
## sum_sq df F PR(>F)
## dzielnica 1.794386e+10 2.0 5.324443 0.005605
## typ_budynku 2.271893e+10 2.0 6.741338 0.001476
## Residual 3.285840e+11 195.0 NaN NaN
##
##
## dzielnica: p-value = 0.005605, typ_budynku: p-value = 0.001476
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.graphics.factorplots import interaction_plot
mieszkania = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/mieszkania.csv")
mieszkania = mieszkania.rename(columns={"typ.budynku": "typ_budynku"})
fig = plt.figure(figsize=(8,6))
ax1 = fig.add_subplot(1,1,1)
interaction_plot(mieszkania['dzielnica'], mieszkania['typ_budynku'] ,mieszkania['cena'],\
colors=['red','blue','green'], ax=ax1)
plt.savefig('rys_4.6.png')
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
mieszkania = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/mieszkania.csv")
mieszkania = mieszkania.rename(columns={"typ.budynku": "typ_budynku"})
a4 = ols('cena ~ dzielnica * typ_budynku', data=mieszkania).fit()
a4 = ols('cena ~ (dzielnica + typ_budynku)**2', data=mieszkania).fit()
a4 = ols('cena ~ dzielnica + typ_budynku + dzielnica:typ_budynku', data=mieszkania).fit()
aov_table = sm.stats.anova_lm(a4, typ=2)
print(aov_table)
## sum_sq df F PR(>F)
## dzielnica 1.794386e+10 2.0 5.231069 0.006141
## typ_budynku 2.271893e+10 2.0 6.623115 0.001656
## dzielnica:typ_budynku 9.952797e+08 4.0 0.145074 0.964995
## Residual 3.275887e+11 191.0 NaN NaN
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
mieszkania = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/mieszkania.csv")
mieszkania = mieszkania.rename(columns={"typ.budynku": "typ_budynku"})
modelPP = ols('cena ~ powierzchnia + pokoi', data=mieszkania).fit()
print(modelPP.params)
print('\n')
print('R2: %.6f, R2adj: %.6f' % (modelPP.rsquared ,modelPP.rsquared_adj))
print('\n')
print(modelPP.summary2())
## Intercept 82407.088281
## powierzchnia 2070.896564
## pokoi -840.100768
## dtype: float64
##
##
## R2: 0.893723, R2adj: 0.892644
##
##
## Results: Ordinary least squares
## ======================================================================
## Model: OLS Adj. R-squared: 0.893
## Dependent Variable: cena AIC: 4392.5453
## Date: 2017-01-28 15:08 BIC: 4402.4403
## No. Observations: 200 Log-Likelihood: -2193.3
## Df Model: 2 F-statistic: 828.3
## Df Residuals: 197 Prob (F-statistic): 1.27e-96
## R-squared: 0.894 Scale: 1.9923e+08
## ----------------------------------------------------------------------
## Coef. Std.Err. t P>|t| [0.025 0.975]
## ----------------------------------------------------------------------
## Intercept 82407.0883 2569.9057 32.0662 0.0000 77339.0312 87475.1454
## powierzchnia 2070.8966 149.1705 13.8828 0.0000 1776.7206 2365.0725
## pokoi -840.1008 2765.1018 -0.3038 0.7616 -6293.1001 4612.8986
## ----------------------------------------------------------------------
## Omnibus: 0.368 Durbin-Watson: 1.764
## Prob(Omnibus): 0.832 Jarque-Bera (JB): 0.425
## Skew: 0.101 Prob(JB): 0.809
## Kurtosis: 2.898 Condition No.: 150
## ======================================================================
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
mieszkania = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/mieszkania.csv")
mieszkania = mieszkania.rename(columns={"typ.budynku": "typ_budynku"})
modelPP = ols('cena ~ powierzchnia + pokoi + dzielnica', data=mieszkania).fit()
print(modelPP.summary2())
## Results: Ordinary least squares
## ======================================================================================
## Model: OLS Adj. R-squared: 0.935
## Dependent Variable: cena AIC: 4294.6332
## Date: 2017-01-28 15:08 BIC: 4311.1248
## No. Observations: 200 Log-Likelihood: -2142.3
## Df Model: 4 F-statistic: 714.8
## Df Residuals: 195 Prob (F-statistic): 2.93e-115
## R-squared: 0.936 Scale: 1.2092e+08
## --------------------------------------------------------------------------------------
## Coef. Std.Err. t P>|t| [0.025 0.975]
## --------------------------------------------------------------------------------------
## Intercept 94222.0155 2320.3617 40.6066 0.0000 89645.7889 98798.2422
## dzielnica[T.Krzyki] -20934.8600 1842.7863 -11.3604 0.0000 -24569.2107 -17300.5094
## dzielnica[T.Srodmiescie] -12722.6045 2008.0264 -6.3359 0.0000 -16682.8422 -8762.3667
## powierzchnia 2022.9886 116.3125 17.3927 0.0000 1793.5967 2252.3806
## pokoi 34.3596 2157.5248 0.0159 0.9873 -4220.7195 4289.4386
## --------------------------------------------------------------------------------------
## Omnibus: 1.237 Durbin-Watson: 2.067
## Prob(Omnibus): 0.539 Jarque-Bera (JB): 0.955
## Skew: 0.156 Prob(JB): 0.620
## Kurtosis: 3.131 Condition No.: 188
## ======================================================================================
The fitted values, raw residuals, and standardized residuals of the model are available as the attributes modelPP.fittedvalues, modelPP.resid, and modelPP.resid_pearson.
from scipy import *
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.formula.api import ols
from statsmodels.graphics.regressionplots import *
mieszkania = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/mieszkania.csv")
mieszkania = mieszkania.rename(columns={"typ.budynku": "typ_budynku"})
modelPP = ols('cena ~ powierzchnia + pokoi + dzielnica', data=mieszkania).fit()
fig = plt.figure(figsize=(14,12))
ax1 = fig.add_subplot(2,2,1)
ax2 = fig.add_subplot(2,2,2)
ax3 = fig.add_subplot(2,2,3)
ax4 = fig.add_subplot(2,2,4)
plot_leverage_resid2(modelPP, ax=ax1)
influence_plot(modelPP, size=12, ax=ax2)
ax3.plot(arange(1,len(modelPP.resid)+1),modelPP.resid)
ax3.set_title('Reszty modelu')
ax3.set_xlabel('numer')
ax3.set_ylabel('reszty')
ax4.plot(modelPP.fittedvalues,modelPP.resid,'o')
ax4.set_title('Reszty vs wartości wyrównane modelu')
ax4.set_xlabel('wyrównane')
ax4.set_ylabel('reszty')
fig.tight_layout()
fig.savefig('rys_4.7.png')
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
mieszkania = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/mieszkania.csv")
mieszkania = mieszkania.rename(columns={"typ.budynku": "typ_budynku"})
modelDP = ols('cena ~ powierzchnia + dzielnica', data=mieszkania).fit()
ndane = pd.DataFrame()
ndane['powierzchnia'] = [48,68]
ndane['dzielnica'] = ['Biskupin','Krzyki']
print(modelDP.predict(ndane))
## 0 191415.933843
## 1 210976.884413
## dtype: float64
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor
mieszkania = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/mieszkania.csv")
rlt = ols('cena ~ powierzchnia + pokoi + cena', data=mieszkania).fit()  # the fit itself is irrelevant; only its design matrix (exog) is used
VIF = pd.DataFrame([variance_inflation_factor(rlt.model.exog, i)
                    for i in range(1, rlt.model.exog.shape[1])],
                   index=rlt.model.exog_names[1:], columns=['VIF'])
print(VIF)
## VIF
## powierzchnia 17.727848
## pokoi 8.965221
## cena 9.409346
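Each VIF equals 1/(1-R2) from regressing the given column on the remaining ones; a hedged cross-check for powierzchnia (the auxiliary model name is ours), which should reproduce the 17.73 in the table above:
pomocniczy = ols('powierzchnia ~ pokoi + cena', data=mieszkania).fit()
print(1 / (1 - pomocniczy.rsquared))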
from scipy import *
import pandas as pd
import statsmodels.api as sm
df = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/daneO.csv")
df['Niepowodzenia'] = df.Niepowodzenia.replace(dict(brak=0, wznowa=1))
df['logVEGF'] = log(df['VEGF'])
df['const'] = 1
y_name = 'Niepowodzenia'
x_name = ['const', 'Nowotwor', 'logVEGF']
modelNV = sm.GLM(df[y_name], df[x_name],missing='drop',family=sm.families.Binomial()).fit()
print(modelNV.summary())
## Generalized Linear Model Regression Results
## ==============================================================================
## Dep. Variable: Niepowodzenia No. Observations: 86
## Model: GLM Df Residuals: 83
## Model Family: Binomial Df Model: 2
## Link Function: logit Scale: 1.0
## Method: IRLS Log-Likelihood: -23.576
## Date: Sat, 28 Jan 2017 Deviance: 47.152
## Time: 15:08:53 Pearson chi2: 65.9
## No. Iterations: 6
## ==============================================================================
## coef std err z P>|z| [0.025 0.975]
## ------------------------------------------------------------------------------
## const -17.3914 4.376 -3.975 0.000 -25.968 -8.815
## Nowotwor 2.2586 0.766 2.950 0.003 0.758 3.759
## logVEGF 1.3293 0.425 3.128 0.002 0.496 2.162
## ==============================================================================
from scipy import *
import pandas as pd
import statsmodels.api as sm
df = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/daneO.csv")
df['Niepowodzenia'] = df.Niepowodzenia.replace(dict(brak=0, wznowa=1))
df['logVEGF'] = log(df['VEGF'])
df['const'] = 1
y_name = 'Niepowodzenia'
x_name = ['const', 'Nowotwor']
modelN = sm.GLM(df[y_name], df[x_name],missing='drop',family=sm.families.Binomial()).fit()
daneO = pd.DataFrame()
daneO['const'] = [1,1,1]
daneO['Nowotwor'] = [1,2,3]
print(modelN.predict(daneO))
## 0 0.011259
## 1 0.072498
## 2 0.349185
## dtype: float64
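predict applies the inverse logit link to the linear predictor; the same probabilities can be recovered by hand from the fitted coefficients (a sketch, assuming modelN from above):
# p = 1/(1 + exp(-(b0 + b1*x))), the inverse of the logit link
b0, b1 = modelN.params
for x in [1, 2, 3]:
    print(1 / (1 + exp(-(b0 + b1*x))))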
from scipy import *
from scipy.optimize import *
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/tab01.csv")
d = df.dropna()
def f(x, a, b):
    return x**a - b
sol, pcov = curve_fit(f, d['x'], d['y'], p0=(2, 2))
print ("a =", sol[0], "+/-", pcov[0,0]**0.5, \
" " , round((pcov[0,0]**0.5/sol[0])*100,3),"%")
print ("b =", sol[1], "+/-", pcov[1,1]**0.5, \
" " , round((pcov[1,1]**0.5/sol[1])*100,3),"%" '\n')
dof = len(d['x']) - len(sol)  # degrees of freedom: number of points minus number of fitted parameters
chi_squared = sum(((f(d['x'], *sol) - d['y']) / 1)**2)  # unit measurement errors assumed
reduced_chi_squared = chi_squared / dof
print ('degrees of freedom: ', dof)
print ('chi-square value: ', ("%.3f" % chi_squared))
print ('reduced chi-square value: ', ("%.3f" % reduced_chi_squared))
fig = plt.figure(figsize=(8,6))
ax1 = fig.add_subplot(1,1,1)
ax1.plot(d['x'],d['y'],'o')
X = arange(d['x'].min(),d['x'].max(),0.1)
Y = f(X,*sol)
ax1.plot(X, Y, linewidth=2, color="orange")
ax1.plot(X, X**2.5-5, '--', linewidth=2, color="C2")
ax1.set_title('$y = x^{2.5}-5+\\varepsilon$')
ax1.set_xlabel('x')
ax1.set_ylabel('y')
fig.tight_layout()
fig.savefig('rys_4.8.png')
## a = 2.51425786701 +/- 0.0234084769392 0.931 %
## b = 4.69439487063 +/- 0.612348759614 13.044 %
##
## degrees of freedom: 40
## chi-square value: 414.627
## reduced chi-square value: 10.366
from scipy import *
from scipy.stats import *
from skgof import cvm_test
x = norm(10, 1.4).rvs(random_state=1, size=100)
print(cvm_test(x, norm(10, 1.4)))
print('\nWyciągamy p-wartość: pwynik = %.6f' % cvm_test(x, norm(10, 1.4)).pvalue)
## GofResult(statistic=0.10543122652629035, pvalue=0.55976362337633079)
##
## Wyciągamy p-wartość: pwynik = 0.559764
from scipy import *
import scipy.stats as stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
y = random.normal(size= 100)
fig = plt.figure(figsize=(14,6))
ax1 = fig.add_subplot(1,2,1)
ax2 = fig.add_subplot(1,2,2)
sm.qqplot(y,line='q',ax=ax1)
ax1.set_title('Normal Q-Q Plot')
sm.qqplot(y,line='q',dist=stats.expon,ax=ax2)
ax2.set_title('Exponential Q-Q Plot')
fig.tight_layout()
fig.savefig('rys_4.9.png')
from scipy import *
import pandas as pd
import scipy.stats as stats
random.seed(4101)
x = random.uniform(size = 100)
df = pd.DataFrame()
df['x'] = x
df['cutx'] = pd.cut(df['x'],5,precision=0)
df = df.groupby(['cutx'],as_index=False)['x'].agg({'zm' : 'count'})
print(df,'\n')
chi2, p = stats.chisquare(df['zm'])
print('X-squared = %.6f, df = %.0f, p-value = %.6f' % (chi2,df.shape[0]-1,p))
## cutx zm
## 0 (0.002, 0.2] 22
## 1 (0.2, 0.4] 20
## 2 (0.4, 0.6] 13
## 3 (0.6, 0.8] 28
## 4 (0.8, 1] 17
##
## X-squared = 6.300000, df = 4, p-value = 0.177836
from scipy import *
import pandas as pd
import scipy.stats as stats
random.seed(4026)
x = random.beta(a=2, b=3, size = 100)
df = pd.DataFrame()
df['x'] = x
df['cutx'] = pd.cut(df['x'],5,precision=0)
df = df.groupby(['cutx'],as_index=False)['x'].agg({'zm' : 'count'})
print(df,'\n')
chi2, p = stats.chisquare(df['zm'])
print('X-squared = %.6f, df = %.0f, p-value = %.6f' % (chi2,df.shape[0]-1,p))
## cutx zm
## 0 (0.03, 0.2] 24
## 1 (0.2, 0.4] 31
## 2 (0.4, 0.5] 16
## 3 (0.5, 0.7] 20
## 4 (0.7, 0.9] 9
##
## X-squared = 13.700000, df = 4, p-value = 0.008317
from scipy import *
import scipy.stats as stats
random.seed(2305)
x = random.uniform(size = 100)
D, p = stats.kstest(x,'uniform')
print('D = %.6f, p-value = %.6f' % (D,p))
## D = 0.063369, p-value = 0.816786
from scipy import *
import scipy.stats as stats
random.seed(2305)
x = random.uniform(size = 100)
D, p = stats.kstest(x,'norm', args=(1,1))
print('D = %.6f, p-value = %.6f' % (D,p))
## D = 0.501903, p-value = 0.000000
from scipy import *
import scipy.stats as stats
random.seed(2305)
x = random.uniform(size = 100)
y = random.normal(size = 100)
print(stats.ks_2samp(x,y))
## Ks_2sampResult(statistic=0.47999999999999998, pvalue=8.0827828929192009e-11)
from scipy import *
import matplotlib.pyplot as plt
x = random.exponential(size=100)
y = random.normal(size=100)
fig = plt.figure(figsize=(8,6))
ax1 = fig.add_subplot(1, 1, 1)
ax1.plot(sort(x), sort(y), 'o')
plt.tight_layout()
plt.savefig('rys_4.10.png')
from scipy import *
import scipy.stats as stats
random.seed(2305)
x = random.normal(size=50)
y = x + 0.3 + random.normal(loc=0,scale=0.01,size=50)
print(stats.ttest_1samp(y, 0.0))
print('mean = %.6f' % mean(y))
## Ttest_1sampResult(statistic=2.1054568276626191, pvalue=0.040400430666069305)
## mean = 0.301538
from scipy import *
import scipy.stats as stats
random.seed(2305)
x = random.normal(size=50)
y = x + 0.3 + random.normal(loc=0,scale=0.01,size=50)
print(stats.ttest_rel(x,y))
## Ttest_relResult(statistic=-226.90837947553518, pvalue=1.0419125460882572e-75)
from scipy import *
import scipy.stats as stats
random.seed(2305)
x = random.normal(size=50)
y = x + 0.3 + random.normal(loc=0,scale=0.01,size=50)
print(stats.wilcoxon(y))
print(stats.ranksums(x,y))
print(stats.mannwhitneyu(x,y))
## WilcoxonResult(statistic=432.0, pvalue=0.047283854040010578)
## RanksumsResult(statistic=-1.5648971117287647, pvalue=0.11760703603911446)
## MannwhitneyuResult(statistic=1023.0, pvalue=0.05920878241374343)
from scipy import *
import scipy.stats as stats
random.seed(2305)
x = random.normal(size=20,loc=2,scale=1)
y = random.normal(size=20,loc=2,scale=2)
z = random.normal(size=20,loc=2,scale=3)
F = var(x,ddof=1) / var(y,ddof=1)
df1 = len(x) - 1; df2 = len(y) - 1
pF = stats.f.cdf(F, df1, df2)
print('F = %.6f, df1 = %.0f, df2 = %.0f, p-value = %.6f' % (F,df1,df2,pF))
print('Z = %.6f, p-value = %.6f' % (stats.mood(x,y)[0],stats.mood(x,y)[1]))
print(stats.ansari(x,y))
# each group is passed as a separate sample (not as a vector of values plus group labels)
print(stats.bartlett(x, y, z))
print(stats.fligner(x, y, z))
print(stats.levene(x, y, z))
## F = 0.082671, df1 = 19, df2 = 19, p-value = 0.000001
## Z = -3.288482, p-value = 0.001007
## AnsariResult(statistic=267.0, pvalue=0.0016590662842894473)
from statsmodels.stats.proportion import *
wtestu = proportions_chisquare(594, 1234, value=0.5)
print('Test chi-square (without continuity correction) = %.6f, p-value = %.6f' % (wtestu[0],wtestu[1]))
## Test chi-square (without continuity correction) = 1.714749, p-value = 0.190370
from statsmodels.stats.proportion import *
print(proportion_confint(594,1234,method='beta'))
## (0.4531511917599369, 0.50966066183703784)
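The 'beta' method gives the exact Clopper-Pearson interval; for comparison, the default Wald (normal-approximation) interval should be close at this sample size:
print(proportion_confint(594, 1234, method='normal'))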
from scipy import *
x = [11,120,1300,14000,150000]
n = [100,1000,10000,100000,1000000]
print([ x[i]/n[i] for i in range(5) ])
## [0.11, 0.12, 0.13, 0.14, 0.15]
from scipy import *
from statsmodels.stats.proportion import *
x = [11,120,1300,14000,150000]
n = [100,1000,10000,100000,1000000]
print(proportions_chisquare_allpairs(array(x),array(n),multitest_method='b'))
print('\np-value correction without continuity correction')
## Corrected p-values using Bonferroni p-value correction
##
## Pairs p-values
## (0, 1) 1
## (0, 2) 1
## (0, 3) 1
## (0, 4) 1
## (1, 2) 1
## (1, 3) 0.6956
## (1, 4) 0.07914
## (2, 3) 0.05863
## (2, 4) 2.456e-07
## (3, 4) 2.557e-16
##
## p-value correction without continuity correction
from scipy import *
import scipy.stats as stats
random.seed(227)
x = random.lognormal(size=50)
y = add(x, random.lognormal(size=50))
sol, pval = stats.pearsonr(x,y)
res, pwal = stats.spearmanr(x,y)
print('Correlation Pearson = %.6f, p-value = %.6f' % (sol,pval))
print('Correlation Spearman = %.6f, p-value = %.6f' % (res,pwal))
## Correlation Pearson = 0.560403, p-value = 0.000023
## Correlation Spearman = 0.587899, p-value = 0.000007
from scipy import *
import scipy.stats as stats
def get_z(rho):
    return log((1+rho)/(1-rho))/2
def rho0_test(rho, n, rho0=0, alternative="two.sided"):
    p_raw = stats.norm.cdf((get_z(rho)-get_z(rho0))/sqrt(1/(n-3)))
    if alternative=="two.sided": return 1-2*abs(0.5-p_raw)
    elif alternative=="less": return p_raw
    elif alternative=="greater": return 1-p_raw
print(rho0_test(rho=0.23,n=60))
## 0.0770455747032
from scipy import *
import scipy.stats as stats
def get_z(rho):
    return log((1+rho)/(1-rho))/2
def rho_rho_test(rho1, n1, rho2, n2, alternative="two.sided"):
    p_raw = stats.norm.cdf((get_z(rho1)-get_z(rho2))/sqrt(1/(n1-3)+1/(n2-3)))
    if alternative=="two.sided": return 1-2*abs(0.5-p_raw)
    elif alternative=="less": return p_raw
    elif alternative=="greater": return 1-p_raw
print(rho_rho_test(rho1=0.23,n1=60,rho2=0.33,n2=60))
## 0.5619332178
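Both helpers rely on Fisher's z-transform, z = log((1+rho)/(1-rho))/2, which is simply the inverse hyperbolic tangent; a one-line sanity check:
from numpy import allclose, arctanh
print(allclose(get_z(0.23), arctanh(0.23)))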
import pandas as pd
import scipy.stats as stats
from statsmodels.sandbox.stats.runs import *
df = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/daneSoc.csv")
plec = df['plec'].values
praca = df['praca'].values
print(pd.crosstab(plec,praca))
s, p, dof, expctd = stats.chi2_contingency(pd.crosstab(plec,praca))
print('\nX-squared = %.6f, df = %.0f, p-value = %.6f\n' % (s, dof, p))
print(expctd)
oddsratio, pvalue = stats.fisher_exact(pd.crosstab(plec,praca))
print('\nodds ratio = %.6f, p-value = %.6f' % (oddsratio,pvalue))
## col_0 nie pracuje uczen lub pracuje
## row_0
## kobieta 7 48
## mezczyzna 45 104
##
## X-squared = 5.571056, df = 1, p-value = 0.018260
##
## [[ 14.01960784 40.98039216]
## [ 37.98039216 111.01960784]]
##
## odds ratio = 0.337037, p-value = 0.011203
import pandas as pd
from statsmodels.sandbox.stats.runs import *
df = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/daneSoc.csv")
plec = df['plec'].values
praca = df['praca'].values
sol, pval = mcnemar(pd.crosstab(plec,praca), exact=False)
print("McNemar's Chi-squared test\n\nX-squared = %.6f, p-value = %.6f" % (sol,pval))
## McNemar's Chi-squared test
##
## X-squared = 0.043011, p-value = 0.835705
In this subsection we show how to run R code from within Python using the pyper package.
import pandas as pd
import pyper as pr
tab = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/tabela.csv")
r = pr.R(use_pandas = True)
r.assign("rdata", tab)
r("tab <- xtabs(~wzrost+charakter+plec,data=rdata)")
print(r("tab"))
r("laczna <- apply(tab,c(1,2),sum)")
print(r("laczna"))
print(r("chisq.test(laczna)"))
print(r("mantelhaen.test(xtabs(~wzrost+charakter+plec,data=rdata))"))
## try({tab})
## , , plec = kobieta
##
## charakter
## wzrost agresywny lagodny
## niski 10 100
## wysoki 1 10
##
## , , plec = mezczyzna
##
## charakter
## wzrost agresywny lagodny
## niski 10 1
## wysoki 100 10
##
##
## try({laczna})
## charakter
## wzrost agresywny lagodny
## niski 20 101
## wysoki 101 20
##
## try({chisq.test(laczna)})
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: laczna
## X-squared = 105.79, df = 1, p-value < 2.2e-16
##
##
## try({mantelhaen.test(xtabs(~wzrost+charakter+plec,data=rdata))})
##
## Mantel-Haenszel chi-squared test without continuity correction
##
## data: xtabs(~wzrost + charakter + plec, data = rdata)
## Mantel-Haenszel X-squared = 0, df = 1, p-value = 1
## alternative hypothesis: true common odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.2177312 4.5928200
## sample estimates:
## common odds ratio
## 1
from scipy import *
import pandas as pd
from statsmodels.stats.inter_rater import *
random.seed(227)
ocena1 = random.choice([2,3,4,5], size=100)
ocena2 = random.choice([2,3,4,5], size=100)
print(pd.crosstab(ocena1,ocena2, rownames=['ocena1'],colnames=['ocena2']),'\n')
print(cohens_kappa(pd.crosstab(ocena1,ocena2)))
## ocena2 2 3 4 5
## ocena1
## 2 5 7 11 5
## 3 4 7 5 9
## 4 7 9 1 7
## 5 6 5 6 6
##
## Simple Kappa Coefficient
## --------------------------------
## Kappa -0.0784
## ASE 0.0516
## 95% Lower Conf Limit -0.1796
## 95% Upper Conf Limit 0.0227
##
## Test of H0: Simple Kappa = 0
##
## ASE under H0 0.0574
## Z -1.3659
## One-sided Pr > Z 0.9140
## Two-sided Pr > |Z| 0.1720
from scipy import *
import pandas as pd
from statsmodels.stats.inter_rater import *
df = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/oceniajacy.csv")
print(pd.crosstab(df['oceniajacy1'],df['oceniajacy2']),'\n')
print('Czynniki traktujemy jako zmienne nominalne:\n')
print(cohens_kappa(pd.crosstab(df['oceniajacy1'],df['oceniajacy2'])),'\n')
print('Czynniki traktujemy jako zmienne porządkowe:\n')
print(cohens_kappa(pd.crosstab(df['oceniajacy1'],df['oceniajacy2']),wt='quadratic'))
## oceniajacy2 aSlabo bSrednio cDobrze dBardzo Dobrze
## oceniajacy1
## aSlabo 3 2 0 0
## bSrednio 0 1 0 0
## cDobrze 0 3 0 3
## dBardzo Dobrze 0 0 5 3
##
## Czynniki traktujemy jako zmienne nominalne:
##
## Simple Kappa Coefficient
## --------------------------------
## Kappa 0.1362
## ASE 0.1436
## 95% Lower Conf Limit -0.1452
## 95% Upper Conf Limit 0.4176
##
## Test of H0: Simple Kappa = 0
##
## ASE under H0 0.1201
## Z 1.1345
## One-sided Pr > Z 0.1283
## Two-sided Pr > |Z| 0.2566
##
##
## Czynniki traktujemy jako zmienne porządkowe:
##
## Weighted Kappa Coefficient
## --------------------------------
## Kappa 0.7461
## ASE 0.0738
## 95% Lower Conf Limit 0.6014
## 95% Upper Conf Limit 0.8908
##
## Test of H0: Weighted Kappa = 0
##
## ASE under H0 0.2199
## Z 3.3929
## One-sided Pr > Z 0.0003
## Two-sided Pr > |Z| 0.0007
from scipy import *
import scipy.stats as stats
import pandas as pd
from statsmodels.sandbox.stats.multicomp import *
rawp = [ stats.ttest_ind(random.normal(size=20),random.normal(size=20,loc=1))[1] \
for i in range(6) ]
_,Bonferroni,_,_ = multipletests(array(rawp),method='bonferroni')
_,BH,_,_ = multipletests(array(rawp),method='fdr_bh')
t = pd.DataFrame()
t['rawp'] = rawp
t['Bonferroni'] = Bonferroni
t['BH'] = BH
print(t)
## rawp Bonferroni BH
## 0 0.000344 0.002063 0.002063
## 1 0.124588 0.747527 0.124588
## 2 0.000719 0.004316 0.002158
## 3 0.099282 0.595694 0.119139
## 4 0.002370 0.014219 0.003555
## 5 0.002131 0.012784 0.003555
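The Bonferroni column is easy to verify by hand: each raw p-value is multiplied by the number of tests and clipped at 1 (a sketch reusing the arrays above):
from numpy import allclose, minimum
print(allclose(minimum(array(rawp)*len(rawp), 1), Bonferroni))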
from scipy import *
import scipy.stats as stats
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.distributions.empirical_distribution import *
N = 10**3
def getStat(size=30, p=0.2):
    x = random.binomial(1, p, size)
    y = random.binomial(1, p, size)
    tab = pd.crosstab(x,y)
    return stats.chi2_contingency(tab)[0]
sol = [ getStat() for i in range(N) ]
plt.figure(figsize=(8,6))
plt.plot(sol,ECDF(sol)(sol),'.',linewidth=2, color='C1')
plt.plot(sort(sol), stats.chi2.cdf(sort(sol),df=1), color='C0')
plt.axhline(y=0.95,color='k',linestyle='--',linewidth=1)
plt.axvline(x=percentile(sol, q=95),ymax=0.91,color='k',linestyle='--',linewidth=1)
plt.text(percentile(sol, q=95)-0.4, 0.6,'empiryczny kwantyl 0.95 = %.2f' % percentile(sol, q=95), rotation=90)
plt.axvline(x=stats.chi2.ppf(0.95,df=1),ymax=0.91,color='k',linestyle='--',linewidth=1)
plt.text(stats.chi2.ppf(0.95,df=1)-0.4, 0.6,'asymptotyczny kwantyl 0.95 = %.2f' % stats.chi2.ppf(0.95,df=1), rotation=90)
plt.xlabel('kwantyle')
plt.ylabel('p-value')
plt.title('Rozkład empiryczny i testowy statystyki testowej')
plt.tight_layout()
plt.savefig('rys_4.11.png')
import scipy.stats as stats
from scipy import *
import pandas as pd
kidney = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/kidney.csv")
x = kidney[kidney['recipient.age'] < 40]['MDRD30']
y = kidney[kidney['recipient.age'] >= 40]['MDRD30']
n = len(x); m = len(y); B = 999
theta = stats.ks_2samp(x,y).statistic
print('Statystyka testu K-S: theta = %.2f' % theta,'\n')
def thetastar(x,y):
    # resample n+m values with replacement from the pooled sample
    zstar = random.choice(concatenate((x,y)),n+m)
    xstar = zstar[:n]
    ystar = zstar[n:n+m]
    return stats.ks_2samp(xstar,ystar).statistic
res = array([ thetastar(x,y) for i in range(B)])
print('Permutacyjna p-wartość testu K-S:')
print((sum(res >= theta) + 1) / (B + 1))
## Statystyka testu K-S: theta = 0.22
##
## Permutacyjna p-wartość testu K-S:
## 0.001
from scipy import *
import scipy.stats as stats
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
daneO = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/daneO.csv")
n = daneO.shape[0]; B = 9999
wynikBoot = [ mean(random.choice(daneO['VEGF'],n)) for i in range(B)]
bootParametryczny = [ mean(random.normal(mean(daneO['VEGF']),std(daneO['VEGF'],ddof=1),n)) for i in range(B)]
print('95% przedział ufności dla oceny średniej - bootstrap nieparametryczny:')
print(percentile(wynikBoot,q=[5/2, 100-5/2]))
print('\n95% przedział ufności dla oceny średniej - bootstrap parametryczny:')
print(percentile(bootParametryczny,q=[5/2, 100-5/2]))
fig = plt.figure(figsize=(14,6))
ax1 = fig.add_subplot(1,2,1)
ax2 = fig.add_subplot(1,2,2)
X = linspace(min(wynikBoot), max(wynikBoot), 100)
N, bins, patches = ax1.hist(wynikBoot, density=True, facecolor='C2', edgecolor='C2',
                            histtype='stepfilled', alpha=0.25, label='rozkład empiryczny')
ax1.plot(X, stats.norm.pdf(X, loc=mean(wynikBoot), scale=std(wynikBoot,ddof=1)), '--', color='C0',
linewidth=1,label='loc=%.2f, scale=%.2f' % (mean(wynikBoot),std(wynikBoot,ddof=1)))
ax1.set_title("Rozkład empiryczny vs. normalny\nbootstrap nieparametryczny")
ax1.legend()
X = linspace(min(bootParametryczny), max(bootParametryczny), 100)
N, bins, patches = ax2.hist(bootParametryczny, density=True, facecolor='C2', edgecolor='C2',
                            histtype='stepfilled', alpha=0.25, label='rozkład empiryczny')
ax2.plot(X, stats.norm.pdf(X, loc=mean(bootParametryczny), scale=std(bootParametryczny,ddof=1)), '--', color='C0',
linewidth=1,label='loc=%.2f, scale=%.2f' % (mean(bootParametryczny),std(bootParametryczny,ddof=1)))
ax2.set_title("Rozkład empiryczny vs. normalny\nbootstrap parametryczny")
ax2.legend()
fig.tight_layout()
fig.savefig('rys_4.12.png')
## 95% przedział ufności dla oceny średniej - bootstrap nieparametryczny:
## [ 1976.21597938 3393.44123711]
##
## 95% przedział ufności dla oceny średniej - bootstrap parametryczny:
## [ 1921.28524356 3336.2346008 ]
from scipy import *
import pandas as pd
from statsmodels.distributions.empirical_distribution import *
daneO = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/daneO.csv")
def bootstrapowyTestT(wektor,mu,B):
    statystykaT = lambda x:\
        (mean(random.choice(x,len(x)))-mean(x))/std(random.choice(x,len(x)),ddof=1)
    wartosciStatystykiT = [ statystykaT(wektor) for i in range(B)]
    T = (mean(wektor)-mu)/std(wektor,ddof=1)
    kwantyl = ECDF(wartosciStatystykiT)(T)
    return 1-2*abs(kwantyl-0.5)
print(bootstrapowyTestT(daneO['VEGF'],3000,999))
print(bootstrapowyTestT(daneO['VEGF'],4000,999))
## 0.26026026026
## 0.0
from scipy import *
import pandas as pd
import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter
from lifelines.plotting import plot_lifetimes
from lifelines.statistics import logrank_test
daneO = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/daneO.csv")
daneO['Niepowodzenia'] = daneO.Niepowodzenia.replace(dict(brak=0, wznowa=1))
daneO = daneO[isfinite(daneO['Nowotwor'])]
T = daneO['Okres.bez.wznowy']
E = daneO['Niepowodzenia']
daneO.Nowotwor[daneO["Nowotwor"]==1] = 'typ nowotworu 1'
daneO.Nowotwor[daneO["Nowotwor"]==2.0] = 'typ nowotworu 2'
daneO.Nowotwor[daneO["Nowotwor"]==3.0] = 'typ nowotworu 3'
now1 = (daneO["Nowotwor"] == "typ nowotworu 1")
now2 = (daneO["Nowotwor"] == "typ nowotworu 2")
now3 = (daneO["Nowotwor"] == "typ nowotworu 3")
results12 = logrank_test(T[now1], T[~now1], E[now1], E[~now1], alpha=.95 )
results12.print_summary()
results13 = logrank_test(T[now2], T[~now2], E[now2], E[~now2], alpha=.95 )
results13.print_summary()
results23 = logrank_test(T[now3], T[~now3], E[now3], E[~now3], alpha=.95 )
results23.print_summary()
kmf = KaplanMeierFitter()
fig = plt.figure(figsize=(14,6))
ax1 = fig.add_subplot(1,2,1)
ax2 = fig.add_subplot(1,2,2)
kmf.fit(T,E, label="Nowotwór")
kmf.plot(ci_force_lines=True, ax=ax1)
for i in daneO['Nowotwor'].unique():
    ix = daneO['Nowotwor'] == i
    kmf.fit(T.loc[ix], E.loc[ix], label=i)
    ax = kmf.plot(ax=ax2)
fig.tight_layout()
fig.savefig('rys_4.13.png')
## Results
## alpha: 0.95
## t 0: -1
## null distribution: chi squared
## test: logrank
## df: 1
##
## __ p-value ___|__ test statistic __|____ test result ____|__ is significant __
## 0.31958 | 0.991 | Cannot Reject Null | False
## Results
## alpha: 0.95
## t 0: -1
## null distribution: chi squared
## test: logrank
## df: 1
##
## __ p-value ___|__ test statistic __|____ test result ____|__ is significant __
## 0.00775 | 7.091 | Reject Null | True
## Results
## alpha: 0.95
## t 0: -1
## null distribution: chi squared
## test: logrank
## df: 1
##
## __ p-value ___|__ test statistic __|____ test result ____|__ is significant __
## 0.00068 | 11.552 | Reject Null | True
from scipy import *
import pandas as pd
import matplotlib.pyplot as plt
from lifelines import CoxPHFitter
from lifelines.utils import k_fold_cross_validation
daneO = pd.read_csv("/home/krz/Pulpit/Przewodnik_R/daneO.csv")
daneO['Niepowodzenia'] = daneO.Niepowodzenia.replace(dict(brak=0, wznowa=1))
daneO = daneO[isfinite(daneO['Nowotwor'])]
cf = CoxPHFitter(normalize=False)
cf.fit(daneO[['Nowotwor','Wiek','Okres.bez.wznowy','Niepowodzenia']],\
duration_col='Okres.bez.wznowy', event_col='Niepowodzenia',\
show_progress=True,include_likelihood=True)
cf.print_summary()
scores = k_fold_cross_validation(cf, daneO[['Nowotwor','Wiek','Okres.bez.wznowy','Niepowodzenia']], \
duration_col='Okres.bez.wznowy', event_col='Niepowodzenia', k=5)
print('\nCross walidacja modelu:')
print(scores)
print('\n średnia: %.6f' % mean(scores))
print('odch.std.: %.6f' % std(scores))
## Convergence completed after 5 iterations.
## n=86, number of events=13
##
## coef exp(coef) se(coef) z p lower 0.95 upper 0.95
## Nowotwor 1.754e+00 5.776e+00 5.957e-01 2.944e+00 3.238e-03 5.860e-01 2.921e+00 **
## Wiek -5.632e-02 9.452e-01 4.340e-02 -1.297e+00 1.945e-01 -1.414e-01 2.877e-02
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Concordance = 0.747
##
## Cross walidacja modelu:
## [0.64634146341463417, 0.76190476190476186, 0.75, 0.8214285714285714, 0.80303030303030298]
##
## średnia: 0.756541
## odch.std.: 0.060971
x = {1,2,3,4,5,6,7,8,9,10}
y = {5,6,7,8,9,10,11,12,13,14,15}
print((x | y)-(x & y),'\n')
print(x == (x - (y - x)),'\n')
A = {3}
print(A.issubset(x))
## {1, 2, 3, 4, 11, 12, 13, 14, 15}
##
## True
##
## True
from scipy import *
print(1-exp(0.1**15),'\n')   # catastrophic cancellation: the exact value is about -1e-15
print(expm1(0.1**15),'\n')   # expm1(x) = exp(x)-1 evaluated accurately for small x
print(log(1+0.1**20),'\n')   # 1 + 1e-20 rounds to 1.0, so the logarithm collapses to 0
print(log1p(0.1**20))        # log1p(x) = log(1+x) evaluated accurately for small x
## -1.11022302463e-15
##
## 1e-15
##
## 0.0
##
## 1e-20
from scipy import *
p1 = poly1d([1,0,2])
p2 = poly1d([1,1,2,2])
print(p1,'\n')
print(p2,'\n')
print(p1 + p1,'\n')
print(p1 * p2,'\n')
print(polyint(p1)(1)-polyint(p1)(0),'\n')
print(p2.deriv(),'\n')
print(roots(p2),'\n')
## 2
## 1 x + 2
##
## 3 2
## 1 x + 1 x + 2 x + 2
##
## 2
## 2 x + 4
##
## 5 4 3 2
## 1 x + 1 x + 4 x + 4 x + 4 x + 4
##
## 2.33333333333
##
## 2
## 3 x + 2 x + 2
##
## [ -4.85722573e-16+1.41421356j -4.85722573e-16-1.41421356j
## -1.00000000e+00+0.j ]
from scipy import *
from scipy.special import sh_legendre
import matplotlib.pyplot as plt
print(sh_legendre(5),'\n')
print(sh_legendre(4),'\n')
print(sh_legendre(3),'\n')
print(sh_legendre(2),'\n')
print(sh_legendre(1),'\n')
print(sh_legendre(0),'\n')
x = linspace(0, 1, 100)
plt.figure(figsize=(8,6))
for i in range(5): ax = plt.plot(x, sh_legendre(i)(x), lw=2,label="stopnia %d"%i)
plt.legend(loc='lower center',ncol=3)
plt.title('Wielomiany Legendre`a\nbez normalizacji')
plt.tight_layout()
plt.savefig('rys_4.15.png')
## 5 4 3 2
## 252 x - 630 x + 560 x - 210 x + 30 x - 1
##
## 4 3 2
## 70 x - 140 x + 90 x - 20 x + 1
##
## 3 2
## 20 x - 30 x + 12 x - 1
##
## 2
## 6 x - 6 x + 1
##
##
## 2 x - 1
##
##
## 1
from scipy.optimize import *
f = lambda x: (x -7)**2 - x
res = fminbound(f, -1,10)
fun = f(res)
print ('Dla argumentu x:', ("%.7f" % res), 'funkcja osiąga minimum:', ("%.7f" % fun))
## Dla argumentu x: 7.5000000 funkcja osiąga minimum: -7.2500000
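The location of the minimum can be confirmed analytically: f'(x) = 2(x - 7) - 1 vanishes at x = 15/2. A short sympy check (sympy is also used a few examples below):
from sympy import symbols, diff, solve
x = symbols('x')
print(solve(diff((x - 7)**2 - x, x), x))  # expected root: [15/2]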
from scipy.optimize import *
f = lambda x: (x -7)**2 - x
sol = brentq(f,-1,10)
fun = f(sol)
print ('Dla argumentu x:', ("%.7f" % sol), 'funkcja ma wartość zero:', ("%.7f" % fun))
## Dla argumentu x: 4.8074176 funkcja ma wartość zero: 0.0000000
from sympy import *
x = symbols('x')
print(diff(3*(x-2)**2-15, x, 1))
## 6*x - 12
from scipy import *
from scipy.misc import derivative
f = lambda x: (x-2)**2+15
wyraz = lambda x: derivative(f, x, dx=1e-6)
X = arange(1,4,1)
print(wyraz(X))
## [-2. 0. 2.]
import warnings
warnings.filterwarnings("ignore")
from scipy import *
import scipy.stats as stats
from scipy.integrate import *
f = lambda x: stats.norm.pdf(x)
print(quad(f, 0, Inf, limit=2))
g = lambda x: sin(x)**2
print(quad(g, 0, 600, limit=2))
## (0.5000005473812185, 0.03966696247407325)
## (299.16747101861347, 179.0735567361394)
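The large error estimates above come from limit=2, which caps the adaptive algorithm at two subintervals; with the default limit the Gaussian integral should come out as approximately 0.5 with a small error estimate (a sketch):
print(quad(f, 0, Inf))  # expect approximately 0.5 with a tiny error estimate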