This is the web material of the work On zero-inflated permutation testing and some related problems by Livio Finos and Fortunato Pesarin.

Analysis of seeds data

The codes and the data are available here

load data

data(seeds)

The manufacturer’s point of view

seeds0=seeds
seeds0[is.na(seeds)]=1

# the two tests
(res1 = flip(x+y~grp,data=seeds0,perms=10000,tail=1))
##     
##  Call:
##  flip(Y = x + y ~ grp, data = seeds0, tail = 1, perms = 10000) 
## 9999 permutations.
##   Test  Stat tail p-value sig.
## x    t 2.235    >  0.0166    *
## y    t 2.774    >  0.0053   **
plot(res1)

# and combined
npc(res1,comb.funct = "Fisher")
##  Call:
##  npc(permTP = res1, comb.funct = "Fisher") 
##  permutations.
##    comb.funct nVar  Stat p-value sig.
## V1     Fisher    2 9.338  0.0080   **

The user’s point of view

# the two tests
(res2 = flip(x+y~grp,data=seeds[!is.na(seeds$x),],perms=10000,tail=1))
##     
##  Call:
##  flip(Y = x + y ~ grp, data = seeds[!is.na(seeds$x), ], tail = 1,      perms = 10000) 
## 9999 permutations.
##   Test  Stat tail p-value sig.
## x    t 1.320    >  0.0985     
## y    t 2.061    >  0.0240    *
# and combined
npc(res2,comb.funct = "Fisher")
##  Call:
##  npc(permTP = res2, comb.funct = "Fisher") 
##  permutations.
##    comb.funct nVar  Stat p-value sig.
## V1     Fisher    2 6.047  0.0417    *

The point of view of a scientist (I)

# the two tests for the problem X|X>0
res3_Quant = flip(x+y~grp,data=seeds,statTest = "NA",perms=10000,tail=1,flipReturn = list(permSpace=TRUE,permT=TRUE))

plot(res3_Quant)

# the test for the problem X>0
seeds$Zeros=is.na(seeds$x)*1
res3_zero = flip(Zeros~grp,data=seeds,perms=res3_Quant@permSpace,tail=-1)
##     
res3=cFlip(res3_zero,res3_Quant)
res3
##  Call:
##  flip(Y = Zeros ~ grp, data = seeds, tail = -1, perms = res3_Quant@permSpace) 
## 9999 permutations.
##         Test tail NValidPerms    Stat p-value sig.
## Zeros      t    <          NA  -1.798  0.0771     
## x     uni-NA    >       10000  60.714  0.0953     
## y     uni-NA    >       10000 248.374  0.0269    *
# and combined
npc(res3,comb.funct = "Fisher")
##  Call:
##  npc(permTP = res3, comb.funct = "Fisher") 
##  permutations.
##    comb.funct nVar  Stat p-value sig.
## V1     Fisher    3 8.529  0.0175    *

The point of view of a scientist (II)

# the 2+2 tests for the problem X|X>0
res4_plus = res3

(res4_Quant_minus = flip(x+y~grp,data=seeds,statTest = "NA",perms=res3_Quant@permSpace,tail=-1))
##  Call:
##  flip(Y = x + y ~ grp, data = seeds, tail = -1, perms = res3_Quant@permSpace,      statTest = "NA") 
## 9999 permutations.
##     Test NValidPerms   Stat tail p-value sig.
## x uni-NA       10000  60.71    <  0.8987     
## y uni-NA       10000 248.37    <  0.9740
# the test for the problem X>0
(res4_zero_minus = flip(Zeros~grp,data=seeds,perms=res3@permSpace,tail=+1))
##     
##  Call:
##  flip(Y = Zeros ~ grp, data = seeds, tail = +1, perms = res3@permSpace) 
## 9999 permutations.
##       Test   Stat tail p-value sig.
## Zeros    t -1.798    >  0.9849
res4=cFlip(res4_plus,res4_zero_minus,res4_Quant_minus)
res4
##  Call:
##  flip(Y = Zeros ~ grp, data = seeds, tail = -1, perms = res3_Quant@permSpace) 
## 9999 permutations.
##         Test tail NValidPerms    Stat p-value sig.
## Zeros      t    <          NA  -1.798  0.0771     
## x     uni-NA    >       10000  60.714  0.0953     
## y     uni-NA    >       10000 248.374  0.0269    *
## 4          t    >          NA  -1.798  0.9849     
## 5     uni-NA    <       10000  60.714  0.8987     
## 6     uni-NA    <       10000 248.374  0.9740
# and combined
npc(res4,comb.funct = "Fisher",subsets = list(Zeros=c(1,4),x=c(2,5),y=c(3,6),overall=1:6))
## 1 / 42 / 43 / 44 / 4
##  Call:
##  npc(permTP = res4, comb.funct = "Fisher", subsets = list(Zeros = c(1,      4), x = c(2, 5), y = c(3, 6), overall = 1:6)) 
##  permutations.
##         comb.funct nVar   Stat p-value sig.
## Zeros       Fisher    2  5.125  0.0771     
## x           Fisher    2  4.701  0.0953     
## y           Fisher    2  7.231  0.0269    *
## overall     Fisher    6 17.058  0.0175    *

Analysis of Medical Care data

The codes and the data are available here

Load data

The data comes from: https://www.jstatsoft.org/index.php/jss/article/downloadSuppFile/v027i08/DebTrivedi.rda.zip

A Rdata file is saved in this folder for your convenience.

Now, load the data and define the dataset to be used, for our convenience.

load("DebTrivedi.rda")

dati=DebTrivedi[,c("ofp","hosp","health","numchron","gender","school","privins")]
summary(dati)
##       ofp              hosp             health        numchron    
##  Min.   : 0.000   Min.   :0.000   poor     : 554   Min.   :0.000  
##  1st Qu.: 1.000   1st Qu.:0.000   average  :3509   1st Qu.:1.000  
##  Median : 4.000   Median :0.000   excellent: 343   Median :1.000  
##  Mean   : 5.774   Mean   :0.296                    Mean   :1.542  
##  3rd Qu.: 8.000   3rd Qu.:0.000                    3rd Qu.:2.000  
##  Max.   :89.000   Max.   :8.000                    Max.   :8.000  
##     gender         school      privins   
##  female:2628   Min.   : 0.00   no : 985  
##  male  :1778   1st Qu.: 8.00   yes:3421  
##                Median :11.00             
##                Mean   :10.29             
##                3rd Qu.:12.00             
##                Max.   :18.00

Fit parametric Hurdle model

formula <- ofp ~ hosp + health + numchron + gender + school + privins
modelHurdle <- pscl::hurdle(formula = formula,
                            dist    = "negbin",
                            data    = DebTrivedi)
(out=summary(modelHurdle))
## 
## Call:
## pscl::hurdle(formula = formula, data = DebTrivedi, dist = "negbin")
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1718 -0.7080 -0.2737  0.3196 18.0092 
## 
## Count model coefficients (truncated negbin with log link):
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      1.197699   0.058973  20.309  < 2e-16 ***
## hosp             0.211898   0.021396   9.904  < 2e-16 ***
## healthpoor       0.315958   0.048056   6.575 4.87e-11 ***
## healthexcellent -0.331861   0.066093  -5.021 5.14e-07 ***
## numchron         0.126421   0.012452  10.152  < 2e-16 ***
## gendermale      -0.068317   0.032416  -2.108   0.0351 *  
## school           0.020693   0.004535   4.563 5.04e-06 ***
## privinsyes       0.100172   0.042619   2.350   0.0188 *  
## Log(theta)       0.333255   0.042754   7.795 6.46e-15 ***
## Zero hurdle model coefficients (binomial with logit link):
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      0.043147   0.139852   0.309 0.757688    
## hosp             0.312449   0.091437   3.417 0.000633 ***
## healthpoor      -0.008716   0.161024  -0.054 0.956833    
## healthexcellent -0.289570   0.142682  -2.029 0.042409 *  
## numchron         0.535213   0.045378  11.794  < 2e-16 ***
## gendermale      -0.415658   0.087608  -4.745 2.09e-06 ***
## school           0.058541   0.011989   4.883 1.05e-06 ***
## privinsyes       0.747120   0.100880   7.406 1.30e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Theta: count = 1.3955
## Number of iterations in BFGS optimization: 16 
## Log-likelihood: -1.209e+04 on 17 Df

Permutation based analysis

Before we proceed, let’s compute the Maximum cardinality for each test:

sapply(2:ncol(dati),compute_orbit_cardinality)
## [1] Inf Inf Inf Inf Inf Inf

The number is very high, certainly enough for a permutation test.

# Depending on the number of permutations 'perms', the computation time can be very high, set EVAL=TRUE (here below) to run the analysis
EVAL=FALSE
perms=1000

dati$health=relevel(dati$health,"average")
dataX <- flip:::.getXY(ofp ~ hosp + health + numchron + gender + school + privins,
                         data=dati,statTest="permutation",rotationTest = FALSE)
nomefile=paste0("results_permutation_MC",perms,".Rdata")
if(EVAL){
  
  # the two response variables
  id_Ynot0=which(dataX$Y!=0)
  # Yno0=dataX$Y[dataX$Y!=0,]
  Y01=(dataX$Y!=0)*1
  # table(Y01)
  # Yno0_orig=Yno0
  Y01_orig=Y01
  
  lista_tested_var=list(2,3,4,5,6,7,8)
  set.seed(1)
  res_permT=llply(lista_tested_var,compute_test_stats_pvalues)
  save(res_permT,file=nomefile)
} else
load(file=nomefile)

## some make up of the results:
pvals=sapply(res_permT,t2p,obs.only = 1,tail=0)
pvals=c(pvals[2,],pvals[1,])

stats=unlist(sapply(res_permT,function(X)X[1,]))
stats=c(stats[2,],stats[1,])

out=new("flip.object")
out@res= data.frame(Test="z",Stat=stats,tail="><", p.value=pvals)
colnames(out@res)[4]="p-value"
rownames(out@res)=paste(rep(c("X|X>0 (Quantitative)","X>0 (Zeros)"),each=7),colnames(dataX$X)[-1],sep=":")
summary(out)
##  Call:
##  `<undef>`() 
##  permutations.
##                                        Test     Stat tail p-value sig.
## X|X>0 (Quantitative):hosp                 z 12.53530   ><  0.0010  ***
## X|X>0 (Quantitative):health.poor.         z  5.59255   ><  0.0020   **
## X|X>0 (Quantitative):health.excellent.    z -2.74832   ><  0.0110    *
## X|X>0 (Quantitative):numchron             z  9.35176   ><  0.0010  ***
## X|X>0 (Quantitative):gender.male.         z -2.55200   ><  0.0230    *
## X|X>0 (Quantitative):school               z  4.08174   ><  0.0020   **
## X|X>0 (Quantitative):privins.yes.         z  1.86305   ><  0.0849     
## X>0 (Zeros):hosp                          z  2.80361   ><  0.0010  ***
## X>0 (Zeros):health.poor.                  z -0.05789   ><  0.9820     
## X>0 (Zeros):health.excellent.             z -2.57913   ><  0.0220    *
## X>0 (Zeros):numchron                      z 10.53065   ><  0.0010  ***
## X>0 (Zeros):gender.male.                  z -4.70975   ><  0.0010  ***
## X>0 (Zeros):school                        z  4.49383   ><  0.0010  ***
## X>0 (Zeros):privins.yes.                  z  6.15691   ><  0.0010  ***

Plots

An example for gender(male). Change the number 5 in res_permT[[5]] - in the code below - to see other distributions of the test statistic

out=new("flip.object",permT=res_permT[[5]][,2:1])
colnames(out@permT)=c("X|X>0 (Quantitative)","X>0 (Zeros)")
out@res=data.frame(T=matrix(res_permT[[5]][1,]))

par(mfrow=c(2,2))
plot(out)
hist(out[2],main=colnames(out@permT)[2])
hist(out[1],main=colnames(out@permT)[1])