CPOs Built Into mlrCPO

Martin Binder

2020-03-04

Listing CPOs

Builtin CPOs can be listed with listCPO().

listCPO()[, c("name", "category", "subcategory")]

	name	category	subcategory
11	cpoDropConstants	data	cleanup
36	cpoFixFactors	data	cleanup
10	cpoCollapseFact	data	factor data preprocessing
4	cpoAsNumeric	data	feature conversion
15	cpoDummyEncode	data	feature conversion
13	cpoImpactEncodeClassif	data	feature conversion
14	cpoImpactEncodeRegr	data	feature conversion
12	cpoProbEncode	data	feature conversion
55	cpoQuantileBinNumerics	data	feature conversion
61	cpoSelect	data	feature selection
62	cpoSelectFreeProperties	data	feature selection
51	cpoAddCols	data	features
50	cpoMakeCols	data	features
1	cpoApplyFun	data	general data preprocessing
53	cpoModelMatrix	data	general
37	cpoIca	data	numeric data preprocessing
54	cpoPca	data	numeric data preprocessing
58	cpoScale	data	numeric data preprocessing
59	cpoScaleMaxAbs	data	numeric data preprocessing
60	cpoScaleRange	data	numeric data preprocessing
64	cpoSpatialSign	data	numeric data preprocessing
16	cpoFilterFeatures	featurefilter	general
32	cpoFilterAnova	featurefilter	specialised
18	cpoFilterCarscore	featurefilter	specialised
28	cpoFilterChiSquared	featurefilter	specialised
26	cpoFilterGainRatio	featurefilter	specialised
25	cpoFilterInformationGain	featurefilter	specialised
33	cpoFilterKruskal	featurefilter	specialised
23	cpoFilterLinearCorrelation	featurefilter	specialised
17	cpoFilterMrmr	featurefilter	specialised
30	cpoFilterOneR	featurefilter	specialised
35	cpoFilterPermutationImportance	featurefilter	specialised
24	cpoFilterRankCorrelation	featurefilter	specialised
29	cpoFilterRelief	featurefilter	specialised
21	cpoFilterRfCImportance	featurefilter	specialised
22	cpoFilterRfImportance	featurefilter	specialised
19	cpoFilterRfSRCImportance	featurefilter	specialised
20	cpoFilterRfSRCMinDepth	featurefilter	specialised
27	cpoFilterSymmetricalUncertainty	featurefilter	specialised
31	cpoFilterUnivariate	featurefilter	specialised
34	cpoFilterVariance	featurefilter	specialised
38	cpoImpute	imputation	general
39	cpoImputeAll	imputation	general
40	cpoImputeConstant	imputation	specialised
48	cpoImputeHist	imputation	specialised
49	cpoImputeLearner	imputation	specialised
45	cpoImputeMax	imputation	specialised
42	cpoImputeMean	imputation	specialised
41	cpoImputeMedian	imputation	specialised
44	cpoImputeMin	imputation	specialised
43	cpoImputeMode	imputation	specialised
47	cpoImputeNormal	imputation	specialised
46	cpoImputeUniform	imputation	specialised
8	cpoCache	meta
6	cpoCase	meta
9	cpoCbind	meta
5	cpoMultiplex	meta
7	cpoTransformParams	meta
68	cpoWrap	meta	wrap
69	cpoWrapRetrafoless	meta	wrap
65	cpoOversample	subsampling	binary classif
63	cpoSmote	subsampling	binary classif
66	cpoUndersample	subsampling	binary classif
67	cpoSample	subsampling	general
2	cpoApplyFunRegrTarget	target	general target transformation
56	cpoRegrResiduals	target	residual fitting
3	cpoLogTrafoRegr	target	target transformation
52	cpoMissingIndicators	tools	imputation
57	cpoResponseFromSE	tools	predict.type

NULLCPO
#> NULLCPO
is.nullcpo(NULLCPO)
#> [1] TRUE
NULLCPO %>>% cpoScale()
#> scale(center = TRUE, scale = TRUE)
NULLCPO %>>% NULLCPO
#> NULLCPO
print(as.list(NULLCPO))
#> list()
pipeCPO(list())
#> NULLCPO

cpa = cpoWrap()
print(cpa, verbose = TRUE)
#> Trafo chain of 1 cpos:
#> wrap()
#> Operating: feature
#> ParamSet:
#>             Type len Def Constr Req Tunable Trafo
#> wrap.cpo untyped   -   -      -   -    TRUE     -
head(iris %>>% setHyperPars(cpa, wrap.cpo = cpoScale()))
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1   -0.8976739  1.01560199    -1.335752   -1.311052  setosa
#> 2   -1.1392005 -0.13153881    -1.335752   -1.311052  setosa
#> 3   -1.3807271  0.32731751    -1.392399   -1.311052  setosa
#> 4   -1.5014904  0.09788935    -1.279104   -1.311052  setosa
#> 5   -1.0184372  1.24503015    -1.335752   -1.311052  setosa
#> 6   -0.5353840  1.93331463    -1.165809   -1.048667  setosa
head(iris %>>% setHyperPars(cpa, wrap.cpo = cpoPca()))
#>   Species       PC1        PC2         PC3          PC4
#> 1  setosa -2.684126 -0.3193972  0.02791483  0.002262437
#> 2  setosa -2.714142  0.1770012  0.21046427  0.099026550
#> 3  setosa -2.888991  0.1449494 -0.01790026  0.019968390
#> 4  setosa -2.745343  0.3182990 -0.03155937 -0.075575817
#> 5  setosa -2.728717 -0.3267545 -0.09007924 -0.061258593
#> 6  setosa -2.280860 -0.7413304 -0.16867766 -0.024200858
# attaching the cpo applicator to a learner gives this learner a "wrap.cpo" hyperparameter
# that can be set to any CPO.
getParamSet(cpoWrap() %>>% makeLearner("classif.logreg"))
#>             Type len  Def Constr Req Tunable Trafo
#> wrap.cpo untyped   -    -      -   -    TRUE     -
#> model    logical   - TRUE      -   -   FALSE     -

cpm = cpoMultiplex(list(cpoScale, cpoPca))
print(cpm, verbose = TRUE)
#> Trafo chain of 1 cpos:
#> multiplex(selected.cpo = scale, scale.center = TRUE, scale.scale = TRUE, pca.center = TRUE, pca.scale = FALSE)
#> Operating: feature
#> ParamSet:
#>                  Type len   Def    Constr Req Tunable Trafo
#> selected.cpo discrete   - scale scale,pca   -    TRUE     -
#> scale.center  logical   -  TRUE         -   Y    TRUE     -
#> scale.scale   logical   -  TRUE         -   Y    TRUE     -
#> pca.center    logical   -  TRUE         -   Y    TRUE     -
#> pca.scale     logical   - FALSE         -   Y    TRUE     -
head(iris %>>% setHyperPars(cpm, selected.cpo = "scale"))
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1   -0.8976739  1.01560199    -1.335752   -1.311052  setosa
#> 2   -1.1392005 -0.13153881    -1.335752   -1.311052  setosa
#> 3   -1.3807271  0.32731751    -1.392399   -1.311052  setosa
#> 4   -1.5014904  0.09788935    -1.279104   -1.311052  setosa
#> 5   -1.0184372  1.24503015    -1.335752   -1.311052  setosa
#> 6   -0.5353840  1.93331463    -1.165809   -1.048667  setosa
# every CPO's Hyperparameters are exported
head(iris %>>% setHyperPars(cpm, selected.cpo = "scale", scale.center = FALSE))
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1    0.8613268   1.1296201    0.3362663    0.140405  setosa
#> 2    0.8275493   0.9682458    0.3362663    0.140405  setosa
#> 3    0.7937718   1.0327956    0.3122473    0.140405  setosa
#> 4    0.7768830   1.0005207    0.3602853    0.140405  setosa
#> 5    0.8444380   1.1618950    0.3362663    0.140405  setosa
#> 6    0.9119931   1.2587196    0.4083234    0.280810  setosa
head(iris %>>% setHyperPars(cpm, selected.cpo = "pca"))
#>   Species       PC1        PC2         PC3          PC4
#> 1  setosa -2.684126 -0.3193972  0.02791483  0.002262437
#> 2  setosa -2.714142  0.1770012  0.21046427  0.099026550
#> 3  setosa -2.888991  0.1449494 -0.01790026  0.019968390
#> 4  setosa -2.745343  0.3182990 -0.03155937 -0.075575817
#> 5  setosa -2.728717 -0.3267545 -0.09007924 -0.061258593
#> 6  setosa -2.280860 -0.7413304 -0.16867766 -0.024200858

# cpoCbind recognises that "scale.scale" happens before "pca.pca" but is also fed to the # result directly. The summary draws a (crude) ascii-art graph. print(cbinder, verbose = TRUE) #> Trafo chain of 1 cpos: #> cbind(scale.center = TRUE, scale.scale = TRUE, pca.center = TRUE, pca.scale = FALSE) #> Operating: feature #> ParamSet: #> Type len Def Constr Req Tunable Trafo #> scale.center logical - TRUE - - TRUE - #> scale.scale logical - TRUE - - TRUE - #> pca.center logical - TRUE - - TRUE - #> pca.scale logical - FALSE - - TRUE - #> O>+ scale(center = TRUE, scale = TRUE) #> | | #> +<O pca(center = TRUE, scale = FALSE)[not exp'd: tol = <NULL>, rank = <NULL>] #> | #> O CBIND[scaled,pcad,original] #> head(iris %>>% cbinder) #> scaled.Sepal.Length scaled.Sepal.Width scaled.Petal.Length scaled.Petal.Width #> 1 -0.8976739 1.01560199 -1.335752 -1.311052 #> 2 -1.1392005 -0.13153881 -1.335752 -1.311052 #> 3 -1.3807271 0.32731751 -1.392399 -1.311052 #> 4 -1.5014904 0.09788935 -1.279104 -1.311052 #> 5 -1.0184372 1.24503015 -1.335752 -1.311052 #> 6 -0.5353840 1.93331463 -1.165809 -1.048667 #> scaled.Species pcad.Species pcad.PC1 pcad.PC2 pcad.PC3 pcad.PC4 #> 1 setosa setosa -2.257141 -0.4784238 0.12727962 0.024087508 #> 2 setosa setosa -2.074013 0.6718827 0.23382552 0.102662845 #> 3 setosa setosa -2.356335 0.3407664 -0.04405390 0.028282305 #> 4 setosa setosa -2.291707 0.5953999 -0.09098530 -0.065735340 #> 5 setosa setosa -2.381863 -0.6446757 -0.01568565 -0.035802870 #> 6 setosa setosa -2.068701 -1.4842053 -0.02687825 0.006586116 #> original.Sepal.Length original.Sepal.Width original.Petal.Length #> 1 5.1 3.5 1.4 #> 2 4.9 3.0 1.4 #> 3 4.7 3.2 1.3 #> 4 4.6 3.1 1.5 #> 5 5.0 3.6 1.4 #> 6 5.4 3.9 1.7 #> original.Petal.Width original.Species #> 1 0.2 setosa #> 2 0.2 setosa #> 3 0.2 setosa #> 4 0.2 setosa #> 5 0.2 setosa #> 6 0.4 setosa

# the unnecessary copies of "Species" are unfortunate. Remove them with cpoSelect: selector = cpoSelect(type = "numeric") cbinder.select = cpoCbind(scaled = selector %>>% scale, pcad = selector %>>% scale.pca, original = NULLCPO) cbinder.select #> cbind(scale.center = TRUE, scale.scale = TRUE, pca.center = TRUE, pca.scale = FALSE) head(iris %>>% cbinder) #> scaled.Sepal.Length scaled.Sepal.Width scaled.Petal.Length scaled.Petal.Width #> 1 -0.8976739 1.01560199 -1.335752 -1.311052 #> 2 -1.1392005 -0.13153881 -1.335752 -1.311052 #> 3 -1.3807271 0.32731751 -1.392399 -1.311052 #> 4 -1.5014904 0.09788935 -1.279104 -1.311052 #> 5 -1.0184372 1.24503015 -1.335752 -1.311052 #> 6 -0.5353840 1.93331463 -1.165809 -1.048667 #> scaled.Species pcad.Species pcad.PC1 pcad.PC2 pcad.PC3 pcad.PC4 #> 1 setosa setosa -2.257141 -0.4784238 0.12727962 0.024087508 #> 2 setosa setosa -2.074013 0.6718827 0.23382552 0.102662845 #> 3 setosa setosa -2.356335 0.3407664 -0.04405390 0.028282305 #> 4 setosa setosa -2.291707 0.5953999 -0.09098530 -0.065735340 #> 5 setosa setosa -2.381863 -0.6446757 -0.01568565 -0.035802870 #> 6 setosa setosa -2.068701 -1.4842053 -0.02687825 0.006586116 #> original.Sepal.Length original.Sepal.Width original.Petal.Length #> 1 5.1 3.5 1.4 #> 2 4.9 3.0 1.4 #> 3 4.7 3.2 1.3 #> 4 4.6 3.1 1.5 #> 5 5.0 3.6 1.4 #> 6 5.4 3.9 1.7 #> original.Petal.Width original.Species #> 1 0.2 setosa #> 2 0.2 setosa #> 3 0.2 setosa #> 4 0.2 setosa #> 5 0.2 setosa #> 6 0.4 setosa

# alternatively, we apply the cbinder only to numerical data head(iris %>>% cpoWrap(cbinder, affect.type = "numeric")) #> Species scaled.Sepal.Length scaled.Sepal.Width scaled.Petal.Length #> 1 setosa -0.8976739 1.01560199 -1.335752 #> 2 setosa -1.1392005 -0.13153881 -1.335752 #> 3 setosa -1.3807271 0.32731751 -1.392399 #> 4 setosa -1.5014904 0.09788935 -1.279104 #> 5 setosa -1.0184372 1.24503015 -1.335752 #> 6 setosa -0.5353840 1.93331463 -1.165809 #> scaled.Petal.Width pcad.PC1 pcad.PC2 pcad.PC3 pcad.PC4 #> 1 -1.311052 -2.257141 -0.4784238 0.12727962 0.024087508 #> 2 -1.311052 -2.074013 0.6718827 0.23382552 0.102662845 #> 3 -1.311052 -2.356335 0.3407664 -0.04405390 0.028282305 #> 4 -1.311052 -2.291707 0.5953999 -0.09098530 -0.065735340 #> 5 -1.311052 -2.381863 -0.6446757 -0.01568565 -0.035802870 #> 6 -1.048667 -2.068701 -1.4842053 -0.02687825 0.006586116 #> original.Sepal.Length original.Sepal.Width original.Petal.Length #> 1 5.1 3.5 1.4 #> 2 4.9 3.0 1.4 #> 3 4.7 3.2 1.3 #> 4 4.6 3.1 1.5 #> 5 5.0 3.6 1.4 #> 6 5.4 3.9 1.7 #> original.Petal.Width #> 1 0.2 #> 2 0.2 #> 3 0.2 #> 4 0.2 #> 5 0.2 #> 6 0.4

cpo = cpoTransformParams(cpoPca(), alist(pca.scale = pca.center))
retr = pid.task %>|% setHyperPars(cpo, pca.center = FALSE)
getCPOTrainedState(retr)$control  # both 'center' and 'scale' are FALSE
#> CPO Retrafo chain
#> [RETRAFO pca(center = FALSE, scale = FALSE)]

mplx = cpoMultiplex(list(cpoIca(export = "n.comp"), cpoPca(export = "rank")))
!mplx
#> Trafo chain of 1 cpos:
#> multiplex(selected.cpo = ica, ica.n.comp = <NULL>, pca.rank = <NULL>)
#> Operating: feature
#> ParamSet:
#>                  Type len    Def   Constr Req Tunable Trafo
#> selected.cpo discrete   -    ica  ica,pca   -    TRUE     -
#> ica.n.comp    integer   - <NULL> 1 to Inf   Y    TRUE     -
#> pca.rank      integer   - <NULL> 1 to Inf   Y    TRUE     -
mtx = cpoTransformParams(mplx, alist(ica.n.comp = comp, pca.rank = comp), pSS(comp: integer[1, ]), list(comp = 1))
head(iris %>>% setHyperPars(mtx, selected.cpo = "ica", comp = 2))
#>   Species         V1       V2
#> 1  setosa  0.5040262 1.372772
#> 2  setosa -0.5026081 1.277214
#> 3  setosa -0.4470063 1.369134
#> 4  setosa -0.7903465 1.261004
#> 5  setosa  0.5165524 1.396033
#> 6  setosa  1.3797295 1.270769
head(iris %>>% setHyperPars(mtx, selected.cpo = "pca", comp = 3))
#>   Species       PC1        PC2         PC3
#> 1  setosa -2.684126 -0.3193972  0.02791483
#> 2  setosa -2.714142  0.1770012  0.21046427
#> 3  setosa -2.888991  0.1449494 -0.01790026
#> 4  setosa -2.745343  0.3182990 -0.03155937
#> 5  setosa -2.728717 -0.3267545 -0.09007924
#> 6  setosa -2.280860 -0.7413304 -0.16867766

Data Manipulation

cpoScale

Implements the base::scale function.

df = data.frame(a = 1:3, b = -(1:3) * 10)
df %>>% cpoScale()
#>    a  b
#> 1 -1  1
#> 2  0  0
#> 3  1 -1
df %>>% cpoScale(scale = FALSE)  # center = TRUE
#>    a   b
#> 1 -1  10
#> 2  0   0
#> 3  1 -10

cpoPca

Implements stats::prcomp. By default the data is centered but not scaled (center = TRUE, scale = FALSE).

df %>>% cpoPca()
#>         PC1           PC2
#> 1 -10.04988  4.440892e-16
#> 2   0.00000  0.000000e+00
#> 3  10.04988 -4.440892e-16

cpoDummyEncode

Dummy encoding of factor variables. Optionally uses the first factor level as the reference category.

head(iris %>>% cpoDummyEncode())
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Speciessetosa
#> 1          5.1         3.5          1.4         0.2             1
#> 2          4.9         3.0          1.4         0.2             1
#> 3          4.7         3.2          1.3         0.2             1
#> 4          4.6         3.1          1.5         0.2             1
#> 5          5.0         3.6          1.4         0.2             1
#> 6          5.4         3.9          1.7         0.4             1
#>   Speciesversicolor Speciesvirginica
#> 1                 0                0
#> 2                 0                0
#> 3                 0                0
#> 4                 0                0
#> 5                 0                0
#> 6                 0                0
head(iris %>>% cpoDummyEncode(reference.cat = TRUE))
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Speciesversicolor
#> 1          5.1         3.5          1.4         0.2                 0
#> 2          4.9         3.0          1.4         0.2                 0
#> 3          4.7         3.2          1.3         0.2                 0
#> 4          4.6         3.1          1.5         0.2                 0
#> 5          5.0         3.6          1.4         0.2                 0
#> 6          5.4         3.9          1.7         0.4                 0
#>   Speciesvirginica
#> 1                0
#> 2                0
#> 3                0
#> 4                0
#> 5                0
#> 6                0

cpoSelect

Select to use only certain columns of a dataset. Select by column index, name, or regex pattern.

head(iris %>>% cpoSelect(pattern = "Width"))
#>   Sepal.Width Petal.Width
#> 1         3.5         0.2
#> 2         3.0         0.2
#> 3         3.2         0.2
#> 4         3.1         0.2
#> 5         3.6         0.2
#> 6         3.9         0.4
# selection is additive
head(iris %>>% cpoSelect(pattern = "Width", type = "factor"))
#>   Sepal.Width Petal.Width Species
#> 1         3.5         0.2  setosa
#> 2         3.0         0.2  setosa
#> 3         3.2         0.2  setosa
#> 4         3.1         0.2  setosa
#> 5         3.6         0.2  setosa
#> 6         3.9         0.4  setosa

cpoDropConstants

Drops constant features, as well as numeric features that vary by no more than a configurable tolerance.

head(iris) %>>% cpoDropConstants()  # drops 'Species'
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width
#> 1          5.1         3.5          1.4         0.2
#> 2          4.9         3.0          1.4         0.2
#> 3          4.7         3.2          1.3         0.2
#> 4          4.6         3.1          1.5         0.2
#> 5          5.0         3.6          1.4         0.2
#> 6          5.4         3.9          1.7         0.4
head(iris) %>>% cpoDropConstants(abs.tol = 0.2)  # also drops 'Petal.Width'
#>   Sepal.Length Sepal.Width Petal.Length
#> 1          5.1         3.5          1.4
#> 2          4.9         3.0          1.4
#> 3          4.7         3.2          1.3
#> 4          4.6         3.1          1.5
#> 5          5.0         3.6          1.4
#> 6          5.4         3.9          1.7

cpoFixFactors

Drops unused factor levels and makes sure prediction data has the same factor levels as training data.

levels(iris$Species)
#> [1] "setosa"     "versicolor" "virginica"

irisfix = head(iris) %>>% cpoFixFactors()  # Species only has level 'setosa' in train
levels(irisfix$Species)
#> [1] "setosa"

rf = retrafo(irisfix)
iris[c(1, 100, 140), ]
#>     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
#> 1            5.1         3.5          1.4         0.2     setosa
#> 100          5.7         2.8          4.1         1.3 versicolor
#> 140          6.9         3.1          5.4         2.1  virginica
iris[c(1, 100, 140), ] %>>% rf
#>     Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1            5.1         3.5          1.4         0.2  setosa
#> 100          5.7         2.8          4.1         1.3    <NA>
#> 140          6.9         3.1          5.4         2.1    <NA>

cpoMissingIndicators

Creates columns indicating missing data. Most useful in combination with cpoCbind.

impdata = df
impdata[[1]][1] = NA
impdata
#>    a   b
#> 1 NA -10
#> 2  2 -20
#> 3  3 -30

impdata %>>% cpoMissingIndicators()
#>       a
#> 1  TRUE
#> 2 FALSE
#> 3 FALSE
impdata %>>% cpoCbind(NULLCPO, dummy = cpoMissingIndicators())
#>    a   b dummy.a
#> 1 NA -10    TRUE
#> 2  2 -20   FALSE
#> 3  3 -30   FALSE

cpoApplyFun

Apply a univariate function to data columns.

head(iris %>>% cpoApplyFun(function(x) sqrt(x) - 10, affect.type = "numeric"))
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1    -7.741682   -8.129171    -8.816784   -9.552786  setosa
#> 2    -7.786406   -8.267949    -8.816784   -9.552786  setosa
#> 3    -7.832052   -8.211146    -8.859825   -9.552786  setosa
#> 4    -7.855239   -8.239318    -8.775255   -9.552786  setosa
#> 5    -7.763932   -8.102633    -8.816784   -9.552786  setosa
#> 6    -7.676210   -8.025158    -8.696160   -9.367544  setosa

cpoAsNumeric

Convert (non-numeric) features to numeric

head(iris[sample(nrow(iris), 10), ] %>>% cpoAsNumeric())
#>     Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 14           4.3         3.0          1.1         0.1       1
#> 50           5.0         3.3          1.4         0.2       1
#> 118          7.7         3.8          6.7         2.2       3
#> 43           4.4         3.2          1.3         0.2       1
#> 150          5.9         3.0          5.1         1.8       3
#> 148          6.5         3.0          5.2         2.0       3

cpoCollapseFact

Combine low-prevalence factor levels. Use max.collapsed.class.prevalence to set how large the combined factor level may be.

iris2 = iris
iris2$Species = factor(c("a", "b", "c", "b", "b", "c", "b", "c",
                        as.character(iris2$Species[-(1:8)])))
head(iris2, 10)
#>    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1           5.1         3.5          1.4         0.2       a
#> 2           4.9         3.0          1.4         0.2       b
#> 3           4.7         3.2          1.3         0.2       c
#> 4           4.6         3.1          1.5         0.2       b
#> 5           5.0         3.6          1.4         0.2       b
#> 6           5.4         3.9          1.7         0.4       c
#> 7           4.6         3.4          1.4         0.3       b
#> 8           5.0         3.4          1.5         0.2       c
#> 9           4.4         2.9          1.4         0.2  setosa
#> 10          4.9         3.1          1.5         0.1  setosa
head(iris2 %>>% cpoCollapseFact(max.collapsed.class.prevalence = 0.2), 10)
#>    Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
#> 1           5.1         3.5          1.4         0.2 collapsed
#> 2           4.9         3.0          1.4         0.2 collapsed
#> 3           4.7         3.2          1.3         0.2 collapsed
#> 4           4.6         3.1          1.5         0.2 collapsed
#> 5           5.0         3.6          1.4         0.2 collapsed
#> 6           5.4         3.9          1.7         0.4 collapsed
#> 7           4.6         3.4          1.4         0.3 collapsed
#> 8           5.0         3.4          1.5         0.2 collapsed
#> 9           4.4         2.9          1.4         0.2    setosa
#> 10          4.9         3.1          1.5         0.1    setosa

cpoModelMatrix

Specify which columns get used, and how they are transformed, using a formula.

head(iris %>>% cpoModelMatrix(~0 + Species:Petal.Width))
#>   Speciessetosa:Petal.Width Speciesversicolor:Petal.Width
#> 1                       0.2                             0
#> 2                       0.2                             0
#> 3                       0.2                             0
#> 4                       0.2                             0
#> 5                       0.2                             0
#> 6                       0.4                             0
#>   Speciesvirginica:Petal.Width
#> 1                            0
#> 2                            0
#> 3                            0
#> 4                            0
#> 5                            0
#> 6                            0
# use . + ... to retain originals
head(iris %>>% cpoModelMatrix(~0 + . + Species:Petal.Width))
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Speciessetosa
#> 1          5.1         3.5          1.4         0.2             1
#> 2          4.9         3.0          1.4         0.2             1
#> 3          4.7         3.2          1.3         0.2             1
#> 4          4.6         3.1          1.5         0.2             1
#> 5          5.0         3.6          1.4         0.2             1
#> 6          5.4         3.9          1.7         0.4             1
#>   Speciesversicolor Speciesvirginica Petal.Width:Speciesversicolor
#> 1                 0                0                             0
#> 2                 0                0                             0
#> 3                 0                0                             0
#> 4                 0                0                             0
#> 5                 0                0                             0
#> 6                 0                0                             0
#>   Petal.Width:Speciesvirginica
#> 1                            0
#> 2                            0
#> 3                            0
#> 4                            0
#> 5                            0
#> 6                            0

cpoScaleRange

Scale values to a given range.

head(iris %>>% cpoScaleRange(-1, 1))
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1   -0.5555556  0.25000000   -0.8644068  -0.9166667  setosa
#> 2   -0.6666667 -0.16666667   -0.8644068  -0.9166667  setosa
#> 3   -0.7777778  0.00000000   -0.8983051  -0.9166667  setosa
#> 4   -0.8333333 -0.08333333   -0.8305085  -0.9166667  setosa
#> 5   -0.6111111  0.33333333   -0.8644068  -0.9166667  setosa
#> 6   -0.3888889  0.58333333   -0.7627119  -0.7500000  setosa

cpoScaleMaxAbs

Multiply features to set the maximum absolute value.

head(iris %>>% cpoScaleMaxAbs(0.1))
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1   0.06455696  0.07954545   0.02028986       0.008  setosa
#> 2   0.06202532  0.06818182   0.02028986       0.008  setosa
#> 3   0.05949367  0.07272727   0.01884058       0.008  setosa
#> 4   0.05822785  0.07045455   0.02173913       0.008  setosa
#> 5   0.06329114  0.08181818   0.02028986       0.008  setosa
#> 6   0.06835443  0.08863636   0.02463768       0.016  setosa

cpoSpatialSign

Normalize values row-wise

head(iris %>>% cpoSpatialSign())
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1    0.8037728   0.5516088    0.2206435  0.03152050  setosa
#> 2    0.8281329   0.5070201    0.2366094  0.03380134  setosa
#> 3    0.8053331   0.5483119    0.2227517  0.03426949  setosa
#> 4    0.8000302   0.5391508    0.2608794  0.03478392  setosa
#> 5    0.7909650   0.5694948    0.2214702  0.03163860  setosa
#> 6    0.7841750   0.5663486    0.2468699  0.05808704  setosa

impdata %>>% cpoImpute(cols = list(b = imputeMedian()))  # NAs remain
#>    a   b
#> 1 NA -10
#> 2  2 -20
#> 3  3 -30
impdata %>>% cpoImputeAll(cols = list(b = imputeMedian()))  # error, since NAs remain
#> Error in assertPropertiesOk(present.properties, setdiff(allowed.properties, : Data returned by CPO trafo has property missings that impute declared in .properties.adding.
#> properties in .properties.adding may not be present in trafo output.

missing.task = makeRegrTask("missing.task", impdata, target = "b")
# the following gives an error, since 'cpoImpute' does not make sure all missings are removed
# and hence does not add the 'missings' property.
train(cpoImpute(cols = list(a = imputeMedian())) %>>% makeLearner("regr.lm"), missing.task)
#> Error in checkLearnerBeforeTrain(task, learner, weights): Task 'missing.task' has missing values in 'a', but learner 'regr.lm.impute' does not support that!

# instead, the following works:
train(cpoImputeAll(cols = list(a = imputeMedian())) %>>% makeLearner("regr.lm"), missing.task)
#> Model for learner.id=regr.lm.impute; learner.class=CPOLearner
#> Trained on: task.id = missing.task; obs = 3; features = 1
#> Hyperparameters: impute.target.cols=character(0),impute.classes=,impute.cols=a=<ImputeMethod>,impute.dummy.classes=character(0),impute.dummy.cols=character(0),impute.dummy.type=factor,impute.force.dummies=FALSE,impute.impute.new.levels=TRUE,impute.recode.factor.levels=TRUE

head(getTaskData(iris.task %>>% cpoFilterFeatures(method = "variance", perc = 0.5)))
#>   Sepal.Length Petal.Length Species
#> 1          5.1          1.4  setosa
#> 2          4.9          1.4  setosa
#> 3          4.7          1.3  setosa
#> 4          4.6          1.5  setosa
#> 5          5.0          1.4  setosa
#> 6          5.4          1.7  setosa
head(getTaskData(iris.task %>>% cpoFilterVariance(perc = 0.5)))
#>   Sepal.Length Petal.Length Species
#> 1          5.1          1.4  setosa
#> 2          4.9          1.4  setosa
#> 3          4.7          1.3  setosa
#> 4          4.6          1.5  setosa
#> 5          5.0          1.4  setosa
#> 6          5.4          1.7  setosa
# The specialised filter CPOs are:
listCPO()[listCPO()$category == "featurefilter" & listCPO()$subcategory == "specialised", c("name", "description")]
#>                        name
#> 32           cpoFilterAnova
#> 18        cpoFilterCarscore
#> 28      cpoFilterChiSquared
#> 26       cpoFilterGainRatio
#> 25 cpoFilterInformationGain
#> 33         cpoFilterKruskal
#> ... (#rows: 19, #cols: 1)

CPOs Built Into mlrCPO

Martin Binder

2020-03-04

CPO Vignette Navigation

Listing CPOs

NULLCPO

Meta-CPO

cpoWrap

cpoMultiplex

cpoCase

cpoCbind

cpoTransformParams