Advantages:
Available on CRAN, one can simply run
install.packages("FSelectorRcpp")
or get the developer version with
devtools::install_github("mi2-warsaw/FSelectorRcpp")
# windows users should have Rtools for devtools installation
# https://cran.r-project.org/bin/windows/Rtools/
Feature selection algorithms are necessary for handling two problems related to overly large data sets:
Every feature can be checked independently so that computations can be performed in parallel.
Hence, entropy-based feature selection builds on the entropy of a random variable and the joint entropy of a pair of variables:

H_α(X) = −∑_x P(x) · log_α P(x)

H_α(X, Y) = −∑_{x,y} P(x, y) · log_α P(x, y)
Units of entropy for different values of the logarithm base α:
bit: α = 2
nat: α = e
ban / dit / hartley: α = 10
infogain: H(Class) + H(Attribute) − H(Class, Attribute)
gainratio: (H(Class) + H(Attribute) − H(Class, Attribute)) / H(Attribute)
symuncert: 2 · (H(Class) + H(Attribute) − H(Class, Attribute)) / (H(Attribute) + H(Class))
The first one is simply the mutual information:

I(X; Y) = H(X) + H(Y) − H(X, Y)
info_gain <-
information_gain( # Calculate the score for each attribute
formula = Species ~ ., # that is on the right side of the formula.
data = iris, # Attributes must exist in the passed data.
type = "infogain", # Choose the type of a score to be calculated.
threads = 2 # Set number of threads in a parallel backend.
)
info_gain
attributes importance
1 Sepal.Length 0.4521286
2 Sepal.Width 0.2672750
3 Petal.Length 0.9402853
4 Petal.Width 0.9554360
cutted_attrs <-
cut_attrs( # Then take attributes with the highest rank.
attrs = info_gain,
k = 2 # For example: 2 attrs with the higehst rank.
)
cutted_attrs
[1] "Petal.Width" "Petal.Length"
new_formula <-
to_formula( # Create a new formula object with
attrs = cutted_attrs, # the most influencial attrs.
class = "Species"
)
new_formula
Species ~ Petal.Width + Petal.Length
<environment: 0x29e3080>
model <-
glm(
formula = new_formula, # Use that formula in any classification algorithm.
data = iris,
family = "binomial"
)
model
Call: glm(formula = new_formula, family = "binomial", data = iris)
Coefficients:
(Intercept) Petal.Width Petal.Length
-69.45 33.89 17.60
Degrees of Freedom: 149 Total (i.e. Null); 147 Residual
Null Deviance: 191
Residual Deviance: 5.17e-09 AIC: 6
information_gain(
formula = Species ~ .,
data = iris,
type = "infogain",
threads = 2
) %>%
cut_attrs(
k = 2
) %>%
to_formula(
attrs = .,
class = "Species"
) %>%
glm(
formula = .,
data = iris,
family = "binomial"
)
A custom evaluator can be supplied through the `fun` parameter:
parameter.eval_adjR2_lm <- # Create a scorer function.
function(
attributes, # That takes the currently considered subset of attributes
data, # from a specified dataset.
dependent =
names(data)[1] # To find features that best describe the dependent variable.
) {
summary( # In this situation we take the adj.r.squared statistic
lm( # from the summary of a linear model object.
to_formula( # This is the score to use to choose between considered
attributes, # subsets of attributes.
dependent
),
data = data)
)$adj.r.squared
}
set.seed(42)
random <- sample(x = letters, size = 150, replace = TRUE)
ran_iris <- cbind(iris, random)
head(ran_iris)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species random
1 5.1 3.5 1.4 0.2 setosa x
2 4.9 3.0 1.4 0.2 setosa y
3 4.7 3.2 1.3 0.2 setosa h
4 4.6 3.1 1.5 0.2 setosa v
5 5.0 3.6 1.4 0.2 setosa q
6 5.4 3.9 1.7 0.4 setosa n
fs <- feature_search( # works in 2 modes – 'exhaustive' and 'greedy'
attributes =
names(ran_iris)[-1], # takes attribues and creates combinations of it's subsets
fun = eval_adjR2_lm, # calculates the score of a subset that depends on the
data = ran_iris, # evaluator function passed in the 'fun' parameter
mode = "exhaustive", # 'exhaustive' means to check all possible
sizes = # attributes' subset combinations
3:5 # of sizes passed in 'sizes'.
)
fs$best
Sepal.Width Petal.Length Petal.Width Species random values
11 1 1 1 1 0 0.862705
fs$all
Sepal.Width Petal.Length Petal.Width Species random values
1 1 1 1 0 0 0.8557065
2 1 1 0 1 0 0.859538
3 1 1 0 0 1 0.8376647
4 1 0 1 1 0 0.725002
5 1 0 1 0 1 0.7061277
6 1 0 0 1 1 0.7144209
7 0 1 1 1 0 0.8322213
8 0 1 1 0 1 0.7662291
9 0 1 0 1 1 0.8326902
10 0 0 1 1 1 0.6701464
11 1 1 1 1 0 0.862705
12 1 1 1 0 1 0.8529765
13 1 1 0 1 1 0.8619427
14 1 0 1 1 1 0.7223287
15 0 1 1 1 1 0.8319703
16 1 1 1 1 1 0.8624321
fs$fun
function(
attributes, # That takes the currently considered subset of attributes
data, # from a specified dataset.
dependent =
names(data)[1] # To find features that best describe the dependent variable.
) {
summary( # In this situation we take the adj.r.squared statistic
lm( # from the summary of a linear model object.
to_formula( # This is the score to use to choose between considered
attributes, # subsets of attributes.
dependent
),
data = data)
)$adj.r.squared
}
<bytecode: 0x39d3a10>
fs$call
feature_search(attributes = names(ran_iris)[-1], fun = eval_adjR2_lm,
data = ran_iris, mode = "exhaustive", sizes = 3:5)
Features selected from BRCA data with 20,000 variables.