Add wrapper methods #35

Closed
wants to merge 15 commits into from
Changes from 6 commits
14 changes: 13 additions & 1 deletion DESCRIPTION
@@ -16,7 +16,11 @@ Authors@R:
family = "Bischl",
role = "aut",
email = "[email protected]",
comment = c(ORCID = "0000-0001-6002-6980")))
comment = c(ORCID = "0000-0001-6002-6980")),
person(given = "Marc",
family = "Becker",
role = "aut",
email = "[email protected]"))
Description: Implements methods for feature selection and
filtering in mlr3.
License: MIT + file LICENSE
@@ -57,6 +61,9 @@ NeedsCompilation: no
Roxygen: list(markdown = TRUE)
RoxygenNote: 6.1.1
Collate:
'FeatureSelection.R'
'FeatureSelectionForward.R'
'FeatureSelectionRandom.R'
'Filter.R'
'FilterAUC.R'
'FilterCMIM.R'
@@ -73,6 +80,11 @@ Collate:
'FilterSymmetricalUncertainty.R'
'FilterVariableImportance.R'
'FilterVariance.R'
'PerformanceEvaluator.R'
'Terminator.R'
'TerminatorEvaluations.R'
'TerminatorPerformanceStep.R'
'TerminatorRuntime.R'
'helpers.R'
'mlr_filters.R'
'reexports.R'
8 changes: 8 additions & 0 deletions NAMESPACE
@@ -2,6 +2,9 @@

S3method(as.data.table,DictionaryFilter)
S3method(as.data.table,Filter)
export(FeatureSelection)
export(FeatureSelectionForward)
export(FeatureSelectionRandom)
export(Filter)
export(FilterAUC)
export(FilterCMIM)
@@ -18,6 +21,11 @@ export(FilterRankCorrelation)
export(FilterSymmetricalUncertainty)
export(FilterVariableImportance)
export(FilterVariance)
export(PerformanceEvaluator)
export(Terminator)
export(TerminatorEvaluations)
export(TerminatorPerformanceStep)
export(TerminatorRuntime)
export(as.data.table)
export(mlr_filters)
import(checkmate)
80 changes: 80 additions & 0 deletions R/FeatureSelection.R
@@ -0,0 +1,80 @@
#' @title Abstract FeatureSelection Class
#'
#' @description Abstract `FeatureSelection` class that implements the main functionality every feature selection method must provide. A `FeatureSelection` object describes the optimization method used to choose a feature subset of the task stored in the `[PerformanceEvaluator]` object.
#'
#' @section Usage:
#' ```
#' # Construction
#' fs = FeatureSelection$new(id, pe, tm, settings = list())
#'
#' # public members
#' fs$id
#' fs$pe
#' fs$tm
#' fs$settings
#'
#' # public methods
#' fs$calculate()
#' ```
#' @section Arguments:
#' * `id` (`character(1)`):\cr
#' The id of the FeatureSelection.
#' * `pe` (`[PerformanceEvaluator]`).
#' * `tm` (`[Terminator]`).
#' * `settings` (`list`):\cr
#' The settings for the FeatureSelection.
#'
#' @section Details:
#' * `$new()` creates a new object of class `[FeatureSelection]`.
#' * `$id` stores an identifier for this `[FeatureSelection]`.
#' * `$pe` stores the [PerformanceEvaluator] to optimize.
#' * `$tm` stores the `[Terminator]`.
#' * `$settings` is a list of settings for this `[FeatureSelection]`.
#' * `$calculate()` performs the feature selection, evaluating candidate feature sets with the `[PerformanceEvaluator]` until the budget of the `[Terminator]` is exhausted.
#' @name FeatureSelection
#' @family FeatureSelection
NULL

#' @export
FeatureSelection = R6Class("FeatureSelection",
public = list(
id = NULL,
pe = NULL,
tm = NULL,
settings = NULL,
state = NULL,

initialize = function(id, pe, tm, settings = list()) {
self$id = checkmate::assert_string(id)
self$pe = checkmate::assert_r6(pe, "PerformanceEvaluator")
self$tm = checkmate::assert_r6(tm, "Terminator")
self$settings = checkmate::assert_list(settings, names = "unique")
},

calculate = function() {
while(!self$tm$terminated) {
private$calculate_step()
}
}
),
private = list(
calculate_step = function() {
states = private$generate_states()
named_states = lapply(states, private$binary_to_features)

private$eval_states_terminator(named_states)

bmr = self$pe$get_best()
features = bmr[[length(bmr)]]$features
self$state = as.numeric(Reduce("|", lapply(features, function(x) x == self$pe$task$feature_names)))
},
binary_to_features = function(binary_features) {
self$pe$task$feature_names[as.logical(binary_features)]
},
eval_states_terminator = function(states) {
self$tm$update_start(self$pe)
self$pe$eval_states(states)
self$tm$update_end(self$pe)
}
)
)
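The abstract class above leaves `generate_states()` to its subclasses: `calculate_step()` expects a list of binary vectors, one position per feature of the task, which it translates to feature names and hands to the `PerformanceEvaluator`. A minimal sketch of a hypothetical subclass fulfilling that contract, assuming the `FeatureSelection` class from this PR is available (the class name and the exhaustive grid are illustrative assumptions, not part of this PR):

library(R6)

# Hypothetical sketch: enumerates every non-empty feature combination.
FeatureSelectionExhaustiveSketch = R6Class("FeatureSelectionExhaustiveSketch",
  inherit = FeatureSelection,
  public = list(
    initialize = function(pe, tm) {
      super$initialize(id = "exhaustive_sketch", pe = pe, tm = tm)
    }
  ),
  private = list(
    generate_states = function() {
      n = length(self$pe$task$feature_names)
      # all 0/1 combinations over the task's features, empty set removed
      grid = expand.grid(rep(list(c(0, 1)), n))
      states = lapply(seq_len(nrow(grid)), function(i) unlist(grid[i, ], use.names = FALSE))
      states[vapply(states, function(x) sum(x) > 0, logical(1))]
    }
  )
)

Like the subclasses in this PR, such a class relies entirely on the `Terminator` to stop the `calculate()` loop.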
86 changes: 86 additions & 0 deletions R/FeatureSelectionForward.R
@@ -0,0 +1,86 @@
#' @title FeatureSelectionForward
#'
#' @description
#' FeatureSelection child class to conduct forward search.
#'
#' @section Usage:
#' ```
#' fs = FeatureSelectionForward$new(pe, tm, max_features = NA)
#' ```
#' See [FeatureSelection] for a description of the interface.
#'
#' @section Arguments:
#' * `pe` (`[PerformanceEvaluator]`).
#' * `tm` (`[Terminator]`).
#' * `max_features` (`integer(1)`):
#'   Maximum number of features.
#'
#' @section Details:
#' `$new()` creates a new object of class [FeatureSelectionForward].
#' `$get_result()` returns the selected features of each step.
#' The interface is described in [FeatureSelection].
#'
#' Each step is possibly executed in parallel via [mlr3::benchmark()].
#'
#' @name FeatureSelectionForward
#' @family FeatureSelection
#' @examples
#' task = mlr3::mlr_tasks$get("pima")
#' measures = mlr3::mlr_measures$mget(c("classif.acc"))
Member (review comment): Suggested change
#' measures = mlr3::mlr_measures$mget(c("classif.acc"))
#' measures = mlr3::mlr_measures$get(c("classif.acc"))

#' task$measures = measures
#' learner = mlr3::mlr_learners$get("classif.rpart")
#' resampling = mlr3::mlr_resamplings$get("cv", param_vals = list(folds = 5L))
#' pe = PerformanceEvaluator$new(task, learner, resampling)
#' tm = TerminatorPerformanceStep$new(threshold = 0.01)
#' fs = FeatureSelectionForward$new(pe, tm)
#' fs$calculate()
#' fs$get_result()
NULL

#' @export
#' @include FeatureSelection.R

FeatureSelectionForward = R6Class("FeatureSelectionForward",
inherit = FeatureSelection,
public = list(
initialize = function(pe, tm, max_features = NA) {
if(is.na(max_features)) {
max_features = length(pe$task$feature_names)
Member (review comment): Something like this should go here:

      super$initialize(
        id = id,
        [...]
        param_set = ParamSet$new(list([..])),
        param_vals = param_vals
      )

}

super$initialize(id = "forward_selection", pe = pe, tm = tm,
settings = list(max_features = checkmate::assert_numeric(max_features,
Member (review comment): Instead of settings we should use param_set here inheriting from paradox. Similar to how it is done in the Filter class.

Member (review comment): self$param_set = assert_param_set(param_set)

lower = 1,
upper = length(pe$task$feature_names))))

self$state = rep(0, length(pe$task$feature_names))
},

get_result = function() {
bmr = self$pe$bmr[[length(self$pe$bmr)]]$get_best(self$pe$task$measures[[1L]]$id)
list(features = bmr$task$feature_names,
performance = bmr$aggregated)
}
),
private = list(
generate_states = function() {
new_states = list()
for (i in seq_along(self$state)) {
if (self$state[i] == 0) {
state = self$state
state[i] = 1
new_states[[length(new_states) + 1]] = state
}
}
new_states
},
eval_states_terminator = function(states) {
self$tm$update_start(self$pe)
self$pe$eval_states(states)
self$tm$update_end(self$pe)

# Side effect: stop once the maximum number of features is reached
if(!self$tm$terminated) {
self$tm$terminated = (length(states[[1]]) == self$settings$max_features)
}
}
)
)
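Following up on the review comments above, a hedged sketch of how the constructor could look once `settings` is replaced by a paradox `ParamSet` (the class name, the `param_vals` argument, and the exact shape of `super$initialize()` after such a refactoring are assumptions, not part of this PR):

library(R6)
library(paradox)

FeatureSelectionForwardParamSketch = R6Class("FeatureSelectionForwardParamSketch",
  inherit = FeatureSelection,
  public = list(
    param_set = NULL,

    initialize = function(pe, tm, param_vals = list()) {
      n = length(pe$task$feature_names)
      # describe max_features via paradox instead of an ad-hoc settings list
      self$param_set = ParamSet$new(list(
        ParamInt$new("max_features", lower = 1L, upper = n, default = n)
      ))
      self$param_set$values = param_vals
      super$initialize(id = "forward_selection", pe = pe, tm = tm)
      self$state = rep(0, n)
    }
  )
)

The same pattern would presumably apply to FeatureSelectionRandom, with an additional ParamInt for batch_size.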
79 changes: 79 additions & 0 deletions R/FeatureSelectionRandom.R
@@ -0,0 +1,79 @@
#' @title FeatureSelectionRandom
#'
#' @description
#' FeatureSelection child class to conduct random search.
#'
#' @section Usage:
#' ```
#' fs = FeatureSelectionRandom$new(pe, tm, max_features = NA, batch_size = 10)
#' ```
#' See [FeatureSelection] for a description of the interface.
#'
#' @section Arguments:
#' * `pe` (`[PerformanceEvaluator]`).
#' * `tm` (`[Terminator]`).
#' * `max_features` (`integer(1)`):
#'   Maximum number of features.
#' * `batch_size` (`integer(1)`):
#' Maximum number of feature combinations to try in a batch.
#' Each batch is possibly executed in parallel via [mlr3::benchmark()].
#'
#' @section Details:
#' `$new()` creates a new object of class [FeatureSelectionRandom].
#' `$get_result()` returns the best feature combination.
#' The interface is described in [FeatureSelection].
#'
#' @name FeatureSelectionRandom
#' @family FeatureSelection
#' @examples
#' task = mlr3::mlr_tasks$get("boston_housing")
#' learner = mlr3::mlr_learners$get("regr.rpart")
#' resampling = mlr3::mlr_resamplings$get("cv", param_vals = list(folds = 5L))
#' pe = PerformanceEvaluator$new(task = task, learner = learner, resampling = resampling)
#' tm = TerminatorEvaluations$new(max_evaluations = 20)
#' fs = FeatureSelectionRandom$new(pe, tm, batch_size = 10, max_features = 8)
#' fs$calculate()
#' fs$get_result()
NULL

#' @export
#' @include FeatureSelection.R

FeatureSelectionRandom = R6Class("FeatureSelectionRandom",
inherit = FeatureSelection,
public = list(
initialize = function(pe, tm, max_features = NA, batch_size = 10) {
super$initialize(id = "random_selection", pe = pe, tm = tm,
settings = list(max_features = checkmate::assert_numeric(max_features,
Member (review comment): paradox::ParamSet()

lower = 1,
upper = length(pe$task$feature_names)),
batch_size = checkmate::assert_numeric(batch_size)))
},

get_result = function() {
if(length(self$pe$bmr) > 1) {
bmr = lapply(self$pe$bmr[1:length(self$pe$bmr)], function(bmr) self$pe$bmr[[1]]$combine(bmr))
} else {
bmr = self$pe$bmr
}
bmr_best = bmr[[length(bmr)]]$get_best(self$pe$task$measures[[1L]]$id)
list(features = bmr_best$task$feature_names,
performance = bmr_best$aggregated)
}
),
private = list(
generate_states = function() {
lapply(seq_len(self$settings$batch_size), function(i) {
if(is.na(self$settings$max_features)) {
return(rbinom(length(self$pe$task$feature_names), 1, 0.5))
}
x = Inf
while (sum(x) > self$settings$max_features) {
x = rbinom(length(self$pe$task$feature_names), 1, 0.5)
}
return(x)
}
)
}
)
)
89 changes: 89 additions & 0 deletions R/PerformanceEvaluator.R
@@ -0,0 +1,89 @@
#' @title Abstract PerformanceEvaluator Class
#'
#' @description
#' `PerformanceEvaluator` class that implements the performance evaluation of a set of feature combinations. A `PerformanceEvaluator` object stores all information that is necessary to conduct a feature selection (`mlr3::Task`, `mlr3::Learner`, `mlr3::Resampling`).
#'
#' @section Usage:
#' ```
#' # Construction
#' pe = PerformanceEvaluator$new(task, learner, resampling)
#'
#' # Public members
#' pe$task
#' pe$learner
#' pe$resampling
#' pe$bmr
#'
#' # Public methods
#' pe$eval_states(states)
#' pe$get_best()
#' ```
#'
#' @section Arguments:
#' * `task` (`mlr3::Task`):
#' The task that we want to evaluate.
#' * `learner` (`mlr3::Learner`):
#' The learner that we want to evaluate.
#' * `resampling` (`mlr3::Resampling`):
#' The Resampling method that is used to evaluate the learner.
#'
#' @section Details:
#' * `$new()` creates a new object of class [PerformanceEvaluator].
#' * `$task` (`mlr3::Task`) the task for which the feature selection should be conducted.
#' * `$learner` (`mlr3::Learner`) the algorithm for which the feature selection should be conducted.
#' * `$resampling` (`mlr3::Resampling`) strategy to evaluate a feature combination.
#' * `$bmr` (`list`) of `mlr3::BenchmarkResult` objects. Each entry corresponds to one batch or step, depending on the feature selection method used.
#' * `$eval_states(states)` evaluates the feature combinations `states`.
#' * `$get_best()` returns selected features with the best performance of each entry in `$bmr`.
#'
#' @name PerformanceEvaluator
#' @keywords internal
#' @family PerformanceEvaluator
#' @examples
#' task = mlr3::mlr_tasks$get("iris")
#' learner = mlr3::mlr_learners$get("classif.rpart")
#' resampling = mlr3::mlr_resamplings$get("holdout")
#' pe = PerformanceEvaluator$new(task, learner, resampling)
NULL

#' @export
PerformanceEvaluator = R6Class("PerformanceEvaluator",
public = list(
task = NULL,
learner = NULL,
resampling = NULL,
bmr = list(),
states = list(),

initialize = function(task, learner, resampling) {
self$task = mlr3::assert_task(task)
self$learner = mlr3::assert_learner(learner, task = task)
self$resampling = mlr3::assert_resampling(resampling)
},

eval_states = function(states) {
self$states[[length(self$states) + 1]] = states
# For each state, clone task and set feature subset
task_list = list()
for (state in states) {
task = self$task$clone()
task$select(state)
task_list[[length(task_list) + 1]] = task
}

new_bmr = benchmark(data.table::data.table(task = task_list,
learner = list(self$learner),
resampling = list(self$resampling)))

self$bmr[[length(self$bmr) + 1]] = new_bmr
},

get_best = function() {
lapply(self$bmr, function(x) {
rr = x$get_best(self$task$measures[[1L]]$id)
list(features = rr$task$feature_names,
performance = mean(rr$performance(self$task$measures[[1L]]$id)))
})
}
)
)
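The roxygen example above stops after construction; here is a short usage sketch of `$eval_states()` and `$get_best()` (the two iris feature subsets are arbitrary, and the sketch assumes the mlr3 API this PR is written against):

library(mlr3)

task = mlr_tasks$get("iris")
learner = mlr_learners$get("classif.rpart")
resampling = mlr_resamplings$get("holdout")
pe = PerformanceEvaluator$new(task, learner, resampling)

# evaluate two candidate feature subsets as one batch
pe$eval_states(list(
  c("Petal.Length", "Petal.Width"),
  c("Sepal.Length", "Sepal.Width")
))

length(pe$bmr)  # one BenchmarkResult per call to eval_states()
pe$get_best()   # best feature subset and aggregated performance per batch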