Skip to content

Commit

Permalink
Merge pull request #270 from mlr-org/fix_factors_try_2
Browse files Browse the repository at this point in the history
FixFactors and CollapseFactors
  • Loading branch information
mb706 authored Sep 26, 2019
2 parents 7adc4a0 + 1184796 commit 48caf65
Show file tree
Hide file tree
Showing 49 changed files with 667 additions and 1 deletion.
4 changes: 3 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: mlr3pipelines
Title: Preprocessing Operators and Pipelines for 'mlr3'
Version: 0.1.0
Version: 0.1.0.9000
Authors@R:
c(person(given = "Martin",
family = "Binder",
Expand Down Expand Up @@ -89,11 +89,13 @@ Collate:
'PipeOpClassBalancing.R'
'PipeOpClassifAvg.R'
'PipeOpColApply.R'
'PipeOpCollapseFactors.R'
'PipeOpCopy.R'
'PipeOpEncode.R'
'PipeOpEncodeLmer.R'
'PipeOpFeatureUnion.R'
'PipeOpFilter.R'
'PipeOpFixFactors.R'
'PipeOpHistBin.R'
'PipeOpICA.R'
'PipeOpImpute.R'
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,14 @@ export(PipeOpChunk)
export(PipeOpClassBalancing)
export(PipeOpClassifAvg)
export(PipeOpColApply)
export(PipeOpCollapseFactors)
export(PipeOpCopy)
export(PipeOpEncode)
export(PipeOpEncodeLmer)
export(PipeOpEnsemble)
export(PipeOpFeatureUnion)
export(PipeOpFilter)
export(PipeOpFixFactors)
export(PipeOpHistBin)
export(PipeOpICA)
export(PipeOpImpute)
Expand Down
136 changes: 136 additions & 0 deletions R/PipeOpCollapseFactors.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
#' @title PipeOpCollapseFactors
#'
#' @usage NULL
#' @name mlr_pipeops_collapsefactors
#' @format [`R6Class`] object inheriting from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`].
#'
#' @description
#' Collapses levels of features of type `factor` and `ordered`: The rarest levels in the
#' training samples are collapsed until `target_level_count` levels remain. Levels that have prevalence above `no_collapse_above_prevalence`
#' are retained, however. For `factor` variables, the rare levels are collapsed into the least frequent level that is retained; for `ordered` variables,
#' each rare level is collapsed into the neighbouring level, whichever has fewer samples.
#'
#' Levels not seen during training are not touched during prediction; Therefore it is useful to combine this with the
#' [`PipeOpFixFactors`].
#'
#' @section Construction:
#' ```
#' PipeOpCollapseFactors$new(id = "collapsefactors", param_vals = list())
#' ```
#' * `id` :: `character(1)`\cr
#' Identifier of resulting object, default `"collapsefactors"`.
#' * `param_vals` :: named `list`\cr
#' List of hyperparameter settings, overwriting the hyperparameter settings that would otherwise be set during construction. Default `list()`.
#'
#' @section Input and Output Channels:
#' Input and output channels are inherited from [`PipeOpTaskPreproc`].
#'
#' The output is the input [`Task`][mlr3::Task] with rare affected `factor` and `ordered` feature levels collapsed.
#'
#' @section State:
#' The `$state` is a named `list` with the `$state` elements inherited from [`PipeOpTaskPreproc`], as well as:
#' * `collapse_map` :: named `list` of named `list` of `character`\cr
#' List of factor level maps. For each factor, `collapse_map` contains a named `list` that indicates what levels
#' of the input task get mapped to what levels of the output task. If `collapse_map` has an entry `feat_1` with
#' an entry `a = c("x", "y")`, it means that levels `"x"` and `"y"` get collapsed to level `"a"` in feature `"feat_1"`.
#'
#' @section Parameters:
#' The parameters are the parameters inherited from [`PipeOpTaskPreproc`], as well as:
#' * `no_collapse_above_prevalence` :: `numeric(1)` \cr
#' Fraction of samples below which factor levels get collapsed. Default is 1, which causes all levels
#' to be collapsed until `target_level_count` remain.
#' * `target_level_count` :: `integer(1)` \cr
#' Number of levels to retain. Default is 2.
#'
#' @section Internals:
#' Makes use of the fact that `levels(fact_var) = list(target1 = c("source1", "source2"), target2 = "source3")` causes
#' renaming of level `"source1"` and `"source2"` both to `"target1"`, and also `"source3"` to `"target2"`.
#'
#' @section Methods:
#' Only methods inherited from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`].
#'
#' @family PipeOps
#' @include PipeOpTaskPreproc.R
#' @export
#' @examples
#' library("mlr3")
PipeOpCollapseFactors = R6Class("PipeOpCollapseFactors",
  inherit = PipeOpTaskPreprocSimple,
  public = list(
    # Constructor: declares the two hyperparameters and sets their defaults
    # (`no_collapse_above_prevalence = 1`, `target_level_count = 2`), then
    # defers to the parent constructor.
    initialize = function(id = "collapsefactors", param_vals = list()) {
      ps = ParamSet$new(params = list(
        ParamDbl$new("no_collapse_above_prevalence", 0, 1, tags = c("train", "predict")),
        ParamInt$new("target_level_count", 2, tags = c("train", "predict"))
      ))
      ps$values = list(no_collapse_above_prevalence = 1, target_level_count = 2)
      super$initialize(id, param_set = ps, param_vals = param_vals)
    },

    # Returns the ids of all features of type `factor` or `ordered`.
    select_cols = function(task) {
      task$feature_types[get("type") %in% c("factor", "ordered"), get("id")]
    },

    # Computes, for every affected feature, a map `target level -> source levels`
    # from the training task. Features that are all-NA or that already have at
    # most `target_level_count` levels get no entry.
    #
    # Returns `list(collapse_map = <named list of named lists of character>)`.
    get_state = function(task) {
      # get the levels of the training task
      dt = task$data(cols = self$select_cols(task))

      prevalence_threshold = self$param_set$values$no_collapse_above_prevalence
      target_count = self$param_set$values$target_level_count

      collapse_map = lapply(dt, function(d) {
        if (all(is.na(d)) || length(levels(d)) <= target_count) {
          return(NULL)
        }
        dtable = table(d)
        fractions = sort(dtable, decreasing = TRUE) / sum(!is.na(d))
        keep_fraction = names(fractions)[fractions >= prevalence_threshold]
        # at this point we know there are more levels than target_count
        keep_count = names(fractions)[seq_len(target_count)]
        # both vectors are prefixes of the frequency-sorted level names, so
        # `keep` is itself sorted by decreasing frequency
        keep = union(keep_fraction, keep_count)
        dont_keep = setdiff(levels(d), keep)
        if (is.ordered(d)) {
          # start with the identity map and fold each rare level into a neighbour
          cmap = setNames(as.list(levels(d)), levels(d))
          for (eliminating in dont_keep) {
            position = match(eliminating, names(cmap))
            # The level being removed may itself have absorbed other levels in
            # an earlier iteration; all of them must move to the new target,
            # otherwise they silently drop out of the map and stay uncollapsed.
            absorbed = cmap[[position]]
            if (position == 1) {
              target = 2
            } else if (position == length(cmap) || dtable[position - 1] < dtable[position + 1]) {
              target = position - 1
            } else {
              target = position + 1
            }
            cmap[[target]] = c(cmap[[target]], absorbed)
            # keep `dtable` index-aligned with `cmap`; note that neighbour counts
            # are the original training counts and are not increased on merge
            dtable = dtable[-position]
            cmap[[position]] = NULL
          }
        } else {
          # unordered: everything rare goes into the least frequent kept level
          cmap = setNames(as.list(keep), keep)
          lowest_kept = keep[length(keep)]
          cmap[[lowest_kept]] = c(lowest_kept, dont_keep)
        }
        cmap
      })

      list(collapse_map = discard(collapse_map, is.null))
    },

    # Applies the stored `collapse_map` to the given task. Levels that do not
    # occur in the map (i.e. were not seen during training) are mapped to
    # themselves and therefore left untouched.
    transform = function(task) {
      cmaplist = self$state$collapse_map
      dt = task$data(cols = names(cmaplist))

      for (n in names(cmaplist)) {
        # don't touch unseen factor levels
        new_lvls = setdiff(levels(dt[[n]]), unlist(cmaplist[[n]], use.names = FALSE))
        all_lvls = c(cmaplist[[n]], setNames(as.list(new_lvls), new_lvls))
        levels(dt[[n]]) = c(
          all_lvls[intersect(levels(dt[[n]]), names(all_lvls))], # keep all levels in their order, if they were present before
          all_lvls[setdiff(names(all_lvls), levels(dt[[n]]))] # levels that are missing now get sorted to the back.
        )
      }
      task$select(setdiff(task$feature_names, names(cmaplist)))$cbind(dt)
    }
  )
)

mlr_pipeops$add("collapsefactors", PipeOpCollapseFactors)
106 changes: 106 additions & 0 deletions R/PipeOpFixFactors.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
#' @title PipeOpFixFactors
#'
#' @usage NULL
#' @name mlr_pipeops_fixfactors
#' @format [`R6Class`] object inheriting from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`].
#'
#' @description
#' Fixes factors of type `factor`, `ordered`: Makes sure the factor levels
#' during prediction are the same as during training; possibly dropping empty
#' training factor levels before.
#'
#' Note this may introduce *missing values* during prediction if unseen factor levels are found.
#'
#' @section Construction:
#' ```
#' PipeOpFixFactors$new(id = "fixfactors", param_vals = list())
#' ```
#' * `id` :: `character(1)`\cr
#' Identifier of resulting object, default `"fixfactors"`.
#' * `param_vals` :: named `list`\cr
#' List of hyperparameter settings, overwriting the hyperparameter settings that would otherwise be set during construction. Default `list()`.
#'
#' @section Input and Output Channels:
#' Input and output channels are inherited from [`PipeOpTaskPreproc`].
#'
#' The output is the input [`Task`][mlr3::Task] with all affected `factor` and `ordered` feature levels fixed.
#'
#' @section State:
#' The `$state` is a named `list` with the `$state` elements inherited from [`PipeOpTaskPreproc`], as well as:
#' * `levels` :: named `list` of `character`\cr
#' List of factor levels of each affected `factor` or `ordered` feature that will be fixed.
#'
#' @section Parameters:
#' The parameters are the parameters inherited from [`PipeOpTaskPreproc`], as well as:
#' * `droplevels` :: `logical(1)` \cr
#' Whether to drop empty factor levels of the training task. Default `TRUE`
#'
#' @section Internals:
#' Changes factor levels of columns and attaches them with a new `data.table` backend and the virtual `cbind()` backend.
#'
#' @section Methods:
#' Only methods inherited from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`].
#'
#' @family PipeOps
#' @include PipeOpTaskPreproc.R
#' @export
#' @examples
#' library("mlr3")
PipeOpFixFactors = R6Class("PipeOpFixFactors",
  inherit = PipeOpTaskPreprocSimple,
  public = list(
    # Constructor: declares the logical `droplevels` hyperparameter
    # (default `TRUE`) and hands over to the parent constructor.
    initialize = function(id = "fixfactors", param_vals = list()) {
      param_set = ParamSet$new(params = list(
        ParamLgl$new("droplevels", tags = c("train", "predict"))
      ))
      param_set$values = list(droplevels = TRUE)
      super$initialize(id, param_set = param_set, param_vals = param_vals)
    },

    # Ids of all `factor` and `ordered` features of the task.
    select_cols = function(task) {
      ftypes = task$feature_types
      ftypes[["id"]][ftypes[["type"]] %in% c("factor", "ordered")]
    },

    # Records the levels of every affected feature of the training task,
    # after optionally dropping levels that do not occur in the data.
    get_state = function(task) {
      factor_data = task$data(cols = self$select_cols(task))
      if (self$param_set$values$droplevels) {
        factor_data = droplevels(factor_data)
      }
      # explicitly access the "levels" function
      list(levels = lapply(factor_data, function(column) levels(column)))
    },

    # Re-levels every affected feature whose levels differ from the ones
    # recorded during training; the task is returned as-is if none differ.
    transform = function(task) {
      trained = self$state$levels
      dt = task$data(cols = names(trained))

      # which features have levels that deviate from the training levels?
      differs = vapply(names(trained), function(id) {
        !identical(trained[[id]], levels(dt[[id]]))
      }, logical(1), USE.NAMES = FALSE)

      if (!any(differs)) {
        return(task)
      }

      stale_ids = names(trained)[differs]
      # rebuild each deviating column with the training levels, preserving
      # whether it is ordered or not
      relevelled = lapply(stale_ids, function(id) {
        column = dt[[id]]
        factor(column, levels = trained[[id]], ordered = is.ordered(column))
      })
      names(relevelled) = stale_ids
      changed_cols = as.data.table(relevelled)

      untouched = setdiff(task$feature_names, colnames(changed_cols))
      task$select(untouched)$cbind(changed_cols)
    }
  )
)

mlr_pipeops$add("fixfactors", PipeOpFixFactors)

# FIXME: from mlr3; should probably go to mlr3misc
#
# Update join: for every column that `x` and `y` share (other than `key`),
# the values in `x` are overwritten by the values of the matching rows of `y`.
# The assignment is done with data.table's `:=`, i.e. `x` is modified.
#
# x, y: data.tables (the `x[y, ..., on = key]` join syntax is used).
# key:  name(s) of the column(s) to join on.
ujoin = function(x, y, key) {
  cn = setdiff(intersect(names(x), names(y)), key)
  if (!length(cn)) {
    # no shared non-key columns, so there is nothing to update
    return(invisible(x))
  }
  # Build the call `:=`(col1 = i.col1, col2 = i.col2, ...) from symbols instead
  # of parsing pasted text, so column names that are not syntactically valid
  # R identifiers (spaces, operators, ...) cannot break or alter the expression.
  assignments = lapply(paste0("i.", cn), as.name)
  names(assignments) = cn
  expr = as.call(c(list(as.name(":=")), assignments))
  x[y, eval(expr), on = key]
}
2 changes: 2 additions & 0 deletions man/PipeOp.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/PipeOpEnsemble.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/PipeOpImpute.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/PipeOpTaskPreproc.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/mlr_pipeops.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/mlr_pipeops_boxcox.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/mlr_pipeops_branch.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/mlr_pipeops_chunk.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 48caf65

Please sign in to comment.