diff --git a/DESCRIPTION b/DESCRIPTION index b5a5a56e8..0c004295e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: mlr3pipelines Title: Preprocessing Operators and Pipelines for 'mlr3' -Version: 0.1.0 +Version: 0.1.0.9000 Authors@R: c(person(given = "Martin", family = "Binder", @@ -89,11 +89,13 @@ Collate: 'PipeOpClassBalancing.R' 'PipeOpClassifAvg.R' 'PipeOpColApply.R' + 'PipeOpCollapseFactors.R' 'PipeOpCopy.R' 'PipeOpEncode.R' 'PipeOpEncodeLmer.R' 'PipeOpFeatureUnion.R' 'PipeOpFilter.R' + 'PipeOpFixFactors.R' 'PipeOpHistBin.R' 'PipeOpICA.R' 'PipeOpImpute.R' diff --git a/NAMESPACE b/NAMESPACE index 32c2750d1..cef50f0e6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -26,12 +26,14 @@ export(PipeOpChunk) export(PipeOpClassBalancing) export(PipeOpClassifAvg) export(PipeOpColApply) +export(PipeOpCollapseFactors) export(PipeOpCopy) export(PipeOpEncode) export(PipeOpEncodeLmer) export(PipeOpEnsemble) export(PipeOpFeatureUnion) export(PipeOpFilter) +export(PipeOpFixFactors) export(PipeOpHistBin) export(PipeOpICA) export(PipeOpImpute) diff --git a/R/PipeOpCollapseFactors.R b/R/PipeOpCollapseFactors.R new file mode 100644 index 000000000..203e208fa --- /dev/null +++ b/R/PipeOpCollapseFactors.R @@ -0,0 +1,136 @@ +#' @title PipeOpCollapseFactors +#' +#' @usage NULL +#' @name mlr_pipeops_collapsefactors +#' @format [`R6Class`] object inheriting from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`]. +#' +#' @description +#' Collapses factors of type `factor`, `ordered`: Collapses the rarest levels in the +#' training samples, until `target_level_count` levels remain. Levels that have prevalence above `no_collapse_above_prevalence` +#' are retained, however. For `factor` variables, rare levels are collapsed to the next larger level; for `ordered` variables, +#' rare levels are collapsed to the neighbouring level, whichever has fewer samples. 
+#' +#' Levels not seen during training are not touched during prediction; therefore it is useful to combine this with +#' [`PipeOpFixFactors`]. +#' +#' @section Construction: +#' ``` +#' PipeOpCollapseFactors$new(id = "collapsefactors", param_vals = list()) +#' ``` +#' * `id` :: `character(1)`\cr +#' Identifier of resulting object, default `"collapsefactors"`. +#' * `param_vals` :: named `list`\cr +#' List of hyperparameter settings, overwriting the hyperparameter settings that would otherwise be set during construction. Default `list()`. +#' +#' @section Input and Output Channels: +#' Input and output channels are inherited from [`PipeOpTaskPreproc`]. +#' +#' The output is the input [`Task`][mlr3::Task] with rare levels of the affected `factor` and `ordered` features collapsed. +#' +#' @section State: +#' The `$state` is a named `list` with the `$state` elements inherited from [`PipeOpTaskPreproc`], as well as: +#' * `collapse_map` :: named `list` of named `list` of `character`\cr +#' List of factor level maps. For each factor, `collapse_map` contains a named `list` that indicates what levels +#' of the input task get mapped to what levels of the output task. If `collapse_map` has an entry `feat_1` with +#' an entry `a = c("x", "y")`, it means that levels `"x"` and `"y"` get collapsed to level `"a"` in feature `"feat_1"`. +#' +#' @section Parameters: +#' The parameters are the parameters inherited from [`PipeOpTaskPreproc`], as well as: +#' * `no_collapse_above_prevalence` :: `numeric(1)` \cr +#' Fraction of samples below which factor levels get collapsed. Default is 1, which causes all levels +#' to be collapsed until `target_level_count` remain. +#' * `target_level_count` :: `integer(1)` \cr +#' Number of levels to retain. Default is 2. 
+#' +#' @section Internals: +#' Makes use of the fact that `levels(fact_var) = list(target1 = c("source1", "source2"), target2 = "source3")` causes +#' renaming of level `"source1"` and `"source2"` both to `"target1"`, and `"source3"` to `"target2"`. +#' +#' @section Methods: +#' Only methods inherited from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`]. +#' +#' @family PipeOps +#' @include PipeOpTaskPreproc.R +#' @export +#' @examples +#' library("mlr3") +PipeOpCollapseFactors = R6Class("PipeOpCollapseFactors", + inherit = PipeOpTaskPreprocSimple, + public = list( + initialize = function(id = "collapsefactors", param_vals = list()) { + ps = ParamSet$new(params = list( + ParamDbl$new("no_collapse_above_prevalence", 0, 1, tags = c("train", "predict")), + ParamInt$new("target_level_count", 2, tags = c("train", "predict")) + )) + ps$values = list(no_collapse_above_prevalence = 1, target_level_count = 2) + super$initialize(id, param_set = ps, param_vals = param_vals) + }, + + select_cols = function(task) { + task$feature_types[get("type") %in% c("factor", "ordered"), get("id")] + }, + + get_state = function(task) { + # get the levels of the training task + dt = task$data(cols = self$select_cols(task)) + + keep_fraction = self$param_set$values$no_collapse_above_prevalence + target_count = self$param_set$values$target_level_count + + collapse_map = sapply(dt, function(d) { + if (all(is.na(d))) { + return(NULL) + } + if (length(levels(d)) <= target_count) { + return(NULL) + } + dtable = table(d) + fractions = sort(dtable, decreasing = TRUE) / sum(!is.na(d)) + keep_fraction = names(fractions)[fractions >= keep_fraction] + keep_count = names(fractions)[seq_len(target_count)] # at this point we know there are more levels than target_count + keep = union(keep_fraction, keep_count) + dont_keep = setdiff(levels(d), keep) + if (is.ordered(d)) { + cmap = setNames(as.list(levels(d)), levels(d)) + for (eliminating in dont_keep) { + position = match(eliminating, 
names(cmap)) + if (position == 1) { + cmap[[2]] = c(cmap[[2]], cmap[[position]]) # move everything already merged into this level, not only the level itself + } else if (position == length(cmap) || dtable[position - 1] < dtable[position + 1]) { + cmap[[position - 1]] = c(cmap[[position - 1]], cmap[[position]]) + } else { + cmap[[position + 1]] = c(cmap[[position + 1]], cmap[[position]]) + } + dtable = dtable[-position] # keep counts aligned with the positions in cmap + cmap[[position]] = NULL + } + } else { + cmap = setNames(as.list(keep), keep) + lowest_kept = keep[length(keep)] # keep is ordered by decreasing frequency, so this is the rarest retained level + cmap[[lowest_kept]] = c(lowest_kept, dont_keep) + } + cmap + }, simplify = FALSE) + + list(collapse_map = discard(collapse_map, is.null)) + }, + + transform = function(task) { + cmaplist = self$state$collapse_map + dt = task$data(cols = names(cmaplist)) + + for (n in names(cmaplist)) { + # don't touch unseen factor levels + new_lvls = setdiff(levels(dt[[n]]), unlist(cmaplist[[n]], use.names = FALSE)) + all_lvls = c(cmaplist[[n]], setNames(as.list(new_lvls), new_lvls)) + levels(dt[[n]]) = c( + all_lvls[intersect(levels(dt[[n]]), names(all_lvls))], # keep all levels in their order, if they were present before + all_lvls[setdiff(names(all_lvls), levels(dt[[n]]))] # levels that are missing now get sorted to the back. + ) + } + task$select(setdiff(task$feature_names, names(cmaplist)))$cbind(dt) + } + ) +) + +mlr_pipeops$add("collapsefactors", PipeOpCollapseFactors) diff --git a/R/PipeOpFixFactors.R b/R/PipeOpFixFactors.R new file mode 100644 index 000000000..1ab58f7ad --- /dev/null +++ b/R/PipeOpFixFactors.R @@ -0,0 +1,106 @@ +#' @title PipeOpFixFactors +#' +#' @usage NULL +#' @name mlr_pipeops_fixfactors +#' @format [`R6Class`] object inheriting from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`]. +#' +#' @description +#' Fixes factors of type `factor`, `ordered`: Makes sure the factor levels +#' during prediction are the same as during training; possibly dropping empty +#' training factor levels before. +#' +#' Note this may introduce *missing values* during prediction if unseen factor levels are found. 
+#' +#' @section Construction: +#' ``` +#' PipeOpFixFactors$new(id = "fixfactors", param_vals = list()) +#' ``` +#' * `id` :: `character(1)`\cr +#' Identifier of resulting object, default `"fixfactors"`. +#' * `param_vals` :: named `list`\cr +#' List of hyperparameter settings, overwriting the hyperparameter settings that would otherwise be set during construction. Default `list()`. +#' +#' @section Input and Output Channels: +#' Input and output channels are inherited from [`PipeOpTaskPreproc`]. +#' +#' The output is the input [`Task`][mlr3::Task] with all affected `factor` and `ordered` feature levels fixed. +#' +#' @section State: +#' The `$state` is a named `list` with the `$state` elements inherited from [`PipeOpTaskPreproc`], as well as: +#' * `levels` :: named `list` of `character`\cr +#' List of factor levels of each affected `factor` or `ordered` feature that will be fixed. +#' +#' @section Parameters: +#' The parameters are the parameters inherited from [`PipeOpTaskPreproc`], as well as: +#' * `droplevels` :: `logical(1)` \cr +#' Whether to drop empty factor levels of the training task. Default `TRUE` +#' +#' @section Internals: +#' Changes factor levels of columns and attaches them with a new `data.table` backend and the virtual `cbind()` backend. +#' +#' @section Methods: +#' Only methods inherited from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`]. 
+#' +#' @family PipeOps +#' @include PipeOpTaskPreproc.R +#' @export +#' @examples +#' library("mlr3") +PipeOpFixFactors = R6Class("PipeOpFixFactors", + inherit = PipeOpTaskPreprocSimple, + public = list( + initialize = function(id = "fixfactors", param_vals = list()) { + ps = ParamSet$new(params = list( + ParamLgl$new("droplevels", tags = c("train", "predict")) + )) + ps$values = list(droplevels = TRUE) + super$initialize(id, param_set = ps, param_vals = param_vals) + }, + + select_cols = function(task) { + task$feature_types[get("type") %in% c("factor", "ordered"), get("id")] + }, + + get_state = function(task) { + # get the levels of the training task + dt = task$data(cols = self$select_cols(task)) + if (self$param_set$values$droplevels) { + dt = droplevels(dt) # levels without any training rows are not remembered + } + list(levels = lapply(dt, function(x) levels(x))) # explicitly access the "levels" function + }, + + transform = function(task) { + dt = task$data(cols = names(self$state$levels)) + + # check which levels are actually different during training and prediction + needs_adjustment = as.logical(imap(self$state$levels, function(lvx, id) { + !identical(lvx, levels(dt[[id]])) + })) + + if (!any(needs_adjustment)) { + return(task) # every column already has exactly the stored levels; task is returned unchanged + } + + changed_cols = as.data.table(imap(self$state$levels[needs_adjustment], function(lvx, id) { + x = dt[[id]] + if (is.ordered(x)) { + ordered(x, levels = lvx) + } else { + factor(x, levels = lvx) # values whose level is not in lvx become NA + } + })) + task$select(setdiff(task$feature_names, colnames(changed_cols)))$cbind(changed_cols) + } + ) +) + +mlr_pipeops$add("fixfactors", PipeOpFixFactors) + +# FIXME: from mlr3; should probably go to mlr3misc +ujoin = function (x, y, key) { # NOTE(review): presumably x and y are data.tables; sets the non-key columns they share in x to the values from y, matching rows on key. Not called anywhere in this hunk. + cn = setdiff(intersect(names(x), names(y)), key) + expr = parse(text = paste0("`:=`(", paste0(sprintf("%1$s=i.%1$s", + cn), collapse = ","), ")")) + x[y, eval(expr), on = key] +} diff --git a/man/PipeOp.Rd b/man/PipeOp.Rd index c4a7c8f4d..706097fcf 100644 --- a/man/PipeOp.Rd +++ b/man/PipeOp.Rd @@ -207,11 +207,13 @@ Other PipeOps: 
\code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, diff --git a/man/PipeOpEnsemble.Rd b/man/PipeOpEnsemble.Rd index 59f6cb427..e19157e50 100644 --- a/man/PipeOpEnsemble.Rd +++ b/man/PipeOpEnsemble.Rd @@ -97,11 +97,13 @@ Other PipeOps: \code{\link{PipeOpImpute}}, \code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, diff --git a/man/PipeOpImpute.Rd b/man/PipeOpImpute.Rd index fcf82cb16..7a6c8de43 100644 --- a/man/PipeOpImpute.Rd +++ b/man/PipeOpImpute.Rd @@ -118,11 +118,13 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, diff --git a/man/PipeOpTaskPreproc.Rd b/man/PipeOpTaskPreproc.Rd index c11c5189b..3ed96fcbb 100644 
--- a/man/PipeOpTaskPreproc.Rd +++ b/man/PipeOpTaskPreproc.Rd @@ -177,11 +177,13 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, diff --git a/man/mlr_pipeops.Rd b/man/mlr_pipeops.Rd index 20c40e366..f5c380ba1 100644 --- a/man/mlr_pipeops.Rd +++ b/man/mlr_pipeops.Rd @@ -68,11 +68,13 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, diff --git a/man/mlr_pipeops_boxcox.Rd b/man/mlr_pipeops_boxcox.Rd index 28e5eb628..92143a59d 100644 --- a/man/mlr_pipeops_boxcox.Rd +++ b/man/mlr_pipeops_boxcox.Rd @@ -83,11 +83,13 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, 
\code{\link{mlr_pipeops_imputehist}}, diff --git a/man/mlr_pipeops_branch.Rd b/man/mlr_pipeops_branch.Rd index 69477b4ba..a8092b7f1 100644 --- a/man/mlr_pipeops_branch.Rd +++ b/man/mlr_pipeops_branch.Rd @@ -103,11 +103,13 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, diff --git a/man/mlr_pipeops_chunk.Rd b/man/mlr_pipeops_chunk.Rd index 339ea820a..954e47836 100644 --- a/man/mlr_pipeops_chunk.Rd +++ b/man/mlr_pipeops_chunk.Rd @@ -82,11 +82,13 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, diff --git a/man/mlr_pipeops_classbalancing.Rd b/man/mlr_pipeops_classbalancing.Rd index dd20d3420..137319bf2 100644 --- a/man/mlr_pipeops_classbalancing.Rd +++ b/man/mlr_pipeops_classbalancing.Rd @@ -121,11 +121,13 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_chunk}}, \code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, 
\code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, diff --git a/man/mlr_pipeops_classifavg.Rd b/man/mlr_pipeops_classifavg.Rd index 534b0437a..b3b66b581 100644 --- a/man/mlr_pipeops_classifavg.Rd +++ b/man/mlr_pipeops_classifavg.Rd @@ -89,11 +89,13 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_chunk}}, \code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, diff --git a/man/mlr_pipeops_colapply.Rd b/man/mlr_pipeops_colapply.Rd index c22e706bd..7c489185a 100644 --- a/man/mlr_pipeops_colapply.Rd +++ b/man/mlr_pipeops_colapply.Rd @@ -112,11 +112,13 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_chunk}}, \code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_classifavg}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, diff --git a/man/mlr_pipeops_collapsefactors.Rd b/man/mlr_pipeops_collapsefactors.Rd new file mode 100644 index 000000000..d0c7fb063 --- /dev/null +++ b/man/mlr_pipeops_collapsefactors.Rd @@ -0,0 +1,118 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PipeOpCollapseFactors.R +\docType{data} 
+\name{mlr_pipeops_collapsefactors} +\alias{mlr_pipeops_collapsefactors} +\alias{PipeOpCollapseFactors} +\title{PipeOpCollapseFactors} +\format{\code{\link{R6Class}} object inheriting from \code{\link{PipeOpTaskPreprocSimple}}/\code{\link{PipeOpTaskPreproc}}/\code{\link{PipeOp}}.} +\description{ +Collapses factors of type \code{factor}, \code{ordered}: Collapses the rarest factors in the +training samples, until \code{target_level_count} levels remain. Levels that have prevalence above \code{no_collapse_above_prevalence} +are retained, however. For \code{factor} variables, these are collapsed to the next larger level, for \code{ordered} variables, +rare variables are collapsed to the neighbouring class, whichever has fewer samples. + +Levels not seen during training are not touched during prediction; Therefore it is useful to combine this with the +\code{\link{PipeOpFixFactors}}. +} +\section{Construction}{ +\preformatted{PipeOpCollapseFactors$new(id = "collapsefactors", param_vals = list()) +} +\itemize{ +\item \code{id} :: \code{character(1)}\cr +Identifier of resulting object, default \code{"collapsefactors"}. +\item \code{param_vals} :: named \code{list}\cr +List of hyperparameter settings, overwriting the hyperparameter settings that would otherwise be set during construction. Default \code{list()}. +} +} + +\section{Input and Output Channels}{ + +Input and output channels are inherited from \code{\link{PipeOpTaskPreproc}}. + +The output is the input \code{\link[mlr3:Task]{Task}} with rare affected \code{factor} and \code{ordered} feature levels collapsed. +} + +\section{State}{ + +The \code{$state} is a named \code{list} with the \code{$state} elements inherited from \code{\link{PipeOpTaskPreproc}}, as well as: +\itemize{ +\item \code{collapse_map} :: named \code{list} of named \code{list} of \code{character}\cr +List of factor level maps. 
For each factor, \code{collapse_map} contains a named \code{list} that indicates what levels +of the input task get mapped to what levels of the output task. If \code{collapse_map} has an entry \code{feat_1} with +an entry \code{a = c("x", "y")}, it means that levels \code{"x"} and \code{"y"} get collapsed to level \code{"a"} in feature \code{"feat_1"}. +} +} + +\section{Parameters}{ + +The parameters are the parameters inherited from \code{\link{PipeOpTaskPreproc}}, as well as: +\itemize{ +\item \code{no_collapse_above_prevalence} :: \code{numeric(1)} \cr +Fraction of samples below which factor levels get collapsed. Default is 1, which causes all levels +to be collapsed until \code{target_level_count} remain. +\item \code{target_level_count} :: \code{integer(1)} \cr +Number of levels to retain. Default is 2. +} +} + +\section{Internals}{ + +Makes use of the fact that \code{levels(fact_var) = list(target1 = c("source1", "source2"), target2 = "source2")} causes +renaming of level \code{"source1"} and \code{"source2"} both to \code{"target1"}, and also \code{"source2"} to \code{"target2"}. +} + +\section{Methods}{ + +Only methods inherited from \code{\link{PipeOpTaskPreprocSimple}}/\code{\link{PipeOpTaskPreproc}}/\code{\link{PipeOp}}. 
+} + +\examples{ +library("mlr3") +} +\seealso{ +Other PipeOps: \code{\link{PipeOpEnsemble}}, + \code{\link{PipeOpImpute}}, + \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, + \code{\link{mlr_pipeops_boxcox}}, + \code{\link{mlr_pipeops_branch}}, + \code{\link{mlr_pipeops_chunk}}, + \code{\link{mlr_pipeops_classbalancing}}, + \code{\link{mlr_pipeops_classifavg}}, + \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_copy}}, + \code{\link{mlr_pipeops_encodelmer}}, + \code{\link{mlr_pipeops_encode}}, + \code{\link{mlr_pipeops_featureunion}}, + \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, + \code{\link{mlr_pipeops_histbin}}, + \code{\link{mlr_pipeops_ica}}, + \code{\link{mlr_pipeops_imputehist}}, + \code{\link{mlr_pipeops_imputemean}}, + \code{\link{mlr_pipeops_imputemedian}}, + \code{\link{mlr_pipeops_imputenewlvl}}, + \code{\link{mlr_pipeops_imputesample}}, + \code{\link{mlr_pipeops_kernelpca}}, + \code{\link{mlr_pipeops_learner}}, + \code{\link{mlr_pipeops_missind}}, + \code{\link{mlr_pipeops_modelmatrix}}, + \code{\link{mlr_pipeops_mutate}}, + \code{\link{mlr_pipeops_nop}}, + \code{\link{mlr_pipeops_pca}}, + \code{\link{mlr_pipeops_quantilebin}}, + \code{\link{mlr_pipeops_regravg}}, + \code{\link{mlr_pipeops_removeconstants}}, + \code{\link{mlr_pipeops_scalemaxabs}}, + \code{\link{mlr_pipeops_scalerange}}, + \code{\link{mlr_pipeops_scale}}, + \code{\link{mlr_pipeops_select}}, + \code{\link{mlr_pipeops_smote}}, + \code{\link{mlr_pipeops_spatialsign}}, + \code{\link{mlr_pipeops_subsample}}, + \code{\link{mlr_pipeops_unbranch}}, + \code{\link{mlr_pipeops_yeojohnson}}, + \code{\link{mlr_pipeops}} +} +\concept{PipeOps} +\keyword{datasets} diff --git a/man/mlr_pipeops_copy.Rd b/man/mlr_pipeops_copy.Rd index ec94a1d5f..381171f46 100644 --- a/man/mlr_pipeops_copy.Rd +++ b/man/mlr_pipeops_copy.Rd @@ -102,10 +102,12 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_classbalancing}}, 
\code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, diff --git a/man/mlr_pipeops_encode.Rd b/man/mlr_pipeops_encode.Rd index 9938220c9..654ce0cab 100644 --- a/man/mlr_pipeops_encode.Rd +++ b/man/mlr_pipeops_encode.Rd @@ -105,10 +105,12 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, diff --git a/man/mlr_pipeops_encodelmer.Rd b/man/mlr_pipeops_encodelmer.Rd index 595dfd71a..917e0c326 100644 --- a/man/mlr_pipeops_encodelmer.Rd +++ b/man/mlr_pipeops_encodelmer.Rd @@ -108,10 +108,12 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, diff --git a/man/mlr_pipeops_featureunion.Rd b/man/mlr_pipeops_featureunion.Rd index 9c72ad4a7..8e60ac8bd 100644 --- a/man/mlr_pipeops_featureunion.Rd +++ b/man/mlr_pipeops_featureunion.Rd @@ -98,10 +98,12 @@ Other 
PipeOps: \code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, diff --git a/man/mlr_pipeops_filter.Rd b/man/mlr_pipeops_filter.Rd index 351928259..2bef134c5 100644 --- a/man/mlr_pipeops_filter.Rd +++ b/man/mlr_pipeops_filter.Rd @@ -126,10 +126,12 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, diff --git a/man/mlr_pipeops_fixfactors.Rd b/man/mlr_pipeops_fixfactors.Rd new file mode 100644 index 000000000..6d93f1c40 --- /dev/null +++ b/man/mlr_pipeops_fixfactors.Rd @@ -0,0 +1,110 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PipeOpFixFactors.R +\docType{data} +\name{mlr_pipeops_fixfactors} +\alias{mlr_pipeops_fixfactors} +\alias{PipeOpFixFactors} +\title{PipeOpFixFactors} +\format{\code{\link{R6Class}} object inheriting from \code{\link{PipeOpTaskPreprocSimple}}/\code{\link{PipeOpTaskPreproc}}/\code{\link{PipeOp}}.} +\description{ +Fixes factors of type \code{factor}, \code{ordered}: Makes sure the factor levels +during prediction are the same as during training; possibly dropping empty +training factor levels before. 
+ +Note this may introduce \emph{missing values} during prediction if unseen factor levels are found. +} +\section{Construction}{ +\preformatted{PipeOpFixFactors$new(id = "fixfactors", param_vals = list()) +} +\itemize{ +\item \code{id} :: \code{character(1)}\cr +Identifier of resulting object, default \code{"fixfactors"}. +\item \code{param_vals} :: named \code{list}\cr +List of hyperparameter settings, overwriting the hyperparameter settings that would otherwise be set during construction. Default \code{list()}. +} +} + +\section{Input and Output Channels}{ + +Input and output channels are inherited from \code{\link{PipeOpTaskPreproc}}. + +The output is the input \code{\link[mlr3:Task]{Task}} with all affected \code{factor} and \code{ordered} feature levels fixed. +} + +\section{State}{ + +The \code{$state} is a named \code{list} with the \code{$state} elements inherited from \code{\link{PipeOpTaskPreproc}}, as well as: +\itemize{ +\item \code{levels} :: named \code{list} of \code{character}\cr +List of factor levels of each affected \code{factor} or \code{ordered} feature that will be fixed. +} +} + +\section{Parameters}{ + +The parameters are the parameters inherited from \code{\link{PipeOpTaskPreproc}}, as well as: +\itemize{ +\item \code{droplevels} :: \code{logical(1)} \cr +Whether to drop empty factor levels of the training task. Default \code{TRUE} +} +} + +\section{Internals}{ + +Changes factor levels of columns and attaches them with a new \code{data.table} backend and the virtual \code{cbind()} backend. +} + +\section{Methods}{ + +Only methods inherited from \code{\link{PipeOpTaskPreprocSimple}}/\code{\link{PipeOpTaskPreproc}}/\code{\link{PipeOp}}. 
+} + +\examples{ +library("mlr3") +} +\seealso{ +Other PipeOps: \code{\link{PipeOpEnsemble}}, + \code{\link{PipeOpImpute}}, + \code{\link{PipeOpTaskPreproc}}, \code{\link{PipeOp}}, + \code{\link{mlr_pipeops_boxcox}}, + \code{\link{mlr_pipeops_branch}}, + \code{\link{mlr_pipeops_chunk}}, + \code{\link{mlr_pipeops_classbalancing}}, + \code{\link{mlr_pipeops_classifavg}}, + \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, + \code{\link{mlr_pipeops_copy}}, + \code{\link{mlr_pipeops_encodelmer}}, + \code{\link{mlr_pipeops_encode}}, + \code{\link{mlr_pipeops_featureunion}}, + \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_histbin}}, + \code{\link{mlr_pipeops_ica}}, + \code{\link{mlr_pipeops_imputehist}}, + \code{\link{mlr_pipeops_imputemean}}, + \code{\link{mlr_pipeops_imputemedian}}, + \code{\link{mlr_pipeops_imputenewlvl}}, + \code{\link{mlr_pipeops_imputesample}}, + \code{\link{mlr_pipeops_kernelpca}}, + \code{\link{mlr_pipeops_learner}}, + \code{\link{mlr_pipeops_missind}}, + \code{\link{mlr_pipeops_modelmatrix}}, + \code{\link{mlr_pipeops_mutate}}, + \code{\link{mlr_pipeops_nop}}, + \code{\link{mlr_pipeops_pca}}, + \code{\link{mlr_pipeops_quantilebin}}, + \code{\link{mlr_pipeops_regravg}}, + \code{\link{mlr_pipeops_removeconstants}}, + \code{\link{mlr_pipeops_scalemaxabs}}, + \code{\link{mlr_pipeops_scalerange}}, + \code{\link{mlr_pipeops_scale}}, + \code{\link{mlr_pipeops_select}}, + \code{\link{mlr_pipeops_smote}}, + \code{\link{mlr_pipeops_spatialsign}}, + \code{\link{mlr_pipeops_subsample}}, + \code{\link{mlr_pipeops_unbranch}}, + \code{\link{mlr_pipeops_yeojohnson}}, + \code{\link{mlr_pipeops}} +} +\concept{PipeOps} +\keyword{datasets} diff --git a/man/mlr_pipeops_histbin.Rd b/man/mlr_pipeops_histbin.Rd index 78dd6d32f..9cfb53693 100644 --- a/man/mlr_pipeops_histbin.Rd +++ b/man/mlr_pipeops_histbin.Rd @@ -82,11 +82,13 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_classbalancing}}, 
\code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, \code{\link{mlr_pipeops_imputemean}}, diff --git a/man/mlr_pipeops_ica.Rd b/man/mlr_pipeops_ica.Rd index 6000c170e..bed7d740d 100644 --- a/man/mlr_pipeops_ica.Rd +++ b/man/mlr_pipeops_ica.Rd @@ -110,11 +110,13 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_imputehist}}, \code{\link{mlr_pipeops_imputemean}}, diff --git a/man/mlr_pipeops_imputehist.Rd b/man/mlr_pipeops_imputehist.Rd index d9eac8b36..7ef6c2977 100644 --- a/man/mlr_pipeops_imputehist.Rd +++ b/man/mlr_pipeops_imputehist.Rd @@ -71,11 +71,13 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputemean}}, diff --git a/man/mlr_pipeops_imputemean.Rd b/man/mlr_pipeops_imputemean.Rd index e22dd44eb..2b800ab15 100644 --- 
a/man/mlr_pipeops_imputemean.Rd +++ b/man/mlr_pipeops_imputemean.Rd @@ -71,11 +71,13 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, diff --git a/man/mlr_pipeops_imputemedian.Rd b/man/mlr_pipeops_imputemedian.Rd index 4a001d67d..b89d8327f 100644 --- a/man/mlr_pipeops_imputemedian.Rd +++ b/man/mlr_pipeops_imputemedian.Rd @@ -71,11 +71,13 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, diff --git a/man/mlr_pipeops_imputenewlvl.Rd b/man/mlr_pipeops_imputenewlvl.Rd index 04313acfb..1b721bfd1 100644 --- a/man/mlr_pipeops_imputenewlvl.Rd +++ b/man/mlr_pipeops_imputenewlvl.Rd @@ -70,11 +70,13 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, 
\code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, diff --git a/man/mlr_pipeops_imputesample.Rd b/man/mlr_pipeops_imputesample.Rd index dcfb127c6..c2366d932 100644 --- a/man/mlr_pipeops_imputesample.Rd +++ b/man/mlr_pipeops_imputesample.Rd @@ -70,11 +70,13 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, diff --git a/man/mlr_pipeops_kernelpca.Rd b/man/mlr_pipeops_kernelpca.Rd index 21ff9e954..ddfcfd86f 100644 --- a/man/mlr_pipeops_kernelpca.Rd +++ b/man/mlr_pipeops_kernelpca.Rd @@ -85,11 +85,13 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, diff --git a/man/mlr_pipeops_learner.Rd b/man/mlr_pipeops_learner.Rd index e8e48ab37..6be255820 100644 --- a/man/mlr_pipeops_learner.Rd +++ b/man/mlr_pipeops_learner.Rd @@ -102,11 +102,13 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, 
\code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, diff --git a/man/mlr_pipeops_missind.Rd b/man/mlr_pipeops_missind.Rd index 81e0d5d64..28ffaabce 100644 --- a/man/mlr_pipeops_missind.Rd +++ b/man/mlr_pipeops_missind.Rd @@ -97,11 +97,13 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, diff --git a/man/mlr_pipeops_modelmatrix.Rd b/man/mlr_pipeops_modelmatrix.Rd index 667d4b2af..4044a25f7 100644 --- a/man/mlr_pipeops_modelmatrix.Rd +++ b/man/mlr_pipeops_modelmatrix.Rd @@ -77,11 +77,13 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, diff --git a/man/mlr_pipeops_mutate.Rd b/man/mlr_pipeops_mutate.Rd index c61b69399..f8431ed59 100644 --- a/man/mlr_pipeops_mutate.Rd +++ b/man/mlr_pipeops_mutate.Rd @@ -87,11 +87,13 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, 
\code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, diff --git a/man/mlr_pipeops_nop.Rd b/man/mlr_pipeops_nop.Rd index 9819472aa..95dce7cc8 100644 --- a/man/mlr_pipeops_nop.Rd +++ b/man/mlr_pipeops_nop.Rd @@ -79,11 +79,13 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, diff --git a/man/mlr_pipeops_pca.Rd b/man/mlr_pipeops_pca.Rd index 7486a2a5a..9894eea9e 100644 --- a/man/mlr_pipeops_pca.Rd +++ b/man/mlr_pipeops_pca.Rd @@ -88,11 +88,13 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, diff --git a/man/mlr_pipeops_quantilebin.Rd b/man/mlr_pipeops_quantilebin.Rd index 8fb87b063..0a27b7fc6 100644 --- 
a/man/mlr_pipeops_quantilebin.Rd +++ b/man/mlr_pipeops_quantilebin.Rd @@ -76,11 +76,13 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, diff --git a/man/mlr_pipeops_regravg.Rd b/man/mlr_pipeops_regravg.Rd index 52f0dfa82..805ce85b1 100644 --- a/man/mlr_pipeops_regravg.Rd +++ b/man/mlr_pipeops_regravg.Rd @@ -84,11 +84,13 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, diff --git a/man/mlr_pipeops_removeconstants.Rd b/man/mlr_pipeops_removeconstants.Rd index ede37e3ad..bacb84468 100644 --- a/man/mlr_pipeops_removeconstants.Rd +++ b/man/mlr_pipeops_removeconstants.Rd @@ -81,11 +81,13 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, 
\code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, diff --git a/man/mlr_pipeops_scale.Rd b/man/mlr_pipeops_scale.Rd index 8cc6fc10a..f28bb197b 100644 --- a/man/mlr_pipeops_scale.Rd +++ b/man/mlr_pipeops_scale.Rd @@ -89,11 +89,13 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, diff --git a/man/mlr_pipeops_scalemaxabs.Rd b/man/mlr_pipeops_scalemaxabs.Rd index 5c0a3ded3..afb8ea8cc 100644 --- a/man/mlr_pipeops_scalemaxabs.Rd +++ b/man/mlr_pipeops_scalemaxabs.Rd @@ -70,11 +70,13 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, diff --git a/man/mlr_pipeops_scalerange.Rd b/man/mlr_pipeops_scalerange.Rd index f2e7bb852..927c7c294 100644 --- a/man/mlr_pipeops_scalerange.Rd +++ b/man/mlr_pipeops_scalerange.Rd @@ -74,11 +74,13 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, 
\code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, diff --git a/man/mlr_pipeops_select.Rd b/man/mlr_pipeops_select.Rd index a8e2dc778..0bb250578 100644 --- a/man/mlr_pipeops_select.Rd +++ b/man/mlr_pipeops_select.Rd @@ -91,11 +91,13 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, diff --git a/man/mlr_pipeops_smote.Rd b/man/mlr_pipeops_smote.Rd index 1342596a3..73452abb8 100644 --- a/man/mlr_pipeops_smote.Rd +++ b/man/mlr_pipeops_smote.Rd @@ -93,11 +93,13 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, diff --git a/man/mlr_pipeops_spatialsign.Rd b/man/mlr_pipeops_spatialsign.Rd index 66452d64a..99281c9c5 100644 --- a/man/mlr_pipeops_spatialsign.Rd +++ b/man/mlr_pipeops_spatialsign.Rd @@ -70,11 +70,13 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, 
\code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, diff --git a/man/mlr_pipeops_subsample.Rd b/man/mlr_pipeops_subsample.Rd index 1adb7cc12..605da430c 100644 --- a/man/mlr_pipeops_subsample.Rd +++ b/man/mlr_pipeops_subsample.Rd @@ -89,11 +89,13 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, diff --git a/man/mlr_pipeops_unbranch.Rd b/man/mlr_pipeops_unbranch.Rd index a3405c64d..e357c8da1 100644 --- a/man/mlr_pipeops_unbranch.Rd +++ b/man/mlr_pipeops_unbranch.Rd @@ -82,11 +82,13 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, diff --git a/man/mlr_pipeops_yeojohnson.Rd 
b/man/mlr_pipeops_yeojohnson.Rd index 158101398..045e9fe20 100644 --- a/man/mlr_pipeops_yeojohnson.Rd +++ b/man/mlr_pipeops_yeojohnson.Rd @@ -85,11 +85,13 @@ Other PipeOps: \code{\link{PipeOpEnsemble}}, \code{\link{mlr_pipeops_classbalancing}}, \code{\link{mlr_pipeops_classifavg}}, \code{\link{mlr_pipeops_colapply}}, + \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_featureunion}}, \code{\link{mlr_pipeops_filter}}, + \code{\link{mlr_pipeops_fixfactors}}, \code{\link{mlr_pipeops_histbin}}, \code{\link{mlr_pipeops_ica}}, \code{\link{mlr_pipeops_imputehist}}, diff --git a/tests/testthat/test_pipeop_fixfactors.R b/tests/testthat/test_pipeop_fixfactors.R new file mode 100644 index 000000000..b67001a2e --- /dev/null +++ b/tests/testthat/test_pipeop_fixfactors.R @@ -0,0 +1,108 @@ +context("PipeOpFixFactors") + +test_that("PipeOpFixFactors", { + task = mlr_tasks$get("boston_housing") + + chaslevels = task$levels()$chas + townlevels = task$levels()$town + + expect_datapreproc_pipeop_class(PipeOpFixFactors, task = task) + + expect_datapreproc_pipeop_class(PipeOpFixFactors, task = mlr_tasks$get("iris")) + + op = PipeOpFixFactors$new() + expect_pipeop(op) + + nt = train_pipeop(op, inputs = list(task))[[1L]] + fn = nt$feature_names + + # factor cols are retained + expect_true(all(c("chas", "town") %in% fn)) + expect_set_equal(nt$levels()$chas, c("0", "1")) + + nt = op$train(list(task$clone()$filter(1)))[[1]] + + expect_equal(nt$levels()$chas, "0") + expect_equal(nt$levels()$town, "Nahant") + expect_equal(length(nt$levels()), 2) + + nt = op$predict(list(task))[[1]] + + expect_equal(nt$levels()$chas, "0") + expect_equal(nt$levels()$town, "Nahant") + expect_equal(length(nt$levels()), 2) + + expect_equal(levels(nt$data()$chas), "0") + expect_equal(which(task$data()$chas == "1"), which(is.na(nt$data()$chas))) + + nt = op$train(list(task$clone()$filter(1:2)))[[1]] 
+ + expect_equal(nt$levels()$chas, "0") + expect_set_equal(nt$levels()$town, c("Nahant", "Swampscott")) + expect_equal(length(nt$levels()), 2) + + nt = op$predict(list(task))[[1]] + expect_equal(nt$levels()$chas, "0") + expect_set_equal(nt$levels()$town, c("Nahant", "Swampscott")) + expect_equal(length(nt$levels()), 2) + + dattrain = data.table( + a = factor(c("a", "b", "c", NA), levels = letters), + b = ordered(c("a", "b", "c", NA)), + target = 1:4) + + tasktrain = TaskRegr$new("train", dattrain, "target") + + dattest = data.table( + a = factor(c("a", "b", "c", "d")), + b = ordered(c("a", "b", "c", "d"), levels = letters[10:1]), + target = 1:4) + + tasktest = TaskRegr$new("train", dattest, "target") + + op$param_set$values$droplevels = TRUE + + opt = op$train(list(tasktrain))[[1]] + + expect_equal(opt$levels(), list(a = letters[1:3], b = letters[1:3])) + expect_equal(levels(opt$data()$a), letters[1:3]) + expect_equal(levels(opt$data()$b), letters[1:3]) + expect_true(is.ordered(opt$data()$b)) + expect_false(is.ordered(opt$data()$a)) + + opt = op$predict(list(tasktest))[[1]] + + expect_equal(opt$levels(), list(a = letters[1:3], b = letters[1:3])) + expect_equal(levels(opt$data()$a), letters[1:3]) + expect_equal(levels(opt$data()$b), letters[1:3]) + expect_true(is.ordered(opt$data()$b)) + expect_false(is.ordered(opt$data()$a)) + expect_equal(opt$data()$a, factor(c("a", "b", "c", NA), levels = letters[1:3])) + expect_equal(opt$data()$b, ordered(c("a", "b", "c", NA), levels = letters[1:3])) + + expect_equal(op$state$levels, list(a = letters[1:3], b = letters[1:3])) + + op$param_set$values$droplevels = FALSE + + opt = op$train(list(tasktrain))[[1]] + + expect_equal(opt$levels(), list(a = letters, b = letters[1:3])) + expect_equal(levels(opt$data()$a), letters) + expect_equal(levels(opt$data()$b), letters[1:3]) + expect_true(is.ordered(opt$data()$b)) + expect_false(is.ordered(opt$data()$a)) + + opt = op$predict(list(tasktest))[[1]] + + expect_equal(opt$levels(), list(a = 
letters, b = letters[1:3])) + expect_equal(levels(opt$data()$a), letters) + expect_equal(levels(opt$data()$b), letters[1:3]) + expect_true(is.ordered(opt$data()$b)) + expect_false(is.ordered(opt$data()$a)) + expect_equal(opt$data()$a, factor(c("a", "b", "c", "d"), levels = letters)) + expect_equal(opt$data()$b, ordered(c("a", "b", "c", NA), levels = letters[1:3])) + + expect_equal(op$state$levels, list(a = letters, b = letters[1:3])) + +}) +