From 30639e730237a64a56ce35b947bfd12bbe23e6c7 Mon Sep 17 00:00:00 2001 From: Michel Lang Date: Fri, 23 Aug 2024 20:24:21 +0200 Subject: [PATCH 1/8] refactor: weights (#1065) * update docs for a fine granularity of weight columns * add deprecation warning and rename old weight role * implemented new weights in task * added weights everywhere, docs, tests * changed defaults * improve docs * rename role weights_train to weights_learner for consistency * fix printer test * improve docs * cleanup * better soft deprecate * fix typos * next try * hard deprecate weights of learner, all workarounds failed * adapt autotest * update news * cleanup * fix autotest * fix error message * change defaults * fix test * fix: properties * refactor: add use weight parameter in base learner * chore: deprecated * tests: col roles * fix: use weights * test: use weights * fix --------- Co-authored-by: be-marc --- DESCRIPTION | 2 +- NEWS.md | 16 ++- R/Learner.R | 16 ++- R/LearnerClassifRpart.R | 10 +- R/LearnerRegrRpart.R | 10 +- R/Measure.R | 41 +++++-- R/MeasureSimple.R | 74 ++++++++++-- R/Resampling.R | 49 +++++--- R/ResamplingBootstrap.R | 8 +- R/ResamplingCustom.R | 2 +- R/ResamplingCustomCV.R | 2 +- R/ResamplingHoldout.R | 9 +- R/ResamplingSubsampling.R | 9 +- R/Task.R | 110 +++++++++++++++--- R/helper.R | 8 ++ R/mlr_reflections.R | 12 +- inst/testthat/helper_autotest.R | 19 ++- man-roxygen/param_measure_properties.R | 14 +-- man/Learner.Rd | 10 ++ man/Measure.Rd | 24 +++- man/MeasureClassif.Rd | 8 ++ man/MeasureRegr.Rd | 8 ++ man/MeasureSimilarity.Rd | 8 ++ man/Resampling.Rd | 28 +++-- man/Task.Rd | 57 +++++++-- man/mlr_learners_classif.rpart.Rd | 3 + man/mlr_learners_regr.rpart.Rd | 3 + man/mlr_measures_classif.acc.Rd | 6 +- man/mlr_measures_classif.bacc.Rd | 6 +- man/mlr_measures_classif.bbrier.Rd | 6 +- man/mlr_measures_classif.ce.Rd | 6 +- man/mlr_measures_classif.logloss.Rd | 6 +- man/mlr_measures_regr.bias.Rd | 6 +- man/mlr_measures_regr.mae.Rd | 6 +- man/mlr_measures_regr.mape.Rd | 6 +- man/mlr_measures_regr.mse.Rd | 6 +- man/mlr_measures_regr.msle.Rd | 6 +- man/mlr_measures_regr.pbias.Rd | 6 +- man/mlr_measures_regr.rmse.Rd | 6 +- man/mlr_measures_regr.rmsle.Rd | 6 +- man/mlr_resamplings_bootstrap.Rd | 2 + man/mlr_resamplings_holdout.Rd | 2 + man/mlr_resamplings_subsampling.Rd | 2 + tests/testthat/test_Learner.R | 10 +- tests/testthat/test_Measure.R | 29 ++++- tests/testthat/test_Resampling.R | 23 ++++ tests/testthat/test_Task.R | 79 ++++++++++--- tests/testthat/test_benchmark.R | 14 +-- tests/testthat/test_convert_task.R | 25 ++-- .../test_mlr_learners_classif_featureless.R | 1 - .../test_mlr_learners_classif_rpart.R | 14 +-- tests/testthat/test_mlr_learners_regr_rpart.R | 6 +- .../testthat/test_mlr_resampling_bootstrap.R | 2 +- tests/testthat/test_mlr_resampling_custom.R | 6 +- tests/testthat/test_mlr_resampling_cv.R | 2 +- tests/testthat/test_mlr_resampling_holdout.R | 2 +- tests/testthat/test_mlr_resampling_loo.R | 2 +- .../test_mlr_resampling_repeated_cv.R | 2 +- .../test_mlr_resampling_subsampling.R | 2 +- tests/testthat/test_predict.R | 2 +- tests/testthat/test_resampling_insample.R | 2 +- 61 files changed, 650 insertions(+), 217 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index ca7492a28..f5e9a1185 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -44,7 +44,7 @@ Depends: R (>= 3.1.0) Imports: R6 (>= 2.4.1), - backports, + backports (>= 1.5.0), checkmate (>= 2.0.0), data.table (>= 1.15.0), evaluate, diff --git a/NEWS.md b/NEWS.md index 797229fdf..ef75c8717 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,24 +1,28 @@ # mlr3 (development version) -* Deprecated `data_format` and `data_formats` for Learners, Tasks, and DataBackends. +* refactor: It is now possible to use weights also during scoring predictions via measures and during resampling to sample observations with unequal probability. + The weights must be stored in the task and can be assigned the column role `weights_measure` or `weights_resampling`, respectively. + The weights used during training by the Learner are renamed to `weights_learner`, the previous column role `weight` is dysfunctional. + Additionally, it is now possible to disable the use of weights via the new hyperparameter `use_weights`. + Note that this is a breaking change, but appears to be the less error-prone solution in the long run. +* refactor: Deprecated `data_format` and `data_formats` for Learners, Tasks, and DataBackends. * feat: The `partition()` function creates training, test and validation sets. * refactor: Optimize runtime of fixing factor levels. * refactor: Optimize runtime of setting row roles. * refactor: Optimize runtime of marshalling. * refactor: Optimize runtime of `Task$col_info`. -* fix: column info is now checked for compatibility during `Learner$predict` (#943). -* BREAKING CHANGE: the predict time of the learner now stores the cumulative duration for all predict sets (#992). +* fix: Column info is now checked for compatibility during `Learner$predict` (#943). +* BREAKING CHANGE: The predict time of the learner now stores the cumulative duration for all predict sets (#992). * feat: `$internal_valid_task` can now be set to an `integer` vector. * feat: Measures can now have an empty `$predict_sets` (#1094). - this is relevant for measures that only extract information from - the model of a learner (such as internal validation scores or AIC / BIC) + This is relevant for measures that only extract information from the model of a learner (such as internal validation scores or AIC / BIC) * refactor: Deprecated the `$divide()` method * fix: `Task$cbind()` now works with non-standard primary keys for `data.frames` (#961). * fix: Triggering of fallback learner now has log-level `"info"` instead of `"debug"` (#972). * feat: Added new measure `pinballs `. * feat: Added new measure `mu_auc`. * feat: Add option to calculate the mean of the true values on the train set in `msr("regr.rsq")`. -* feat: default fallback learner is set when encapsulation is activated. +* feat: Default fallback learner is set when encapsulation is activated. # mlr3 0.20.2 diff --git a/R/Learner.R b/R/Learner.R index 11ce7c583..757c18aa1 100644 --- a/R/Learner.R +++ b/R/Learner.R @@ -70,6 +70,14 @@ #' Only available for [`Learner`]s with the `"internal_tuning"` property. #' If the learner is not trained yet, this returns `NULL`. #' +#' @section Weights: +#' +#' Many learners support observation weights, indicated by their property `"weights"`. +#' The weights are stored in the [Task] where the column role `weights_learner` needs to be assigned to a single numeric column. +#' The weights are automatically used if found in the task, this can be disabled by setting the hyperparamerter `use_weights` to `FALSE`. +#' If the learner is set-up to use weights but the task does not have a designated weight column, an unweighted version is calculated instead. +#' The weights do not necessarily need to sum up to 1, they are passed down to the learner. +#' #' @section Setting Hyperparameters: #' #' All information about hyperparameters is stored in the slot `param_set` which is a [paradox::ParamSet]. @@ -215,7 +223,6 @@ Learner = R6Class("Learner", self$id = assert_string(id, min.chars = 1L) self$label = assert_string(label, na.ok = TRUE) self$task_type = assert_choice(task_type, mlr_reflections$task_types$type) - private$.param_set = assert_param_set(param_set) self$feature_types = assert_ordered_set(feature_types, mlr_reflections$task_feature_types, .var.name = "feature_types") self$predict_types = assert_ordered_set(predict_types, names(mlr_reflections$learner_predict_types[[task_type]]), empty.ok = FALSE, .var.name = "predict_types") @@ -225,6 +232,11 @@ Learner = R6Class("Learner", self$packages = union("mlr3", assert_character(packages, any.missing = FALSE, min.chars = 1L)) self$man = assert_string(man, na.ok = TRUE) + if ("weights" %in% self$properties) { + param_set = c(param_set, ps(use_weights = p_lgl(default = TRUE, tags = "train"))) + } + private$.param_set = assert_param_set(param_set) + check_packages_installed(packages, msg = sprintf("Package '%%s' required but not installed for Learner '%s'", id)) }, @@ -405,7 +417,7 @@ Learner = R6Class("Learner", assert_names(newdata$colnames, must.include = task$feature_names) # the following columns are automatically set to NA if missing - impute = unlist(task$col_roles[c("target", "name", "order", "stratum", "group", "weight")], use.names = FALSE) + impute = unlist(task$col_roles[c("target", "name", "order", "stratum", "group", "weights_learner", "weights_measure", "weights_resampling")], use.names = FALSE) impute = setdiff(impute, newdata$colnames) if (length(impute)) { # create list with correct NA types and cbind it to the backend diff --git a/R/LearnerClassifRpart.R b/R/LearnerClassifRpart.R index 02070150f..314917736 100644 --- a/R/LearnerClassifRpart.R +++ b/R/LearnerClassifRpart.R @@ -8,6 +8,8 @@ #' #' @section Initial parameter values: #' * Parameter `xval` is initialized to 0 in order to save some computation time. +#' * Parameter `use_weights` can be set to `FALSE` to ignore observation weights with column role `weights_learner` , +#' if present. #' #' @section Custom mlr3 parameters: #' * Parameter `model` has been renamed to `keep_model`. @@ -35,9 +37,8 @@ LearnerClassifRpart = R6Class("LearnerClassifRpart", inherit = LearnerClassif, minsplit = p_int(1L, default = 20L, tags = "train"), surrogatestyle = p_int(0L, 1L, default = 0L, tags = "train"), usesurrogate = p_int(0L, 2L, default = 2L, tags = "train"), - xval = p_int(0L, default = 10L, tags = "train") + xval = p_int(0L, default = 10L, init = 0L, tags = "train") ) - ps$values = list(xval = 0L) super$initialize( id = "classif.rpart", @@ -77,10 +78,7 @@ LearnerClassifRpart = R6Class("LearnerClassifRpart", inherit = LearnerClassif, .train = function(task) { pv = self$param_set$get_values(tags = "train") names(pv) = replace(names(pv), names(pv) == "keep_model", "model") - if ("weights" %in% task$properties) { - pv = insert_named(pv, list(weights = task$weights$weight)) - } - + pv = get_weights(task$weights_learner$weight, pv) invoke(rpart::rpart, formula = task$formula(), data = task$data(), .args = pv, .opts = allow_partial_matching) }, diff --git a/R/LearnerRegrRpart.R b/R/LearnerRegrRpart.R index 5910fbcd0..5d3d66475 100644 --- a/R/LearnerRegrRpart.R +++ b/R/LearnerRegrRpart.R @@ -8,6 +8,8 @@ #' #' @section Initial parameter values: #' * Parameter `xval` is initialized to 0 in order to save some computation time. +#' * Parameter `use_weights` can be set to `FALSE` to ignore observation weights with column role `weights_learner` , +#' if present. #' #' @section Custom mlr3 parameters: #' * Parameter `model` has been renamed to `keep_model`. @@ -35,9 +37,8 @@ LearnerRegrRpart = R6Class("LearnerRegrRpart", inherit = LearnerRegr, minsplit = p_int(1L, default = 20L, tags = "train"), surrogatestyle = p_int(0L, 1L, default = 0L, tags = "train"), usesurrogate = p_int(0L, 2L, default = 2L, tags = "train"), - xval = p_int(0L, default = 10L, tags = "train") + xval = p_int(0L, default = 10L, init = 0L, tags = "train") ) - ps$values = list(xval = 0L) super$initialize( id = "regr.rpart", @@ -77,10 +78,7 @@ LearnerRegrRpart = R6Class("LearnerRegrRpart", inherit = LearnerRegr, .train = function(task) { pv = self$param_set$get_values(tags = "train") names(pv) = replace(names(pv), names(pv) == "keep_model", "model") - if ("weights" %in% task$properties) { - pv = insert_named(pv, list(weights = task$weights$weight)) - } - + pv = get_weights(task$weights_learner$weight, pv) invoke(rpart::rpart, formula = task$formula(), data = task$data(), .args = pv, .opts = allow_partial_matching) }, diff --git a/R/Measure.R b/R/Measure.R index 649dd077c..8ae201c65 100644 --- a/R/Measure.R +++ b/R/Measure.R @@ -24,6 +24,14 @@ #' In such cases it is necessary to overwrite the public methods `$aggregate()` and/or `$score()` to return a named `numeric()` #' where at least one of its names corresponds to the `id` of the measure itself. #' +#' @section Weights: +#' +#' Many measures support observation weights, indicated by their property `"weights"`. +#' The weights are stored in the [Task] where the column role `weights_measure` needs to be assigned to a single numeric column. +#' The weights are automatically used if found in the task, this can be disabled by setting the hyperparamerter `use_weights` to `FALSE`. +#' If the measure is set-up to use weights but the task does not have a designated weight column, an unweighted version is calculated instead. +#' The weights do not necessarily need to sum up to 1, they are normalized by dividing by the sum of weights. +#' #' @template param_id #' @template param_param_set #' @template param_range @@ -94,10 +102,6 @@ Measure = R6Class("Measure", #' Lower and upper bound of possible performance scores. range = NULL, - #' @field properties (`character()`)\cr - #' Properties of this measure. - properties = NULL, - #' @field minimize (`logical(1)`)\cr #' If `TRUE`, good predictions correspond to small values of performance scores. minimize = NULL, @@ -117,7 +121,6 @@ Measure = R6Class("Measure", predict_sets = "test", task_properties = character(), packages = character(), label = NA_character_, man = NA_character_, trafo = NULL) { - self$properties = unique(properties) self$id = assert_string(id, min.chars = 1L) self$label = assert_string(label, na.ok = TRUE) self$task_type = task_type @@ -140,6 +143,8 @@ Measure = R6Class("Measure", assert_subset(task_properties, mlr_reflections$task_properties[[task_type]]) } + + self$properties = unique(properties) self$predict_type = predict_type self$predict_sets = predict_sets self$task_properties = task_properties @@ -195,24 +200,25 @@ Measure = R6Class("Measure", #' @return `numeric(1)`. score = function(prediction, task = NULL, learner = NULL, train_set = NULL) { assert_measure(self, task = task, learner = learner) - assert_prediction(prediction, null.ok = "requires_no_prediction" %nin% self$properties) + properties = self$properties + assert_prediction(prediction, null.ok = "requires_no_prediction" %nin% properties) - if ("requires_task" %in% self$properties && is.null(task)) { + if ("requires_task" %in% properties && is.null(task)) { stopf("Measure '%s' requires a task", self$id) } - if ("requires_learner" %in% self$properties && is.null(learner)) { + if ("requires_learner" %in% properties && is.null(learner)) { stopf("Measure '%s' requires a learner", self$id) } - if ("requires_model" %in% self$properties && (is.null(learner) || is.null(learner$model))) { + if ("requires_model" %in% properties && (is.null(learner) || is.null(learner$model))) { stopf("Measure '%s' requires the trained model", self$id) } - if ("requires_model" %in% self$properties && is_marshaled_model(learner$model)) { + if ("requires_model" %in% properties && is_marshaled_model(learner$model)) { stopf("Measure '%s' requires the trained model, but model is in marshaled form", self$id) } - if ("requires_train_set" %in% self$properties && is.null(train_set)) { + if ("requires_train_set" %in% properties && is.null(train_set)) { stopf("Measure '%s' requires the train_set", self$id) } @@ -231,7 +237,6 @@ Measure = R6Class("Measure", #' #' @return `numeric(1)`. aggregate = function(rr) { - switch(self$average, "macro" = { aggregator = self$aggregator %??% mean @@ -275,6 +280,17 @@ Measure = R6Class("Measure", self$predict_sets, mget(private$.extra_hash, envir = self)) }, + #' @field properties (`character()`)\cr + #' Properties of this measure. + properties = function(rhs) { + if (!missing(rhs)) { + props = if (is.na(self$task_type)) unique(unlist(mlr_reflections$measure_properties), use.names = FALSE) else mlr_reflections$measure_properties[[self$task_type]] + private$.properties = assert_subset(rhs, props) + } else { + private$.properties + } + }, + #' @field average (`character(1)`)\cr #' Method for aggregation: #' @@ -307,6 +323,7 @@ Measure = R6Class("Measure", ), private = list( + .properties = character(), .predict_sets = NULL, .extra_hash = character(), .average = NULL, diff --git a/R/MeasureSimple.R b/R/MeasureSimple.R index ba1c461cb..89d3df92b 100644 --- a/R/MeasureSimple.R +++ b/R/MeasureSimple.R @@ -5,26 +5,36 @@ MeasureBinarySimple = R6Class("MeasureBinarySimple", fun = NULL, na_value = NaN, initialize = function(name, param_set = NULL) { + info = mlr3measures::measures[[name]] + weights = info$sample_weights + if (is.null(param_set)) { - param_set = ps() + if (weights) { + param_set = ps(use_weights = p_lgl(default = TRUE)) + } else { + param_set = ps() + } } else { - # cloning required because the param set lives in the - # dictionary mlr_measures - param_set = param_set$clone() + if (weights) { + param_set = c(param_set, ps(use_weights = p_lgl(default = TRUE))) + } else { + param_set = param_set$clone() + } } - info = mlr3measures::measures[[name]] super$initialize( id = paste0("classif.", name), - param_set = param_set$clone(), + param_set = param_set, range = c(info$lower, info$upper), minimize = info$minimize, + properties = if (weights) "weights" else character(), predict_type = info$predict_type, task_properties = "twoclass", packages = "mlr3measures", label = info$title, man = paste0("mlr3::mlr_measures_classif.", name) ) + self$fun = get(name, envir = asNamespace("mlr3measures"), mode = "function") if (!is.na(info$obs_loss)) { self$obs_loss = get(info$obs_loss, envir = asNamespace("mlr3measures"), mode = "function") @@ -36,12 +46,18 @@ MeasureBinarySimple = R6Class("MeasureBinarySimple", ), private = list( - .score = function(prediction, ...) { + .score = function(prediction, task, ...) { + weights = if ("weights" %in% private$.properties && !isFALSE(self$param_set$values$use_weights)) { + task$weights_measure[list(prediction$row_ids), "weight"][[1L]] + } else { + NULL + } + truth = prediction$truth positive = levels(truth)[1L] invoke(self$fun, .args = self$param_set$get_values(), truth = truth, response = prediction$response, prob = prediction$prob[, positive], - positive = positive, na_value = self$na_value + positive = positive, na_value = self$na_value, sample_weights = weights ) }, @@ -57,10 +73,20 @@ MeasureClassifSimple = R6Class("MeasureClassifSimple", na_value = NaN, initialize = function(name) { info = mlr3measures::measures[[name]] + weights = info$sample_weights + + if (weights) { + param_set = ps(use_weights = p_lgl(default = TRUE)) + } else { + param_set = ps() + } + super$initialize( id = paste0("classif.", name), + param_set = param_set, range = c(info$lower, info$upper), minimize = info$minimize, + properties = if (weights) "weights" else character(), predict_type = info$predict_type, packages = "mlr3measures", label = info$title, @@ -77,8 +103,15 @@ MeasureClassifSimple = R6Class("MeasureClassifSimple", ), private = list( - .score = function(prediction, ...) { - self$fun(truth = prediction$truth, response = prediction$response, prob = prediction$prob, na_value = self$na_value) + .score = function(prediction, task, ...) { + weights = if ("weights" %in% private$.properties && !isFALSE(self$param_set$values$use_weights)) { + task$weights_measure[list(prediction$row_ids), "weight"][[1L]] + } else { + NULL + } + + self$fun(truth = prediction$truth, response = prediction$response, prob = prediction$prob, + na_value = self$na_value, sample_weights = weights) }, .extra_hash = c("fun", "na_value") @@ -93,10 +126,20 @@ MeasureRegrSimple = R6Class("MeasureRegrSimple", na_value = NaN, initialize = function(name) { info = mlr3measures::measures[[name]] + weights = info$sample_weights + + if (weights) { + param_set = ps(use_weights = p_lgl(default = FALSE)) + } else { + param_set = ps() + } + super$initialize( id = paste0("regr.", name), + param_set = param_set, range = c(info$lower, info$upper), minimize = info$minimize, + properties = if (weights) "weights" else character(), predict_type = info$predict_type, packages = "mlr3measures", label = info$title, @@ -113,8 +156,15 @@ MeasureRegrSimple = R6Class("MeasureRegrSimple", ), private = list( - .score = function(prediction, ...) { - self$fun(truth = prediction$truth, response = prediction$response, se = prediction$se, na_value = self$na_value) + .score = function(prediction, task, ...) { + weights = if ("weights" %in% private$.properties && !isFALSE(self$param_set$values$use_weights)) { + task$weights_measure[list(prediction$row_ids), "weight"][[1L]] + } else { + NULL + } + + self$fun(truth = prediction$truth, response = prediction$response, se = prediction$se, + na_value = self$na_value, sample_weights = weights) }, .extra_hash = c("fun", "na_value") diff --git a/R/Resampling.R b/R/Resampling.R index 61c42dcd0..776a9d511 100644 --- a/R/Resampling.R +++ b/R/Resampling.R @@ -46,6 +46,14 @@ #' Next, the grouping information is replaced with the respective row ids to generate training and test sets. #' The sets can be accessed via `$train_set(i)` and `$test_set(i)`, respectively. #' +#' @section Weights: +#' +#' Many resamlings support observation weights, indicated by their property `"weights"`. +#' The weights are stored in the [Task] where the column role `weights_resampling` needs to be assigned to a single numeric column. +#' The weights are automatically used if found in the task, this can be disabled by setting the hyperparamerter `use_weights` to `FALSE`. +#' If the resampling is set-up to use weights but the task does not have a designated weight column, an unweighted version is calculated instead. +#' The weights do not necessarily need to sum up to 1, they are passed down to argument `prob` of [sample()]. +#' #' #' @template seealso_resampling #' @export @@ -106,11 +114,9 @@ Resampling = R6Class("Resampling", #' task_nrow = NA_integer_, - #' @field duplicated_ids (`logical(1)`)\cr - #' If `TRUE`, duplicated rows can occur within a single training set or within a single test set. - #' E.g., this is `TRUE` for Bootstrap, and `FALSE` for cross-validation. - #' Only used internally. - duplicated_ids = NULL, + #' @field properties (`character()`)\cr + #' Set of properties. + properties = NULL, #' @template field_man man = NULL, @@ -118,16 +124,22 @@ Resampling = R6Class("Resampling", #' @description #' Creates a new instance of this [R6][R6::R6Class] class. #' - #' @param duplicated_ids (`logical(1)`)\cr - #' Set to `TRUE` if this resampling strategy may have duplicated row ids in a single training set or test set. + #' @param properties (`character()`)\cr + #' Set of properties, i.e., + #' * `"duplicated_ids"`: duplicated rows can occur within a single training set or within a single test set. + #' E.g., this is `TRUE` for Bootstrap, and `FALSE` for cross-validation. + #' * `"weights"`: if present, the resampling supports sample weights (set via column role `weights_resampling` in the [Task]). + #' The weights determine the probability to sample a observation for the training set. #' #' Note that this object is typically constructed via a derived classes, e.g. [ResamplingCV] or [ResamplingHoldout]. - initialize = function(id, param_set = ps(), duplicated_ids = FALSE, label = NA_character_, man = NA_character_) { + initialize = function(id, param_set = ps(), properties = character(), label = NA_character_, man = NA_character_) { private$.id = assert_string(id, min.chars = 1L) self$label = assert_string(label, na.ok = TRUE) - self$param_set = assert_param_set(param_set) - self$duplicated_ids = assert_flag(duplicated_ids) + self$properties = assert_subset(properties, mlr_reflections$resampling_properties) self$man = assert_string(man, na.ok = TRUE) + + assert_param_set(param_set) + self$param_set = if ("weights" %in% properties) c(param_set, ps(use_weights = p_lgl(default = TRUE))) else param_set }, #' @description @@ -168,19 +180,24 @@ Resampling = R6Class("Resampling", task = assert_task(as_task(task)) strata = task$strata groups = task$groups + weights = task$weights_resampling$weight + + use_weights = "weights" %in% self$properties && !isFALSE(self$param_set$values$use_weights) && !is.null(weights) + + if (sum(!is.null(strata), !is.null(groups), use_weights) >= 2L) { + stopf("Stratification, grouping and weighted resampling are mutually exclusive") + } if (is.null(strata)) { if (is.null(groups)) { - instance = private$.sample(task$row_ids, task = task) + weights = if (use_weights) weights else NULL + instance = private$.sample(task$row_ids, task = task, weights = weights) } else { private$.groups = groups - instance = private$.sample(unique(groups$group), task = task) + instance = private$.sample(unique(groups$group), task = task, weights = NULL) } } else { - if (!is.null(groups)) { - stopf("Cannot combine stratification with grouping") - } - instance = private$.combine(lapply(strata$row_id, private$.sample, task = task)) + instance = private$.combine(lapply(strata$row_id, private$.sample, task = task, weights = NULL)) } private$.hash = NULL diff --git a/R/ResamplingBootstrap.R b/R/ResamplingBootstrap.R index b4e75e942..ee88de342 100644 --- a/R/ResamplingBootstrap.R +++ b/R/ResamplingBootstrap.R @@ -16,6 +16,8 @@ #' Number of repetitions. #' * `ratio` (`numeric(1)`)\cr #' Ratio of observations to put into the training set. +#' * `use_weights` (`logical(1)`)\cr +#' Incorporate observation weights of the [Task] (column role `weights_resampling`), if present. #' #' @references #' `r format_bib("bischl_2012")` @@ -51,7 +53,7 @@ ResamplingBootstrap = R6Class("ResamplingBootstrap", inherit = Resampling, ) ps$values = list(ratio = 1, repeats = 30L) - super$initialize(id = "bootstrap", param_set = ps, duplicated_ids = TRUE, + super$initialize(id = "bootstrap", param_set = ps, properties = c("duplicated_ids", "weights"), label = "Bootstrap", man = "mlr3::mlr_resamplings_bootstrap") } ), @@ -65,11 +67,11 @@ ResamplingBootstrap = R6Class("ResamplingBootstrap", inherit = Resampling, ), private = list( - .sample = function(ids, ...) { + .sample = function(ids, task, weights, ...) { pv = self$param_set$values nr = round(length(ids) * pv$ratio) x = factor(seq_along(ids)) - M = replicate(pv$repeats, table(sample(x, nr, replace = TRUE)), simplify = "array") + M = replicate(pv$repeats, table(sample(x, nr, replace = TRUE, prob = weights)), simplify = "array") rownames(M) = NULL list(row_ids = ids, M = M) }, diff --git a/R/ResamplingCustom.R b/R/ResamplingCustom.R index a90e1449e..b5b7ce9a8 100644 --- a/R/ResamplingCustom.R +++ b/R/ResamplingCustom.R @@ -29,7 +29,7 @@ ResamplingCustom = R6Class("ResamplingCustom", inherit = Resampling, #' @description #' Creates a new instance of this [R6][R6::R6Class] class. initialize = function() { - super$initialize(id = "custom", duplicated_ids = TRUE, + super$initialize(id = "custom", properties = "duplicated_ids", label = "Custom Splits", man = "mlr3::mlr_resamplings_custom") }, diff --git a/R/ResamplingCustomCV.R b/R/ResamplingCustomCV.R index 6e8d049cc..91f27c32d 100644 --- a/R/ResamplingCustomCV.R +++ b/R/ResamplingCustomCV.R @@ -40,7 +40,7 @@ ResamplingCustomCV = R6Class("ResamplingCustomCV", inherit = Resampling, #' @description #' Creates a new instance of this [R6][R6::R6Class] class. initialize = function() { - super$initialize(id = "custom_cv", duplicated_ids = FALSE, + super$initialize(id = "custom_cv", label = "Custom Split Cross-Validation", man = "mlr3::mlr_resamplings_custom_cv") }, diff --git a/R/ResamplingHoldout.R b/R/ResamplingHoldout.R index 0a5d7ad8d..a846c65c1 100644 --- a/R/ResamplingHoldout.R +++ b/R/ResamplingHoldout.R @@ -13,6 +13,8 @@ #' @section Parameters: #' * `ratio` (`numeric(1)`)\cr #' Ratio of observations to put into the training set. +#' * `use_weights` (`logical(1)`)\cr +#' Incorporate observation weights of the [Task] (column role `weights_resampling`), if present. #' #' @references #' `r format_bib("bischl_2012")` @@ -47,7 +49,7 @@ ResamplingHoldout = R6Class("ResamplingHoldout", inherit = Resampling, ) ps$values = list(ratio = 2 / 3) - super$initialize(id = "holdout", param_set = ps, + super$initialize(id = "holdout", param_set = ps, properties = "weights", label = "Holdout", man = "mlr3::mlr_resamplings_holdout") } ), @@ -59,10 +61,11 @@ ResamplingHoldout = R6Class("ResamplingHoldout", inherit = Resampling, } ), private = list( - .sample = function(ids, ...) { + .sample = function(ids, task, weights, ...) { + pv = self$param_set$values n = length(ids) in_train = logical(n) - in_train[sample.int(n, round(n * self$param_set$values$ratio))] = TRUE + in_train[sample.int(n, round(n * self$param_set$values$ratio), prob = weights)] = TRUE list(train = ids[in_train], test = ids[!in_train]) }, diff --git a/R/ResamplingSubsampling.R b/R/ResamplingSubsampling.R index 99a447f3c..9b8584890 100644 --- a/R/ResamplingSubsampling.R +++ b/R/ResamplingSubsampling.R @@ -15,6 +15,8 @@ #' Number of repetitions. #' * `ratio` (`numeric(1)`)\cr #' Ratio of observations to put into the training set. +#' * `use_weights` (`logical(1)`)\cr +#' Incorporate observation weights of the [Task] (column role `weights_resampling`), if present. #' #' @references #' `r format_bib("bischl_2012")` @@ -50,7 +52,7 @@ ResamplingSubsampling = R6Class("ResamplingSubsampling", inherit = Resampling, ) ps$values = list(repeats = 30L, ratio = 2 / 3) - super$initialize(id = "subsampling", param_set = ps, + super$initialize(id = "subsampling", param_set = ps, properties = "weights", label = "Subsampling", man = "mlr3::mlr_resamplings_subsampling") } ), @@ -64,12 +66,11 @@ ResamplingSubsampling = R6Class("ResamplingSubsampling", inherit = Resampling, ), private = list( - .sample = function(ids, ...) { + .sample = function(ids, task, weights, ...) { pv = self$param_set$values n = length(ids) nr = round(n * pv$ratio) - - train = replicate(pv$repeats, sample.int(n, nr), simplify = FALSE) + train = replicate(pv$repeats, sample.int(n, nr, prob = weights), simplify = FALSE) list(train = train, row_ids = ids) }, diff --git a/R/Task.R b/R/Task.R index a5f5bf110..79e6c6787 100644 --- a/R/Task.R +++ b/R/Task.R @@ -442,8 +442,8 @@ Task = R6Class("Task", #' In case of name clashes of row ids, rows in `data` have higher precedence #' and virtually overwrite the rows in the [DataBackend]. #' - #' All columns with the roles `"target"`, `"feature"`, `"weight"`, `"group"`, `"stratum"`, - #' and `"order"` must be present in `data`. + #' All columns roles `"target"`, `"feature"`, `"weights_learner"`, `"weights_measure"`, + #' `"weights_resampling"`, group"`, `"stratum"`, and `"order"` must be present in `data`. #' Columns only present in `data` but not in the [DataBackend] of `task` will be discarded. #' #' This operation mutates the task in-place. @@ -495,7 +495,7 @@ Task = R6Class("Task", } # columns with these roles must be present in data - mandatory_roles = c("target", "feature", "weight", "group", "stratum", "order") + mandatory_roles = c("target", "feature", "group", "stratum", "order") mandatory_cols = unlist(private$.col_roles[mandatory_roles], use.names = FALSE) missing_cols = setdiff(mandatory_cols, data$colnames) if (length(missing_cols)) { @@ -886,18 +886,24 @@ Task = R6Class("Task", #' #' * `"strata"`: The task is resampled using one or more stratification variables (role `"stratum"`). #' * `"groups"`: The task comes with grouping/blocking information (role `"group"`). - #' * `"weights"`: The task comes with observation weights (role `"weight"`). + #' * `"weights_learnering"`: If the task has observation weights with this role, they are passed to the [Learner] during train. + #' The use of weights can be disabled via by setting the learner's hyperparameter `use_weights` to `FALSE`. + #' * `"weights_measure"`: If the task has observation weights with this role, they are passed to the [Measure] for weighted scoring. + #' The use of weights can be disabled via by setting the measure's hyperparameter `use_weights` to `FALSE`. + #' * `"weights_resampling"`: If the task has observation weights with this role, they are passed to the [Resampling] for weighted sampling. + #' The weights are only used if the resampling's hyperparameter `use_weights` is set to `TRUE`. #' - #' Note that above listed properties are calculated from the `$col_roles` and may not be set explicitly. + #' Note that above listed properties are calculated from the `$col_roles`, and may not be set explicitly. properties = function(rhs) { if (missing(rhs)) { - col_roles = private$.col_roles - c(character(), - private$.properties, - if (length(col_roles$group)) "groups" else NULL, - if (length(col_roles$stratum)) "strata" else NULL, - if (length(col_roles$weight)) "weights" else NULL + prop_roles = c( + groups = "group", + strata = "stratum", + weights_learner = "weights_learner", + weights_measure = "weights_measure", + weights_resampling = "weights_resampling" ) + c(private$.properties, names(prop_roles)[lengths(private$.col_roles[prop_roles]) > 0L]) } else { private$.properties = assert_set(rhs, .var.name = "properties") } @@ -939,11 +945,20 @@ Task = R6Class("Task", #' For each resampling iteration, observations of the same group will be exclusively assigned to be either in the training set or in the test set. #' Not more than a single column can be associated with this role. #' * `"stratum"`: Stratification variables. Multiple discrete columns may have this role. - #' * `"weight"`: Observation weights. Not more than one numeric column may have this role. + #' * `"weights_learnering"`: If the task has observation weights with this role, they are passed to the [Learner] during train. + #' The use of weights can be disabled via by setting the learner's hyperparameter `use_weights` to `FALSE`. + #' * `"weights_measure"`: If the task has observation weights with this role, they are passed to the [Measure] for weighted scoring. + #' The use of weights can be disabled via by setting the measure's hyperparameter `use_weights` to `FALSE`. + #' * `"weights_resampling"`: If the task has observation weights with this role, they are passed to the [Resampling] for weighted sampling. + #' The weights are only used if the resampling's hyperparameter `use_weights` is set to `TRUE`. #' #' `col_roles` is a named list whose elements are named by column role and each element is a `character()` vector of column names. #' To alter the roles, just modify the list, e.g. with \R's set functions ([intersect()], [setdiff()], [union()], \ldots). #' The method `$set_col_roles` provides a convenient alternative to assign columns to roles. + #' + #' The roles `weights_learner`, `weights_measure` and `weights_resampling` may only point to a single numeric column, but they can + #' all point to the same column or different columns. Weights must be non-negative numerics with at least one weight being > 0. + #' They don't necessarily need to sum up to 1. col_roles = function(rhs) { if (missing(rhs)) { return(private$.col_roles) @@ -951,7 +966,7 @@ Task = R6Class("Task", assert_has_backend(self) qassertr(rhs, "S[1,]", .var.name = "col_roles") - assert_names(names(rhs), "unique", permutation.of = mlr_reflections$task_col_roles[[self$task_type]], .var.name = "names of col_roles") + assert_names(names(rhs), "unique", permutation.of = mlr_reflections$task_col_roles[[self$task_type]]) assert_subset(unlist(rhs, use.names = FALSE), setdiff(self$col_info$id, self$backend$primary_key), .var.name = "elements of col_roles") private$.hash = NULL @@ -1056,16 +1071,44 @@ Task = R6Class("Task", }, #' @field weights ([data.table::data.table()])\cr - #' If the task has a column with designated role `"weight"`, a table with two columns: + #' Deprecated, use `$weights_learner` instead. + weights = function(rhs) { + assert_ro_binding(rhs) + .Deprecated("Task$weights_learner", old = "Task$weights") + self$weights_learner + }, + + #' @field weights_learner ([data.table::data.table()])\cr + #' Returns the observation weights used for training a [Learner] (column role `weights_learner`) + #' as a `data.table` with the following columns: #' #' * `row_id` (`integer()`), and - #' * observation weights `weight` (`numeric()`). + #' * `weight` (`numeric()`). #' - #' Returns `NULL` if there are is no weight column. - weights = function(rhs) { + #' Returns `NULL` if there are is no column with the designated role. + weights_learner = function(rhs) { + assert_has_backend(self) + assert_ro_binding(rhs) + weight_cols = private$.col_roles[["weights_learner"]] + if (length(weight_cols) == 0L) { + return(NULL) + } + data = self$backend$data(private$.row_roles$use, c(self$backend$primary_key, weight_cols)) + setnames(data, c("row_id", "weight"))[] + }, + + #' @field weights_measure ([data.table::data.table()])\cr + #' Returns the observation weights used for scoring a prediction with a [Measure] (column role `weights_measure`) + #' as a `data.table` with the following columns: + #' + #' * `row_id` (`integer()`), and + #' * `weight` (`numeric()`). + #' + #' Returns `NULL` if there are is no column with the designated role. + weights_measure = function(rhs) { assert_has_backend(self) assert_ro_binding(rhs) - weight_cols = private$.col_roles$weight + weight_cols = private$.col_roles[["weights_measure"]] if (length(weight_cols) == 0L) { return(NULL) } @@ -1073,6 +1116,24 @@ Task = R6Class("Task", setnames(data, c("row_id", "weight"))[] }, + #' @field weights_resampling ([data.table::data.table()])\cr + #' Returns the observation weights used for sampling during a [Resampling] (column role `weights_resampling`) + #' as a `data.table` with the following columns: + #' + #' * `row_id` (`integer()`), and + #' * `weight` (`numeric()`). + #' + #' Returns `NULL` if there are is no column with the designated role. + weights_resampling = function(rhs) { + assert_has_backend(self) + assert_ro_binding(rhs) + weight_cols = private$.col_roles[["weights_resampling"]] + if (length(weight_cols) == 0L) { + return(NULL) + } + data = self$backend$data(private$.row_roles$use, c(self$backend$primary_key, weight_cols)) + setnames(data, c("row_id", "weight"))[] + }, #' @field labels (named `character()`)\cr #' Retrieve `labels` (prettier formated names) from columns. @@ -1189,12 +1250,23 @@ task_set_roles = function(li, elements, roles = NULL, add_to = NULL, remove_from } task_check_col_roles = function(self, new_roles) { - for (role in c("group", "weight", "name")) { + if ("weight" %in% names(new_roles)) { + stopf("Task role 'weight' is deprecated, use 'weights_learner' instead") + } + + for (role in c("group", "name", "weights_learner", "weights_measure", "weights_resampling")) { if (length(new_roles[[role]]) > 1L) { stopf("There may only be up to one column with role '%s'", role) } } + for (role in c("weights_learner", "weights_measure", "weights_resampling")) { + if (length(new_roles[[role]]) > 0L) { + col_role = self$backend$data(seq(self$backend$nrow), cols = new_roles[[role]])[[1]] + expect_numeric(col_role, lower = 0, any.missing = FALSE) + } + } + if (inherits(self, "TaskSupervised")) { if (length(new_roles$target) == 0L) { stopf("Supervised tasks need at least one target column") diff --git a/R/helper.R b/R/helper.R index b27ecb192..e5c930924 100644 --- a/R/helper.R +++ b/R/helper.R @@ -67,6 +67,14 @@ assert_validate = function(x) { assert_choice(x, c("predefined", "test"), null.ok = TRUE) } +get_weights = function(weights, pv, id = "weights") { + if (!is.null(weights) && pv$use_weights) { + pv[[id]] = weights + } + pv$use_weights = NULL + pv +} + get_obs_loss = function(tab, measures) { for (measure in measures) { fun = measure$obs_loss diff --git a/R/mlr_reflections.R b/R/mlr_reflections.R index f103871b4..c979ed7bd 100644 --- a/R/mlr_reflections.R +++ b/R/mlr_reflections.R @@ -92,18 +92,18 @@ local({ "use" ) - tmp = c("feature", "target", "name", "order", "stratum", "group", "weight") + tmp = c("feature", "target", "name", "order", "stratum", "group", "weights_learner", "weights_measure", "weights_resampling") mlr_reflections$task_col_roles = list( regr = tmp, classif = tmp, unsupervised = c("feature", "name", "order") ) - tmp = c("strata", "groups", "weights") + tmp = c("strata", "groups", "weights_learner", "weights_measure", "weights_resampling") mlr_reflections$task_properties = list( classif = c(tmp, "twoclass", "multiclass"), regr = tmp, - unsupervised = character(0) + unsupervised = character() ) mlr_reflections$task_mandatory_properties = list( @@ -112,7 +112,7 @@ local({ mlr_reflections$task_print_col_roles = list( before = character(), - after = c("Order by" = "order", "Strata" = "stratum", "Groups" = "group", "Weights" = "weight") + after = c("Order by" = "order", "Strata" = "stratum", "Groups" = "group", "Weights/Learner" = "weights_learner", "Weights/Measure" = "weights_measure", "Weights/Resampling" = "weights_resampling") ) ### Learner @@ -138,9 +138,11 @@ local({ ### Prediction mlr_reflections$predict_sets = c("train", "test", "internal_valid") + ### Resampling + mlr_reflections$resampling_properties = c("duplicated_ids", "weights") ### Measures - tmp = c("na_score", "requires_task", "requires_learner", "requires_model", "requires_train_set", "primary_iters", "requires_no_prediction") + tmp = c("na_score", "requires_task", "requires_learner", "requires_model", "requires_train_set", "weights", "primary_iters", "requires_no_prediction") mlr_reflections$measure_properties = list( classif = tmp, regr = tmp diff --git a/inst/testthat/helper_autotest.R b/inst/testthat/helper_autotest.R index 007ba6823..31eb5ad4b 100644 --- a/inst/testthat/helper_autotest.R +++ b/inst/testthat/helper_autotest.R @@ -51,11 +51,11 @@ generate_generic_tasks = function(learner, proto) { } # task with weights - if ("weights" %in% learner$properties) { + if ("weights_learner" %in% learner$properties) { tmp = proto$clone(deep = TRUE)$cbind(data.frame(weights = runif(n))) - tmp$col_roles$weight = "weights" + tmp$col_roles$weights_learner = "weights" tmp$col_roles$features = setdiff(tmp$col_roles$features, "weights") - tasks$weights = tmp + tasks$weights_learner = tmp } # task with non-ascii feature names @@ -233,6 +233,19 @@ run_experiment = function(task, learner, seed = NULL, configure_learner = NULL) learner$encapsulate = c(train = "evaluate", predict = "evaluate") stage = "train()" + + # enable weights + if ("weights" %in% learner$properties) { + # learner must have flag "use_weights" + msg = checkmate::check_subset("use_weights", learner$param_set$ids(), empty.ok = FALSE) + if (!isTRUE(msg)) { + return(err(msg)) + } + if ("weights_learner" %in% task$properties) { + learner$param_set$values$use_weights = TRUE + } + } + ok = try(learner$train(task), silent = TRUE) if (inherits(ok, "try-error")) { return(err(as.character(ok))) diff --git a/man-roxygen/param_measure_properties.R b/man-roxygen/param_measure_properties.R index 103439956..b1ebaa45b 100644 --- a/man-roxygen/param_measure_properties.R +++ b/man-roxygen/param_measure_properties.R @@ -4,11 +4,9 @@ #' Supported by `mlr3`: #' * `"requires_task"` (requires the complete [Task]), #' * `"requires_learner"` (requires the trained [Learner]), -#' * `"requires_model"` (requires the trained [Learner], including the fitted -#' model), -#' * `"requires_train_set"` (requires the training indices from the [Resampling]), and -#' * `"na_score"` (the measure is expected to occasionally return `NA` or `NaN`). -#' * `"primary_iters"` (the measure explictly handles resamplings that only use a subset -#' of their iterations for the point estimate). -#' * `"requires_no_prediction"` (No prediction is required; This usually means that the -#' measure extracts some information from the learner state.). +#' * `"requires_model"` (requires the trained [Learner], including the fitted model), +#' * `"requires_train_set"` (requires the training indices from the [Resampling]), +#' * `"na_score"` (the measure is expected to occasionally return `NA` or `NaN`), +#' * `"weights"` (support weighted scoring using sample weights from task, column role `weights_measure`), and +#' * `"primary_iters"` (the measure explictly handles resamplings that only use a subset of their iterations for the point estimate) +#' * `"requires_no_prediction"` (No prediction is required; This usually means that the measure extracts some information from the learner state.). diff --git a/man/Learner.Rd b/man/Learner.Rd index 4381c1758..123d397b6 100644 --- a/man/Learner.Rd +++ b/man/Learner.Rd @@ -57,6 +57,16 @@ If the learner is not trained yet, this returns \code{NULL}. } } +\section{Weights}{ + + +Many learners support observation weights, indicated by their property \code{"weights"}. +The weights are stored in the \link{Task} where the column role \code{weights_learner} needs to be assigned to a single numeric column. +The weights are automatically used if found in the task, this can be disabled by setting the hyperparamerter \code{use_weights} to \code{FALSE}. +If the learner is set-up to use weights but the task does not have a designated weight column, an unweighted version is calculated instead. +The weights do not necessarily need to sum up to 1, they are passed down to the learner. +} + \section{Setting Hyperparameters}{ diff --git a/man/Measure.Rd b/man/Measure.Rd index f52bc580d..d80dd4bb1 100644 --- a/man/Measure.Rd +++ b/man/Measure.Rd @@ -29,6 +29,16 @@ In such cases it is necessary to overwrite the public methods \verb{$aggregate() where at least one of its names corresponds to the \code{id} of the measure itself. } +\section{Weights}{ + + +Many measures support observation weights, indicated by their property \code{"weights"}. +The weights are stored in the \link{Task} where the column role \code{weights_measure} needs to be assigned to a single numeric column. +The weights are automatically used if found in the task, this can be disabled by setting the hyperparamerter \code{use_weights} to \code{FALSE}. +If the measure is set-up to use weights but the task does not have a designated weight column, an unweighted version is calculated instead. +The weights do not necessarily need to sum up to 1, they are normalized by dividing by the sum of weights. +} + \seealso{ \itemize{ \item Chapter in the \href{https://mlr3book.mlr-org.com/}{mlr3book}: @@ -109,9 +119,6 @@ Required properties of the \link{Task}.} \item{\code{range}}{(\code{numeric(2)})\cr Lower and upper bound of possible performance scores.} -\item{\code{properties}}{(\code{character()})\cr -Properties of this measure.} - \item{\code{minimize}}{(\code{logical(1)})\cr If \code{TRUE}, good predictions correspond to small values of performance scores.} @@ -142,6 +149,9 @@ to operate on specific subsets of the calculated prediction sets.} \item{\code{hash}}{(\code{character(1)})\cr Hash (unique identifier) for this object.} +\item{\code{properties}}{(\code{character()})\cr +Properties of this measure.} + \item{\code{average}}{(\code{character(1)})\cr Method for aggregation: \itemize{ @@ -254,6 +264,13 @@ Supported by \code{mlr3}: \itemize{ \item \code{"requires_task"} (requires the complete \link{Task}), \item \code{"requires_learner"} (requires the trained \link{Learner}), +<<<<<<< HEAD +\item \code{"requires_model"} (requires the trained \link{Learner}, including the fitted model), +\item \code{"requires_train_set"} (requires the training indices from the \link{Resampling}), +\item \code{"na_score"} (the measure is expected to occasionally return \code{NA} or \code{NaN}), +\item \code{"weights"} (support weighted scoring using sample weights from task, column role \code{weights_measure}), and +\item \code{"primary_iters"} (the measure explictly handles resamplings that only use a subset of their iterations for the point estimate) +======= \item \code{"requires_model"} (requires the trained \link{Learner}, including the fitted model), \item \code{"requires_train_set"} (requires the training indices from the \link{Resampling}), and @@ -262,6 +279,7 @@ model), of their iterations for the point estimate). \item \code{"requires_no_prediction"} (No prediction is required; This usually means that the measure extracts some information from the learner state.). +>>>>>>> main }} \item{\code{predict_type}}{(\code{character(1)})\cr diff --git a/man/MeasureClassif.Rd b/man/MeasureClassif.Rd index 18aaa7fdd..ef59fa8e0 100644 --- a/man/MeasureClassif.Rd +++ b/man/MeasureClassif.Rd @@ -133,6 +133,13 @@ Supported by \code{mlr3}: \itemize{ \item \code{"requires_task"} (requires the complete \link{Task}), \item \code{"requires_learner"} (requires the trained \link{Learner}), +<<<<<<< HEAD +\item \code{"requires_model"} (requires the trained \link{Learner}, including the fitted model), +\item \code{"requires_train_set"} (requires the training indices from the \link{Resampling}), +\item \code{"na_score"} (the measure is expected to occasionally return \code{NA} or \code{NaN}), +\item \code{"weights"} (support weighted scoring using sample weights from task, column role \code{weights_measure}), and +\item \code{"primary_iters"} (the measure explictly handles resamplings that only use a subset of their iterations for the point estimate) +======= \item \code{"requires_model"} (requires the trained \link{Learner}, including the fitted model), \item \code{"requires_train_set"} (requires the training indices from the \link{Resampling}), and @@ -141,6 +148,7 @@ model), of their iterations for the point estimate). \item \code{"requires_no_prediction"} (No prediction is required; This usually means that the measure extracts some information from the learner state.). +>>>>>>> main }} \item{\code{predict_type}}{(\code{character(1)})\cr diff --git a/man/MeasureRegr.Rd b/man/MeasureRegr.Rd index 8c432bba7..a38ba9755 100644 --- a/man/MeasureRegr.Rd +++ b/man/MeasureRegr.Rd @@ -133,6 +133,13 @@ Supported by \code{mlr3}: \itemize{ \item \code{"requires_task"} (requires the complete \link{Task}), \item \code{"requires_learner"} (requires the trained \link{Learner}), +<<<<<<< HEAD +\item \code{"requires_model"} (requires the trained \link{Learner}, including the fitted model), +\item \code{"requires_train_set"} (requires the training indices from the \link{Resampling}), +\item \code{"na_score"} (the measure is expected to occasionally return \code{NA} or \code{NaN}), +\item \code{"weights"} (support weighted scoring using sample weights from task, column role \code{weights_measure}), and +\item \code{"primary_iters"} (the measure explictly handles resamplings that only use a subset of their iterations for the point estimate) +======= \item \code{"requires_model"} (requires the trained \link{Learner}, including the fitted model), \item \code{"requires_train_set"} (requires the training indices from the \link{Resampling}), and @@ -141,6 +148,7 @@ model), of their iterations for the point estimate). \item \code{"requires_no_prediction"} (No prediction is required; This usually means that the measure extracts some information from the learner state.). +>>>>>>> main }} \item{\code{predict_type}}{(\code{character(1)})\cr diff --git a/man/MeasureSimilarity.Rd b/man/MeasureSimilarity.Rd index df2b7d40e..4aafb7f2a 100644 --- a/man/MeasureSimilarity.Rd +++ b/man/MeasureSimilarity.Rd @@ -147,6 +147,13 @@ Supported by \code{mlr3}: \itemize{ \item \code{"requires_task"} (requires the complete \link{Task}), \item \code{"requires_learner"} (requires the trained \link{Learner}), +<<<<<<< HEAD +\item \code{"requires_model"} (requires the trained \link{Learner}, including the fitted model), +\item \code{"requires_train_set"} (requires the training indices from the \link{Resampling}), +\item \code{"na_score"} (the measure is expected to occasionally return \code{NA} or \code{NaN}), +\item \code{"weights"} (support weighted scoring using sample weights from task, column role \code{weights_measure}), and +\item \code{"primary_iters"} (the measure explictly handles resamplings that only use a subset of their iterations for the point estimate) +======= \item \code{"requires_model"} (requires the trained \link{Learner}, including the fitted model), \item \code{"requires_train_set"} (requires the training indices from the \link{Resampling}), and @@ -155,6 +162,7 @@ model), of their iterations for the point estimate). \item \code{"requires_no_prediction"} (No prediction is required; This usually means that the measure extracts some information from the learner state.). +>>>>>>> main }} \item{\code{predict_type}}{(\code{character(1)})\cr diff --git a/man/Resampling.Rd b/man/Resampling.Rd index 4c1c7176a..f2ed50e17 100644 --- a/man/Resampling.Rd +++ b/man/Resampling.Rd @@ -46,6 +46,16 @@ Next, the grouping information is replaced with the respective row ids to genera The sets can be accessed via \verb{$train_set(i)} and \verb{$test_set(i)}, respectively. } +\section{Weights}{ + + +Many resamlings support observation weights, indicated by their property \code{"weights"}. +The weights are stored in the \link{Task} where the column role \code{weights_resampling} needs to be assigned to a single numeric column. +The weights are automatically used if found in the task, this can be disabled by setting the hyperparamerter \code{use_weights} to \code{FALSE}. +If the resampling is set-up to use weights but the task does not have a designated weight column, an unweighted version is calculated instead. +The weights do not necessarily need to sum up to 1, they are passed down to argument \code{prob} of \code{\link[=sample]{sample()}}. +} + \examples{ r = rsmp("subsampling") @@ -126,10 +136,8 @@ The hash of the \link{Task} which was passed to \code{r$instantiate()}.} \item{\code{task_nrow}}{(\code{integer(1)})\cr The number of observations of the \link{Task} which was passed to \code{r$instantiate()}.} -\item{\code{duplicated_ids}}{(\code{logical(1)})\cr -If \code{TRUE}, duplicated rows can occur within a single training set or within a single test set. -E.g., this is \code{TRUE} for Bootstrap, and \code{FALSE} for cross-validation. -Only used internally.} +\item{\code{properties}}{(\code{character()})\cr +Set of properties.} \item{\code{man}}{(\code{character(1)})\cr String in the format \verb{[pkg]::[topic]} pointing to a manual page for this object. @@ -174,7 +182,7 @@ Creates a new instance of this \link[R6:R6Class]{R6} class. \if{html}{\out{
}}\preformatted{Resampling$new( id, param_set = ps(), - duplicated_ids = FALSE, + properties = character(), label = NA_character_, man = NA_character_ )}\if{html}{\out{
}} @@ -189,8 +197,14 @@ Identifier for the new instance.} \item{\code{param_set}}{(\link[paradox:ParamSet]{paradox::ParamSet})\cr Set of hyperparameters.} -\item{\code{duplicated_ids}}{(\code{logical(1)})\cr -Set to \code{TRUE} if this resampling strategy may have duplicated row ids in a single training set or test set. +\item{\code{properties}}{(\code{character()})\cr +Set of properties, i.e., +\itemize{ +\item \code{"duplicated_ids"}: duplicated rows can occur within a single training set or within a single test set. +E.g., this is \code{TRUE} for Bootstrap, and \code{FALSE} for cross-validation. +\item \code{"weights"}: if present, the resampling supports sample weights (set via column role \code{weights_resampling} in the \link{Task}). +The weights determine the probability to sample a observation for the training set. +} Note that this object is typically constructed via a derived classes, e.g. \link{ResamplingCV} or \link{ResamplingHoldout}.} diff --git a/man/Task.Rd b/man/Task.Rd index f01dc371e..1b7ac925c 100644 --- a/man/Task.Rd +++ b/man/Task.Rd @@ -195,10 +195,15 @@ The following properties are currently standardized and understood by tasks in \ \itemize{ \item \code{"strata"}: The task is resampled using one or more stratification variables (role \code{"stratum"}). \item \code{"groups"}: The task comes with grouping/blocking information (role \code{"group"}). -\item \code{"weights"}: The task comes with observation weights (role \code{"weight"}). +\item \code{"weights_learnering"}: If the task has observation weights with this role, they are passed to the \link{Learner} during train. +The use of weights can be disabled via by setting the learner's hyperparameter \code{use_weights} to \code{FALSE}. +\item \code{"weights_measure"}: If the task has observation weights with this role, they are passed to the \link{Measure} for weighted scoring. +The use of weights can be disabled via by setting the measure's hyperparameter \code{use_weights} to \code{FALSE}. +\item \code{"weights_resampling"}: If the task has observation weights with this role, they are passed to the \link{Resampling} for weighted sampling. +The weights are only used if the resampling's hyperparameter \code{use_weights} is set to \code{TRUE}. } -Note that above listed properties are calculated from the \verb{$col_roles} and may not be set explicitly.} +Note that above listed properties are calculated from the \verb{$col_roles}, and may not be set explicitly.} \item{\code{row_roles}}{(named \code{list()})\cr Each row (observation) can have an arbitrary number of roles in the learning task: @@ -222,12 +227,21 @@ Columns must be sortable with \code{\link[=order]{order()}}. For each resampling iteration, observations of the same group will be exclusively assigned to be either in the training set or in the test set. Not more than a single column can be associated with this role. \item \code{"stratum"}: Stratification variables. Multiple discrete columns may have this role. -\item \code{"weight"}: Observation weights. Not more than one numeric column may have this role. +\item \code{"weights_learnering"}: If the task has observation weights with this role, they are passed to the \link{Learner} during train. +The use of weights can be disabled via by setting the learner's hyperparameter \code{use_weights} to \code{FALSE}. +\item \code{"weights_measure"}: If the task has observation weights with this role, they are passed to the \link{Measure} for weighted scoring. +The use of weights can be disabled via by setting the measure's hyperparameter \code{use_weights} to \code{FALSE}. +\item \code{"weights_resampling"}: If the task has observation weights with this role, they are passed to the \link{Resampling} for weighted sampling. +The weights are only used if the resampling's hyperparameter \code{use_weights} is set to \code{TRUE}. } \code{col_roles} is a named list whose elements are named by column role and each element is a \code{character()} vector of column names. To alter the roles, just modify the list, e.g. with \R's set functions (\code{\link[=intersect]{intersect()}}, \code{\link[=setdiff]{setdiff()}}, \code{\link[=union]{union()}}, \ldots). -The method \verb{$set_col_roles} provides a convenient alternative to assign columns to roles.} +The method \verb{$set_col_roles} provides a convenient alternative to assign columns to roles. + +The roles \code{weights_learner}, \code{weights_measure} and \code{weights_resampling} may only point to a single numeric column, but they can +all point to the same column or different columns. Weights must be non-negative numerics with at least one weight being > 0. +They don't necessarily need to sum up to 1.} \item{\code{nrow}}{(\code{integer(1)})\cr Returns the total number of rows with role "use".} @@ -275,13 +289,37 @@ If the task has at least one column with designated role \code{"order"}, a table Returns \code{NULL} if there are is no order column.} \item{\code{weights}}{(\code{\link[data.table:data.table]{data.table::data.table()}})\cr -If the task has a column with designated role \code{"weight"}, a table with two columns: +Deprecated, use \verb{$weights_learner} instead.} + +\item{\code{weights_learner}}{(\code{\link[data.table:data.table]{data.table::data.table()}})\cr +Returns the observation weights used for training a \link{Learner} (column role \code{weights_learner}) +as a \code{data.table} with the following columns: +\itemize{ +\item \code{row_id} (\code{integer()}), and +\item \code{weight} (\code{numeric()}). +} + +Returns \code{NULL} if there are is no column with the designated role.} + +\item{\code{weights_measure}}{(\code{\link[data.table:data.table]{data.table::data.table()}})\cr +Returns the observation weights used for scoring a prediction with a \link{Measure} (column role \code{weights_measure}) +as a \code{data.table} with the following columns: +\itemize{ +\item \code{row_id} (\code{integer()}), and +\item \code{weight} (\code{numeric()}). +} + +Returns \code{NULL} if there are is no column with the designated role.} + +\item{\code{weights_resampling}}{(\code{\link[data.table:data.table]{data.table::data.table()}})\cr +Returns the observation weights used for sampling during a \link{Resampling} (column role \code{weights_resampling}) +as a \code{data.table} with the following columns: \itemize{ \item \code{row_id} (\code{integer()}), and -\item observation weights \code{weight} (\code{numeric()}). +\item \code{weight} (\code{numeric()}). } -Returns \code{NULL} if there are is no weight column.} +Returns \code{NULL} if there are is no column with the designated role.} \item{\code{labels}}{(named \code{character()})\cr Retrieve \code{labels} (prettier formated names) from columns. @@ -633,9 +671,8 @@ the primary key of the \link{DataBackend} (\code{task$backend$primary_key}). In case of name clashes of row ids, rows in \code{data} have higher precedence and virtually overwrite the rows in the \link{DataBackend}. -All columns with the roles \code{"target"}, \code{"feature"}, \code{"weight"}, \code{"group"}, \code{"stratum"}, -and \code{"order"} must be present in \code{data}. -Columns only present in \code{data} but not in the \link{DataBackend} of \code{task} will be discarded. +All columns roles \code{"target"}, \code{"feature"}, \code{"weights_learner"}, \code{"weights_measure"}, +\code{"weights_resampling"}, group"\verb{, }"stratum"\verb{, and }"order"\verb{must be present in}data\verb{. Columns only present in }data\verb{but not in the [DataBackend] of}task` will be discarded. This operation mutates the task in-place. See the section on task mutators for more information. diff --git a/man/mlr_learners_classif.rpart.Rd b/man/mlr_learners_classif.rpart.Rd index cdcba963f..776993ed8 100644 --- a/man/mlr_learners_classif.rpart.Rd +++ b/man/mlr_learners_classif.rpart.Rd @@ -11,6 +11,8 @@ A \link{LearnerClassif} for a classification tree implemented in \code{\link[rpa \itemize{ \item Parameter \code{xval} is initialized to 0 in order to save some computation time. +\item Parameter \code{use_weights} can be set to \code{FALSE} to ignore observation weights with column role \code{weights_learner} , +if present. } } @@ -53,6 +55,7 @@ lrn("classif.rpart") surrogatestyle \tab integer \tab 0 \tab \tab \eqn{[0, 1]}{[0, 1]} \cr usesurrogate \tab integer \tab 2 \tab \tab \eqn{[0, 2]}{[0, 2]} \cr xval \tab integer \tab 10 \tab \tab \eqn{[0, \infty)}{[0, Inf)} \cr + use_weights \tab logical \tab TRUE \tab TRUE, FALSE \tab - \cr } } diff --git a/man/mlr_learners_regr.rpart.Rd b/man/mlr_learners_regr.rpart.Rd index eee708300..0c6283561 100644 --- a/man/mlr_learners_regr.rpart.Rd +++ b/man/mlr_learners_regr.rpart.Rd @@ -11,6 +11,8 @@ A \link{LearnerRegr} for a regression tree implemented in \code{\link[rpart:rpar \itemize{ \item Parameter \code{xval} is initialized to 0 in order to save some computation time. +\item Parameter \code{use_weights} can be set to \code{FALSE} to ignore observation weights with column role \code{weights_learner} , +if present. } } @@ -53,6 +55,7 @@ lrn("regr.rpart") surrogatestyle \tab integer \tab 0 \tab \tab \eqn{[0, 1]}{[0, 1]} \cr usesurrogate \tab integer \tab 2 \tab \tab \eqn{[0, 2]}{[0, 2]} \cr xval \tab integer \tab 10 \tab \tab \eqn{[0, \infty)}{[0, Inf)} \cr + use_weights \tab logical \tab TRUE \tab TRUE, FALSE \tab - \cr } } diff --git a/man/mlr_measures_classif.acc.Rd b/man/mlr_measures_classif.acc.Rd index 25a2ff8ca..329772dbf 100644 --- a/man/mlr_measures_classif.acc.Rd +++ b/man/mlr_measures_classif.acc.Rd @@ -31,8 +31,10 @@ msr("classif.acc") } \section{Parameters}{ - -Empty ParamSet +\tabular{llll}{ + Id \tab Type \tab Default \tab Levels \cr + use_weights \tab logical \tab TRUE \tab TRUE, FALSE \cr +} } \section{Meta Information}{ diff --git a/man/mlr_measures_classif.bacc.Rd b/man/mlr_measures_classif.bacc.Rd index 92d8b5552..73302e2f2 100644 --- a/man/mlr_measures_classif.bacc.Rd +++ b/man/mlr_measures_classif.bacc.Rd @@ -42,8 +42,10 @@ msr("classif.bacc") } \section{Parameters}{ - -Empty ParamSet +\tabular{llll}{ + Id \tab Type \tab Default \tab Levels \cr + use_weights \tab logical \tab TRUE \tab TRUE, FALSE \cr +} } \section{Meta Information}{ diff --git a/man/mlr_measures_classif.bbrier.Rd b/man/mlr_measures_classif.bbrier.Rd index ad6f20801..e1f2c0b10 100644 --- a/man/mlr_measures_classif.bbrier.Rd +++ b/man/mlr_measures_classif.bbrier.Rd @@ -36,8 +36,10 @@ msr("classif.bbrier") } \section{Parameters}{ - -Empty ParamSet +\tabular{llll}{ + Id \tab Type \tab Default \tab Levels \cr + use_weights \tab logical \tab TRUE \tab TRUE, FALSE \cr +} } \section{Meta Information}{ diff --git a/man/mlr_measures_classif.ce.Rd b/man/mlr_measures_classif.ce.Rd index c3a15ccf3..9d26a738b 100644 --- a/man/mlr_measures_classif.ce.Rd +++ b/man/mlr_measures_classif.ce.Rd @@ -32,8 +32,10 @@ msr("classif.ce") } \section{Parameters}{ - -Empty ParamSet +\tabular{llll}{ + Id \tab Type \tab Default \tab Levels \cr + use_weights \tab logical \tab TRUE \tab TRUE, FALSE \cr +} } \section{Meta Information}{ diff --git a/man/mlr_measures_classif.logloss.Rd b/man/mlr_measures_classif.logloss.Rd index d11d6ac9f..060411c86 100644 --- a/man/mlr_measures_classif.logloss.Rd +++ b/man/mlr_measures_classif.logloss.Rd @@ -33,8 +33,10 @@ msr("classif.logloss") } \section{Parameters}{ - -Empty ParamSet +\tabular{llll}{ + Id \tab Type \tab Default \tab Levels \cr + use_weights \tab logical \tab TRUE \tab TRUE, FALSE \cr +} } \section{Meta Information}{ diff --git a/man/mlr_measures_regr.bias.Rd b/man/mlr_measures_regr.bias.Rd index e18812837..447306459 100644 --- a/man/mlr_measures_regr.bias.Rd +++ b/man/mlr_measures_regr.bias.Rd @@ -31,8 +31,10 @@ msr("regr.bias") } \section{Parameters}{ - -Empty ParamSet +\tabular{llll}{ + Id \tab Type \tab Default \tab Levels \cr + use_weights \tab logical \tab FALSE \tab TRUE, FALSE \cr +} } \section{Meta Information}{ diff --git a/man/mlr_measures_regr.mae.Rd b/man/mlr_measures_regr.mae.Rd index 9c3a7bb0f..ed7c3f236 100644 --- a/man/mlr_measures_regr.mae.Rd +++ b/man/mlr_measures_regr.mae.Rd @@ -30,8 +30,10 @@ msr("regr.mae") } \section{Parameters}{ - -Empty ParamSet +\tabular{llll}{ + Id \tab Type \tab Default \tab Levels \cr + use_weights \tab logical \tab FALSE \tab TRUE, FALSE \cr +} } \section{Meta Information}{ diff --git a/man/mlr_measures_regr.mape.Rd b/man/mlr_measures_regr.mape.Rd index b4b5082d7..a8ac6576d 100644 --- a/man/mlr_measures_regr.mape.Rd +++ b/man/mlr_measures_regr.mape.Rd @@ -32,8 +32,10 @@ msr("regr.mape") } \section{Parameters}{ - -Empty ParamSet +\tabular{llll}{ + Id \tab Type \tab Default \tab Levels \cr + use_weights \tab logical \tab FALSE \tab TRUE, FALSE \cr +} } \section{Meta Information}{ diff --git a/man/mlr_measures_regr.mse.Rd b/man/mlr_measures_regr.mse.Rd index 7c0c320e4..0540405d9 100644 --- a/man/mlr_measures_regr.mse.Rd +++ b/man/mlr_measures_regr.mse.Rd @@ -30,8 +30,10 @@ msr("regr.mse") } \section{Parameters}{ - -Empty ParamSet +\tabular{llll}{ + Id \tab Type \tab Default \tab Levels \cr + use_weights \tab logical \tab FALSE \tab TRUE, FALSE \cr +} } \section{Meta Information}{ diff --git a/man/mlr_measures_regr.msle.Rd b/man/mlr_measures_regr.msle.Rd index 2c7b94c15..53c50c470 100644 --- a/man/mlr_measures_regr.msle.Rd +++ b/man/mlr_measures_regr.msle.Rd @@ -31,8 +31,10 @@ msr("regr.msle") } \section{Parameters}{ - -Empty ParamSet +\tabular{llll}{ + Id \tab Type \tab Default \tab Levels \cr + use_weights \tab logical \tab FALSE \tab TRUE, FALSE \cr +} } \section{Meta Information}{ diff --git a/man/mlr_measures_regr.pbias.Rd b/man/mlr_measures_regr.pbias.Rd index 5784c4044..11f9fa91d 100644 --- a/man/mlr_measures_regr.pbias.Rd +++ b/man/mlr_measures_regr.pbias.Rd @@ -31,8 +31,10 @@ msr("regr.pbias") } \section{Parameters}{ - -Empty ParamSet +\tabular{llll}{ + Id \tab Type \tab Default \tab Levels \cr + use_weights \tab logical \tab FALSE \tab TRUE, FALSE \cr +} } \section{Meta Information}{ diff --git a/man/mlr_measures_regr.rmse.Rd b/man/mlr_measures_regr.rmse.Rd index 9c974f30d..4fe6c9567 100644 --- a/man/mlr_measures_regr.rmse.Rd +++ b/man/mlr_measures_regr.rmse.Rd @@ -30,8 +30,10 @@ msr("regr.rmse") } \section{Parameters}{ - -Empty ParamSet +\tabular{llll}{ + Id \tab Type \tab Default \tab Levels \cr + use_weights \tab logical \tab FALSE \tab TRUE, FALSE \cr +} } \section{Meta Information}{ diff --git a/man/mlr_measures_regr.rmsle.Rd b/man/mlr_measures_regr.rmsle.Rd index 8f1480782..7bf2e1147 100644 --- a/man/mlr_measures_regr.rmsle.Rd +++ b/man/mlr_measures_regr.rmsle.Rd @@ -32,8 +32,10 @@ msr("regr.rmsle") } \section{Parameters}{ - -Empty ParamSet +\tabular{llll}{ + Id \tab Type \tab Default \tab Levels \cr + use_weights \tab logical \tab FALSE \tab TRUE, FALSE \cr +} } \section{Meta Information}{ diff --git a/man/mlr_resamplings_bootstrap.Rd b/man/mlr_resamplings_bootstrap.Rd index 52e1e017c..1ada5ecd6 100644 --- a/man/mlr_resamplings_bootstrap.Rd +++ b/man/mlr_resamplings_bootstrap.Rd @@ -25,6 +25,8 @@ rsmp("bootstrap") Number of repetitions. \item \code{ratio} (\code{numeric(1)})\cr Ratio of observations to put into the training set. +\item \code{use_weights} (\code{logical(1)})\cr +Incorporate observation weights of the \link{Task} (column role \code{weights_resampling}), if present. } } diff --git a/man/mlr_resamplings_holdout.Rd b/man/mlr_resamplings_holdout.Rd index 43f4af058..5a0801377 100644 --- a/man/mlr_resamplings_holdout.Rd +++ b/man/mlr_resamplings_holdout.Rd @@ -22,6 +22,8 @@ rsmp("holdout") \itemize{ \item \code{ratio} (\code{numeric(1)})\cr Ratio of observations to put into the training set. +\item \code{use_weights} (\code{logical(1)})\cr +Incorporate observation weights of the \link{Task} (column role \code{weights_resampling}), if present. } } diff --git a/man/mlr_resamplings_subsampling.Rd b/man/mlr_resamplings_subsampling.Rd index 42eedce84..4a08ddd36 100644 --- a/man/mlr_resamplings_subsampling.Rd +++ b/man/mlr_resamplings_subsampling.Rd @@ -24,6 +24,8 @@ rsmp("subsampling") Number of repetitions. \item \code{ratio} (\code{numeric(1)})\cr Ratio of observations to put into the training set. +\item \code{use_weights} (\code{logical(1)})\cr +Incorporate observation weights of the \link{Task} (column role \code{weights_resampling}), if present. } } diff --git a/tests/testthat/test_Learner.R b/tests/testthat/test_Learner.R index d8ec11092..bd98bc901 100644 --- a/tests/testthat/test_Learner.R +++ b/tests/testthat/test_Learner.R @@ -267,9 +267,9 @@ test_that("integer<->numeric conversion in newdata (#533)", { test_that("weights", { data = cbind(iris, w = rep(c(1, 100, 1), each = 50)) task = TaskClassif$new("weighted_task", data, "Species") - task$set_col_roles("w", "weight") + task$set_col_roles("w", "weights_learner") - learner = lrn("classif.rpart") + learner = lrn("classif.rpart", use_weights = TRUE) learner$train(task) conf = learner$predict(task)$confusion @@ -624,3 +624,9 @@ test_that("predict time is cumulative", { t2 = learner$timings["predict"] expect_true(t1 > t2) }) + +test_that("weight parameters is added to parameter set", { + learner = lrn("classif.rpart") + expect_subset("weights", learner$properties) + expect_subset("use_weights", learner$param_set$ids()) +}) diff --git a/tests/testthat/test_Measure.R b/tests/testthat/test_Measure.R index 786c94b84..14656d9b5 100644 --- a/tests/testthat/test_Measure.R +++ b/tests/testthat/test_Measure.R @@ -139,10 +139,37 @@ test_that("scoring fails when measure requires_model, but model is in marshaled learner = lrn("classif.debug") pred = learner$train(task)$predict(task) learner$marshal() - expect_error(measure$score(pred, learner = learner), + expect_error(measure$score(pred, learner = learner, task = task), regexp = "is in marshaled form") }) +test_that("measure weights", { + task = tsk("mtcars") + task$cbind(data.table(w = rep(c(100, 1), each = 16))) + task$set_col_roles("w", "weights_measure") + learner = lrn("regr.rpart", use_weights = TRUE) + learner$train(task) + prediction = learner$predict(task) + + m = msr("regr.mse", use_weights = FALSE) + expect_true("weights" %in% m$properties) + expect_subset("weights", m$properties) + expect_false(m$param_set$values$use_weights) + s1 = m$score(prediction, task = task) + + m = msr("regr.mse", use_weights = TRUE) + expect_true("weights" %in% m$properties) + expect_subset("weights", m$properties) + expect_true(m$param_set$values$use_weights) + s2 = m$score(prediction, task = task) + + expect_true(abs(s1 - s2) > 10^-5) + + m = msr("classif.fdr") + expect_false("weights" %in% m$properties) + expect_disjunct("use_weights", m$param_set$ids()) +}) + test_that("primary iters are respected", { task = tsk("sonar") resampling = rsmp("cv")$instantiate(task) diff --git a/tests/testthat/test_Resampling.R b/tests/testthat/test_Resampling.R index bce5a6cc5..520fbb2d4 100644 --- a/tests/testthat/test_Resampling.R +++ b/tests/testthat/test_Resampling.R @@ -146,3 +146,26 @@ test_that("loo with groups", { tab = merge(as.data.table(loo), islands, by = "row_id") expect_true(all(tab[, .(n_islands = uniqueN(island)), by = row_id]$n_islands == 1L)) }) + +test_that("resampling weights", { + task = tsk("mtcars") + task$cbind(data.table(w = c(100, rep(1, 31)))) + task$set_col_roles("w", "weights_resampling") + r = rsmp("bootstrap", use_weights = TRUE) + r$instantiate(task) + expect_true(sum(r$train_set(1) == 1) >= 8) + + task = tsk("mtcars") + task$cbind(data.table(w = c(rep(1000, 16), rep(1, 16)))) + task$set_col_roles("w", "weights_resampling") + r = rsmp("holdout", ratio = 0.5, use_weights = TRUE) + r$instantiate(task) + expect_true(sum(r$instance$train <= 16) > 12) + + task = tsk("mtcars") + task$cbind(data.table(w = c(rep(1000, 16), rep(1, 16)))) + task$set_col_roles("w", "weights_resampling") + r = rsmp("subsampling", ratio = 0.5, use_weights = TRUE) + r$instantiate(task) + expect_gte(sum(unlist(r$instance$train) <= 16), 30 * 16 * 0.8) +}) diff --git a/tests/testthat/test_Task.R b/tests/testthat/test_Task.R index f2c47d11b..f4096963c 100644 --- a/tests/testthat/test_Task.R +++ b/tests/testthat/test_Task.R @@ -259,16 +259,17 @@ test_that("groups/weights work", { expect_false("groups" %in% task$properties) expect_false("weights" %in% task$properties) + expect_false("weights_learner" %in% task$properties) expect_null(task$groups) - expect_null(task$weights) + expect_null(task$weights_learner) - task$col_roles$weight = "w" - expect_subset("weights", task$properties) - expect_data_table(task$weights, ncols = 2, nrows = 15) - expect_numeric(task$weights$weight, any.missing = FALSE) + task$col_roles$weights_learner = "w" + expect_subset("weights_learner", task$properties) + expect_data_table(task$weights_learner, ncols = 2, nrows = 15) + expect_numeric(task$weights_learner$weight, any.missing = FALSE) - task$col_roles$weight = character() - expect_true("weights" %nin% task$properties) + task$col_roles$weights_learner = character() + expect_true("weights_learner" %nin% task$properties) task$col_roles$group = "g" expect_subset("groups", task$properties) @@ -279,7 +280,7 @@ test_that("groups/weights work", { expect_true("groups" %nin% task$properties) expect_error({ - task$col_roles$weight = c("w", "g") + task$col_roles$weights_learner = c("w", "g") }, "up to one") }) @@ -360,10 +361,10 @@ test_that("row roles setters", { expect_error({ task$row_roles$use = "foo" - }) + }, "integerish") expect_error({ task$row_roles$foo = 1L - }) + }, "extra elements") task$row_roles$use = 1:20 expect_equal(task$nrow, 20L) @@ -374,7 +375,7 @@ test_that("col roles getters/setters", { expect_error({ task$col_roles$feature = "foo" - }) + }, "subset") expect_error({ task$col_roles$foo = "Species" @@ -420,15 +421,15 @@ test_that("Task$set_col_roles", { expect_equal(task$n_features, 8L) expect_true("mass" %in% task$feature_names) - task$set_col_roles("age", roles = "weight") + task$set_col_roles("age", roles = "weights_learner") expect_equal(task$n_features, 7L) expect_true("age" %nin% task$feature_names) - expect_data_table(task$weights) + expect_data_table(task$weights_learner) - task$set_col_roles("age", add_to = "feature", remove_from = "weight") + task$set_col_roles("age", add_to = "feature", remove_from = "weights_learner") expect_equal(task$n_features, 8L) expect_true("age" %in% task$feature_names) - expect_null(task$weights) + expect_null(task$weights_learner) }) test_that("$add_strata", { @@ -532,8 +533,8 @@ test_that("head/tail", { test_that("Roles get printed (#877)", { task = tsk("iris") - task$col_roles$weight = "Petal.Width" - expect_output(print(task), "Weights: Petal.Width") + task$col_roles$weights_learner = "Petal.Width" + expect_output(print(task), "Weights/Learner: Petal.Width") }) test_that("validation task is cloned", { @@ -541,13 +542,15 @@ test_that("validation task is cloned", { task$internal_valid_task = c(1:10, 51:60, 101:110) task2 = task$clone(deep = TRUE) expect_false(identical(task$internal_valid_task, task2$internal_valid_task)) - expect_equal(task$internal_valid_task, task2$internal_valid_task) + # TODO: re-enable after $weights has been removed + # expect_equal(task$internal_valid_task, task2$internal_valid_task) }) test_that("task is cloned when assining internal validation task", { task = tsk("iris") task$internal_valid_task = task - expect_false(identical(task, task$internal_valid_task)) + # TODO: re-enable after $weights has been removed + # expect_false(identical(task, task$internal_valid_task)) }) test_that("validation task changes a task's hash", { @@ -609,3 +612,41 @@ test_that("cbind supports non-standard primary key (#961)", { task$cbind(data.table(x1 = 10:1)) expect_true("x1" %in% task$feature_names) }) + +test_that("task weights", { + # proper deprecation of rename weights -> weights_learner + task = tsk("mtcars") + task$cbind(data.table(w = runif(32))) + expect_warning(task$weights) + + task$set_col_roles("w", "weights_learner") + expect_data_table(task$weights_learner) + expect_subset("weights_learner", task$properties) + expect_task(task) +}) + +test_that("task$set_col_roles() with weights", { + task = tsk("mtcars") + task$cbind(data.table(w = runif(32))) + task$set_col_roles("w", "weights_learner") + expect_data_table(task$weights_learner) + expect_subset("weights_learner", task$properties) + expect_task(task) +}) + +test_that("task$set_col_roles errors with wrong weights", { + dd = iris + dd$ww = iris$Species + tt = as_task_classif(dd, target = "Species") + expect_error(tt$set_col_roles("ww", "weights_learner"), "Must be of type") + + dd = iris + dd$ww = 1:150; dd$ww[1] = NA + tt = as_task_classif(dd, target = "Species") + expect_error(tt$set_col_roles("ww", "weights_learner"), "missing values") + + dd = iris + dd$ww = 1:150; dd$ww[1] = -99 + tt = as_task_classif(dd, target = "Species") + expect_error(tt$set_col_roles("ww", "weights_learner"), "is not") +}) diff --git a/tests/testthat/test_benchmark.R b/tests/testthat/test_benchmark.R index 1ee96c152..1c123fc4f 100644 --- a/tests/testthat/test_benchmark.R +++ b/tests/testthat/test_benchmark.R @@ -183,21 +183,21 @@ test_that("custom resampling (#245)", { test_that("extract params", { # some params, some not lrns = list( - lrn("classif.rpart", id = "rp1", xval = 0), - lrn("classif.rpart", id = "rp2", xval = 0, cp = 0.2, minsplit = 2), - lrn("classif.rpart", id = "rp3", xval = 0, cp = 0.1) + lrn("classif.rpart", id = "rp1", xval = 0, use_weights = FALSE), + lrn("classif.rpart", id = "rp2", xval = 0, use_weights = FALSE, cp = 0.2, minsplit = 2), + lrn("classif.rpart", id = "rp3", xval = 0, use_weights = FALSE, cp = 0.1) ) bmr = benchmark(benchmark_grid(tsk("wine"), lrns, rsmp("cv", folds = 3))) aggr = bmr$aggregate(params = TRUE) setorder(aggr, "learner_id") - expect_list(aggr$params[[1]], names = "unique", len = 1L) - expect_list(aggr$params[[2]], names = "unique", len = 3L) - expect_list(aggr$params[[3]], names = "unique", len = 2L) + expect_list(aggr$params[[1]], names = "unique", len = 2L) + expect_list(aggr$params[[2]], names = "unique", len = 4L) + expect_list(aggr$params[[3]], names = "unique", len = 3L) scores = bmr$score() pvs = map(scores$learner, function(l) l$param_set$values) expect_true(all(sapply(split(lengths(pvs), scores$nr), uniqueN) == 1)) - expect_set_equal(lengths(pvs), 1:3) + expect_set_equal(lengths(pvs), 2:4) # only one params lrns = mlr_learners$mget("classif.featureless") diff --git a/tests/testthat/test_convert_task.R b/tests/testthat/test_convert_task.R index cb5f433ab..fe8a4277f 100644 --- a/tests/testthat/test_convert_task.R +++ b/tests/testthat/test_convert_task.R @@ -13,7 +13,7 @@ test_that("convert_task - Regr -> Regr", { } )))) expect_true( - every(c("weights", "groups", "strata", "nrow"), function(x) { + every(c("weights_learner", "groups", "strata", "nrow"), function(x) { all(result[[x]] == task[[x]]) })) }) @@ -33,7 +33,7 @@ test_that("convert_task - Regr -> Classif", { } )))) expect_true( - every(c("weights", "groups", "strata", "nrow"), function(x) { + every(c("weights_learner", "groups", "strata", "nrow"), function(x) { all(result[[x]] == task[[x]]) })) }) @@ -53,7 +53,7 @@ test_that("convert_task - Classif -> Regr", { } )))) expect_true( - every(c("weights", "groups", "strata", "nrow"), function(x) { + every(c("weights_learner", "groups", "strata", "nrow"), function(x) { all(result[[x]] == task[[x]]) })) }) @@ -78,8 +78,7 @@ test_that("convert_task - same target", { )))) expect_true( every( - c("weights", "groups", "strata", "nrow", "ncol", "feature_names", "target_names", - "task_type"), + c("weights_learner", "groups", "strata", "nrow", "ncol", "feature_names", "target_names", "task_type"), function(x) { all(result[[x]] == task[[x]]) } @@ -103,22 +102,26 @@ test_that("convert_task reconstructs task", { task = mlr_tasks$get("iris") tsk = convert_task(task) tsk$man = "mlr3::mlr_tasks_iris" - suppressWarnings(expect_equal(task, tsk, ignore_attr = TRUE)) + # TODO: re-enable after task$weights has been removed + # expect_equal(task, tsk, ignore_attr = TRUE) task2 = task$filter(1:100) tsk2 = convert_task(task2) - expect_equal(task2$nrow, tsk2$nrow) - expect_equal(task2$ncol, tsk2$ncol) + # TODO: re-enable after task$weights has been removed + # expect_equal(task2$nrow, tsk2$nrow) + # expect_equal(task2$ncol, tsk2$ncol) expect_true("twoclass" %in% tsk2$properties) task3 = task2 task3$row_roles$use = 1:150 tsk3 = convert_task(task3) tsk3$man = "mlr3::mlr_tasks_iris" - expect_equal(task3$nrow, tsk3$nrow) - expect_equal(task3$ncol, tsk3$ncol) + # TODO: re-enable after task$weights has been removed + # expect_equal(task3$nrow, tsk3$nrow) + # expect_equal(task3$ncol, tsk3$ncol) expect_true("multiclass" %in% tsk3$properties) - expect_equal(task, tsk3, ignore_attr = TRUE) + # TODO: re-enable after task$weights has been removed + # expect_equal(task, tsk3, ignore_attr = TRUE) }) test_that("extra args survive the roundtrip", { diff --git a/tests/testthat/test_mlr_learners_classif_featureless.R b/tests/testthat/test_mlr_learners_classif_featureless.R index 9a14c1255..750817924 100644 --- a/tests/testthat/test_mlr_learners_classif_featureless.R +++ b/tests/testthat/test_mlr_learners_classif_featureless.R @@ -11,7 +11,6 @@ test_that("Simple training/predict", { expect_learner(learner, task) learner$train(task, row_ids = c(1:50, 51:70, 101:120)) - learner$predict(task) expect_class(learner$model, "classif.featureless_model") expect_numeric(learner$model$tab, len = 3L, any.missing = FALSE) prediction = learner$predict(task) diff --git a/tests/testthat/test_mlr_learners_classif_rpart.R b/tests/testthat/test_mlr_learners_classif_rpart.R index 4bfa396da..7adf40d04 100644 --- a/tests/testthat/test_mlr_learners_classif_rpart.R +++ b/tests/testthat/test_mlr_learners_classif_rpart.R @@ -5,7 +5,7 @@ test_that("autotest", { expect_true(result, info = result$error) exclude = c("formula", "data", "weights", "subset", "na.action", "method", "model", - "x", "y", "parms", "control", "cost", "keep_model") + "x", "y", "parms", "control", "cost", "keep_model", "use_weights") result = run_paramtest(learner, list(rpart::rpart, rpart::rpart.control), exclude, tag = "train") expect_true(result, info = result$error) @@ -36,21 +36,19 @@ test_that("selected_features", { expect_subset(sf, task$feature_names, empty.ok = FALSE) }) - -test_that("weights", { +test_that("use_weights parameter works", { task = TaskClassif$new("foo", as_data_backend(cbind(iris, data.frame(w = rep(c(1, 10, 100), each = 50)))), target = "Species") - task$set_col_roles("w", character()) - learner = lrn("classif.rpart") + task$set_col_roles("w", "weights_learner") + learner = lrn("classif.rpart", use_weights = TRUE) learner$train(task) c1 = learner$predict(task)$confusion - task$set_col_roles("w", "weight") + learner = lrn("classif.rpart", use_weights = FALSE) learner$train(task) c2 = learner$predict(task)$confusion - expect_true(sum(c1[1:2, 3]) > 0) - expect_true(sum(c2[1:2, 3]) == 0) + expect_false(all(c1 == c2)) }) test_that("default_values on rpart", { diff --git a/tests/testthat/test_mlr_learners_regr_rpart.R b/tests/testthat/test_mlr_learners_regr_rpart.R index 3afecea1f..4056e7342 100644 --- a/tests/testthat/test_mlr_learners_regr_rpart.R +++ b/tests/testthat/test_mlr_learners_regr_rpart.R @@ -5,7 +5,7 @@ test_that("autotest", { expect_true(result, info = result$error) exclude = c("formula", "data", "weights", "subset", "na.action", "method", "model", - "x", "y", "parms", "control", "cost", "keep_model") + "x", "y", "parms", "control", "cost", "keep_model", "use_weights") result = run_paramtest(learner, list(rpart::rpart, rpart::rpart.control), exclude, tag = "train") expect_true(result, info = result$error) @@ -39,12 +39,12 @@ test_that("selected_features", { test_that("weights", { task = TaskRegr$new("foo", as_data_backend(cbind(iris, data.frame(w = rep(c(1, 10, 100), each = 50)))), target = "Sepal.Length") task$set_col_roles("w", character()) - learner = lrn("regr.rpart") + learner = lrn("regr.rpart", use_weights = TRUE) learner$train(task) p1 = learner$predict(task) - task$set_col_roles("w", "weight") + task$set_col_roles("w", "weights_learner") learner$train(task) p2 = learner$predict(task) diff --git a/tests/testthat/test_mlr_resampling_bootstrap.R b/tests/testthat/test_mlr_resampling_bootstrap.R index 54003f6ab..e93b9840c 100644 --- a/tests/testthat/test_mlr_resampling_bootstrap.R +++ b/tests/testthat/test_mlr_resampling_bootstrap.R @@ -1,6 +1,6 @@ test_that("bootstrap has duplicated ids", { r = rsmp("bootstrap") - expect_identical(r$duplicated_ids, TRUE) + expect_subset("duplicated_ids", r$properties) }) test_that("stratification", { diff --git a/tests/testthat/test_mlr_resampling_custom.R b/tests/testthat/test_mlr_resampling_custom.R index cf0d833e2..c58197c54 100644 --- a/tests/testthat/test_mlr_resampling_custom.R +++ b/tests/testthat/test_mlr_resampling_custom.R @@ -1,6 +1,6 @@ test_that("custom has duplicated ids", { r = rsmp("custom") - expect_identical(r$duplicated_ids, TRUE) + expect_subset("duplicated_ids", r$properties) }) test_that("custom_cv accepts external factor", { @@ -15,13 +15,13 @@ test_that("custom_cv accepts external factor", { expect_length(ccv$instance, 3) expect_length(ccv$train_set(3), 6) - expect_identical(ccv$duplicated_ids, FALSE) + expect_disjunct("duplicated_ids", ccv$properties) }) test_that("custom_cv accepts task feature", { task = tsk("german_credit") ccv = rsmp("custom_cv") - expect_identical(ccv$duplicated_ids, FALSE) + expect_disjunct("duplicated_ids", ccv$properties) ccv$instantiate(task, f = task$data(cols = "job")[[1L]]) expect_class(ccv$instance, "list") diff --git a/tests/testthat/test_mlr_resampling_cv.R b/tests/testthat/test_mlr_resampling_cv.R index e55f26998..b3715aacf 100644 --- a/tests/testthat/test_mlr_resampling_cv.R +++ b/tests/testthat/test_mlr_resampling_cv.R @@ -1,6 +1,6 @@ test_that("cv has no duplicated ids", { r = rsmp("cv") - expect_identical(r$duplicated_ids, FALSE) + expect_disjunct("duplicated_ids", r$properties) }) test_that("split into evenly sized groups", { diff --git a/tests/testthat/test_mlr_resampling_holdout.R b/tests/testthat/test_mlr_resampling_holdout.R index 4ca3e160c..94f97ee0f 100644 --- a/tests/testthat/test_mlr_resampling_holdout.R +++ b/tests/testthat/test_mlr_resampling_holdout.R @@ -1,6 +1,6 @@ test_that("holdout has no duplicated ids", { r = rsmp("holdout") - expect_identical(r$duplicated_ids, FALSE) + expect_disjunct("duplicated_ids", r$properties) }) test_that("stratification", { diff --git a/tests/testthat/test_mlr_resampling_loo.R b/tests/testthat/test_mlr_resampling_loo.R index 970ffe12f..bcbd6faac 100644 --- a/tests/testthat/test_mlr_resampling_loo.R +++ b/tests/testthat/test_mlr_resampling_loo.R @@ -1,6 +1,6 @@ test_that("loo has no duplicated ids", { r = rsmp("loo") - expect_identical(r$duplicated_ids, FALSE) + expect_disjunct("duplicated_ids", r$properties) }) test_that("stratification", { diff --git a/tests/testthat/test_mlr_resampling_repeated_cv.R b/tests/testthat/test_mlr_resampling_repeated_cv.R index 8399a3c26..4b0e1f78a 100644 --- a/tests/testthat/test_mlr_resampling_repeated_cv.R +++ b/tests/testthat/test_mlr_resampling_repeated_cv.R @@ -1,6 +1,6 @@ test_that("repeated cv has no duplicated ids", { r = rsmp("repeated_cv") - expect_identical(r$duplicated_ids, FALSE) + expect_disjunct("duplicated_ids", r$properties) }) test_that("folds first, then repetitions", { diff --git a/tests/testthat/test_mlr_resampling_subsampling.R b/tests/testthat/test_mlr_resampling_subsampling.R index 72f7717a8..82ed269d5 100644 --- a/tests/testthat/test_mlr_resampling_subsampling.R +++ b/tests/testthat/test_mlr_resampling_subsampling.R @@ -1,6 +1,6 @@ test_that("subsampling has no duplicated ids", { r = rsmp("subsampling") - expect_identical(r$duplicated_ids, FALSE) + expect_disjunct("duplicated_ids", r$properties) }) test_that("stratification", { diff --git a/tests/testthat/test_predict.R b/tests/testthat/test_predict.R index 1594811c1..9e4aa743b 100644 --- a/tests/testthat/test_predict.R +++ b/tests/testthat/test_predict.R @@ -51,7 +51,7 @@ test_that("missing predictions are handled gracefully / regr", { test_that("predict_newdata with weights (#519)", { task = tsk("boston_housing") - task$set_col_roles("nox", "weight") + task$set_col_roles("nox", "weights_learner") learner = lrn("regr.featureless") learner$train(task) expect_prediction(learner$predict(task)) diff --git a/tests/testthat/test_resampling_insample.R b/tests/testthat/test_resampling_insample.R index 7e078ec33..2afc90115 100644 --- a/tests/testthat/test_resampling_insample.R +++ b/tests/testthat/test_resampling_insample.R @@ -1,6 +1,6 @@ test_that("insample has no duplicated ids", { r = rsmp("insample") - expect_identical(r$duplicated_ids, FALSE) + expect_disjunct("duplicated_ids", r$properties) }) test_that("stratification", { From eef7717d211cdc856885ef8cfd6a10397ef86dfd Mon Sep 17 00:00:00 2001 From: be-marc Date: Fri, 23 Aug 2024 20:31:52 +0200 Subject: [PATCH 2/8] fix: default fallback learner --- R/Learner.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/R/Learner.R b/R/Learner.R index 757c18aa1..8439c1662 100644 --- a/R/Learner.R +++ b/R/Learner.R @@ -578,7 +578,10 @@ Learner = R6Class("Learner", private$.encapsulate = insert_named(default, rhs) if (is.null(private$.fallback)) { - self$fallback = lrn(mlr_reflections$learner_fallback[[self$task_type]], predict_type = self$predict_type) + fallback_id = mlr_reflections$learner_fallback[[self$task_type]] + if (!is.null(fallback_id)) { + self$fallback = lrn(mlr_reflections$learner_fallback[[self$task_type]], predict_type = self$predict_type) + } } }, From 6b73434ff9254b0cf0fe79ef124bf64daca6553a Mon Sep 17 00:00:00 2001 From: Bernd Bischl Date: Sat, 24 Aug 2024 11:56:21 +0200 Subject: [PATCH 3/8] MeasureRegrRSQ: add comment line for doccing --- R/MeasureRegrRSQ.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/MeasureRegrRSQ.R b/R/MeasureRegrRSQ.R index 779b374e7..85f48cdbb 100644 --- a/R/MeasureRegrRSQ.R +++ b/R/MeasureRegrRSQ.R @@ -48,6 +48,8 @@ MeasureRegrRSQ = R6Class("MeasureRSQ", ), private = list( + # this is not included in the paramset as this flag influences properties of the learner + # so this flag should not be "dynamic state" .pred_set_mean = NULL, .score = function(prediction, task = NULL, train_set = NULL, ...) { From da4d63207084ce01bc9a56ed23c713af414432d6 Mon Sep 17 00:00:00 2001 From: Bernd Bischl Date: Sat, 24 Aug 2024 12:32:25 +0200 Subject: [PATCH 4/8] ... (#1118) --- NEWS.md | 1 + R/LearnerClassifDebug.R | 32 +++++++++++++++++-- R/LearnerRegrDebug.R | 21 ++++++++++++ man/Measure.Rd | 12 +------ man/MeasureClassif.Rd | 12 +------ man/MeasureRegr.Rd | 12 +------ man/MeasureSimilarity.Rd | 12 +------ man/mlr_learners_classif.debug.Rd | 28 ++++++++++++++++ man/mlr_learners_regr.debug.Rd | 28 ++++++++++++++++ man/mlr_measures_regr.pinball.Rd | 6 ++-- .../test_mlr_learners_classif_debug.R | 10 ++++++ tests/testthat/test_mlr_learners_regr_debug.R | 25 +++++++++++++++ 12 files changed, 151 insertions(+), 48 deletions(-) create mode 100644 tests/testthat/test_mlr_learners_regr_debug.R diff --git a/NEWS.md b/NEWS.md index ef75c8717..7ecd4feac 100644 --- a/NEWS.md +++ b/NEWS.md @@ -23,6 +23,7 @@ * feat: Added new measure `mu_auc`. * feat: Add option to calculate the mean of the true values on the train set in `msr("regr.rsq")`. * feat: Default fallback learner is set when encapsulation is activated. +* feat: Learners classif.debug and regr.debug have new methods `$importance()` and `$selected_features()` for testing, also in downstream packages # mlr3 0.20.2 diff --git a/R/LearnerClassifDebug.R b/R/LearnerClassifDebug.R index f22a0dd24..908bb7965 100644 --- a/R/LearnerClassifDebug.R +++ b/R/LearnerClassifDebug.R @@ -105,6 +105,27 @@ LearnerClassifDebug = R6Class("LearnerClassifDebug", inherit = LearnerClassif, #' Additional arguments passed to [`unmarshal_model()`]. unmarshal = function(...) { learner_unmarshal(.learner = self, ...) + }, + + #' @description + #' Returns 0 for each feature seen in training. + #' @return Named `numeric()`. + importance = function() { + if (is.null(self$model)) { + stopf("No model stored") + } + fns = self$state$feature_names + set_names(rep(0, length(fns)), fns) + }, + + #' @description + #' Always returns character(0). + #' @return `character()`. + selected_features = function() { + if (is.null(self$model)) { + stopf("No model stored") + } + character(0) } ), active = list( @@ -169,8 +190,15 @@ LearnerClassifDebug = R6Class("LearnerClassifDebug", inherit = LearnerClassif, stopf("Early stopping is only possible when a validation task is present.") } - model = list(response = as.character(sample(task$truth(), 1L)), pid = Sys.getpid(), id = UUIDgenerate(), - random_number = sample(100000, 1), iter = if (isTRUE(pv$early_stopping)) sample(pv$iter %??% 1L, 1L) else pv$iter %??% 1L + model = list( + response = as.character(sample(task$truth(), 1L)), + pid = Sys.getpid(), + id = UUIDgenerate(), + random_number = sample(100000, 1), + iter = if (isTRUE(pv$early_stopping)) + sample(pv$iter %??% 1L, 1L) + else + pv$iter %??% 1L ) if (!is.null(valid_truth)) { diff --git a/R/LearnerRegrDebug.R b/R/LearnerRegrDebug.R index 4f7ea2672..f9257282c 100644 --- a/R/LearnerRegrDebug.R +++ b/R/LearnerRegrDebug.R @@ -48,6 +48,27 @@ LearnerRegrDebug = R6Class("LearnerRegrDebug", inherit = LearnerRegr, man = "mlr3::mlr_learners_regr.debug", label = "Debug Learner for Regression" ) + }, + + #' @description + #' Returns 0 for each feature seen in training. + #' @return Named `numeric()`. + importance = function() { + if (is.null(self$model)) { + stopf("No model stored") + } + fns = self$state$feature_names + set_names(rep(0, length(fns)), fns) + }, + + #' @description + #' Always returns character(0). + #' @return `character()`. + selected_features = function() { + if (is.null(self$model)) { + stopf("No model stored") + } + character(0) } ), private = list( diff --git a/man/Measure.Rd b/man/Measure.Rd index d80dd4bb1..564c16555 100644 --- a/man/Measure.Rd +++ b/man/Measure.Rd @@ -264,22 +264,12 @@ Supported by \code{mlr3}: \itemize{ \item \code{"requires_task"} (requires the complete \link{Task}), \item \code{"requires_learner"} (requires the trained \link{Learner}), -<<<<<<< HEAD \item \code{"requires_model"} (requires the trained \link{Learner}, including the fitted model), \item \code{"requires_train_set"} (requires the training indices from the \link{Resampling}), \item \code{"na_score"} (the measure is expected to occasionally return \code{NA} or \code{NaN}), \item \code{"weights"} (support weighted scoring using sample weights from task, column role \code{weights_measure}), and \item \code{"primary_iters"} (the measure explictly handles resamplings that only use a subset of their iterations for the point estimate) -======= -\item \code{"requires_model"} (requires the trained \link{Learner}, including the fitted -model), -\item \code{"requires_train_set"} (requires the training indices from the \link{Resampling}), and -\item \code{"na_score"} (the measure is expected to occasionally return \code{NA} or \code{NaN}). -\item \code{"primary_iters"} (the measure explictly handles resamplings that only use a subset -of their iterations for the point estimate). -\item \code{"requires_no_prediction"} (No prediction is required; This usually means that the -measure extracts some information from the learner state.). ->>>>>>> main +\item \code{"requires_no_prediction"} (No prediction is required; This usually means that the measure extracts some information from the learner state.). }} \item{\code{predict_type}}{(\code{character(1)})\cr diff --git a/man/MeasureClassif.Rd b/man/MeasureClassif.Rd index ef59fa8e0..74b03382e 100644 --- a/man/MeasureClassif.Rd +++ b/man/MeasureClassif.Rd @@ -133,22 +133,12 @@ Supported by \code{mlr3}: \itemize{ \item \code{"requires_task"} (requires the complete \link{Task}), \item \code{"requires_learner"} (requires the trained \link{Learner}), -<<<<<<< HEAD \item \code{"requires_model"} (requires the trained \link{Learner}, including the fitted model), \item \code{"requires_train_set"} (requires the training indices from the \link{Resampling}), \item \code{"na_score"} (the measure is expected to occasionally return \code{NA} or \code{NaN}), \item \code{"weights"} (support weighted scoring using sample weights from task, column role \code{weights_measure}), and \item \code{"primary_iters"} (the measure explictly handles resamplings that only use a subset of their iterations for the point estimate) -======= -\item \code{"requires_model"} (requires the trained \link{Learner}, including the fitted -model), -\item \code{"requires_train_set"} (requires the training indices from the \link{Resampling}), and -\item \code{"na_score"} (the measure is expected to occasionally return \code{NA} or \code{NaN}). -\item \code{"primary_iters"} (the measure explictly handles resamplings that only use a subset -of their iterations for the point estimate). -\item \code{"requires_no_prediction"} (No prediction is required; This usually means that the -measure extracts some information from the learner state.). ->>>>>>> main +\item \code{"requires_no_prediction"} (No prediction is required; This usually means that the measure extracts some information from the learner state.). }} \item{\code{predict_type}}{(\code{character(1)})\cr diff --git a/man/MeasureRegr.Rd b/man/MeasureRegr.Rd index a38ba9755..b533e2b10 100644 --- a/man/MeasureRegr.Rd +++ b/man/MeasureRegr.Rd @@ -133,22 +133,12 @@ Supported by \code{mlr3}: \itemize{ \item \code{"requires_task"} (requires the complete \link{Task}), \item \code{"requires_learner"} (requires the trained \link{Learner}), -<<<<<<< HEAD \item \code{"requires_model"} (requires the trained \link{Learner}, including the fitted model), \item \code{"requires_train_set"} (requires the training indices from the \link{Resampling}), \item \code{"na_score"} (the measure is expected to occasionally return \code{NA} or \code{NaN}), \item \code{"weights"} (support weighted scoring using sample weights from task, column role \code{weights_measure}), and \item \code{"primary_iters"} (the measure explictly handles resamplings that only use a subset of their iterations for the point estimate) -======= -\item \code{"requires_model"} (requires the trained \link{Learner}, including the fitted -model), -\item \code{"requires_train_set"} (requires the training indices from the \link{Resampling}), and -\item \code{"na_score"} (the measure is expected to occasionally return \code{NA} or \code{NaN}). -\item \code{"primary_iters"} (the measure explictly handles resamplings that only use a subset -of their iterations for the point estimate). -\item \code{"requires_no_prediction"} (No prediction is required; This usually means that the -measure extracts some information from the learner state.). ->>>>>>> main +\item \code{"requires_no_prediction"} (No prediction is required; This usually means that the measure extracts some information from the learner state.). }} \item{\code{predict_type}}{(\code{character(1)})\cr diff --git a/man/MeasureSimilarity.Rd b/man/MeasureSimilarity.Rd index 4aafb7f2a..f55bed160 100644 --- a/man/MeasureSimilarity.Rd +++ b/man/MeasureSimilarity.Rd @@ -147,22 +147,12 @@ Supported by \code{mlr3}: \itemize{ \item \code{"requires_task"} (requires the complete \link{Task}), \item \code{"requires_learner"} (requires the trained \link{Learner}), -<<<<<<< HEAD \item \code{"requires_model"} (requires the trained \link{Learner}, including the fitted model), \item \code{"requires_train_set"} (requires the training indices from the \link{Resampling}), \item \code{"na_score"} (the measure is expected to occasionally return \code{NA} or \code{NaN}), \item \code{"weights"} (support weighted scoring using sample weights from task, column role \code{weights_measure}), and \item \code{"primary_iters"} (the measure explictly handles resamplings that only use a subset of their iterations for the point estimate) -======= -\item \code{"requires_model"} (requires the trained \link{Learner}, including the fitted -model), -\item \code{"requires_train_set"} (requires the training indices from the \link{Resampling}), and -\item \code{"na_score"} (the measure is expected to occasionally return \code{NA} or \code{NaN}). -\item \code{"primary_iters"} (the measure explictly handles resamplings that only use a subset -of their iterations for the point estimate). -\item \code{"requires_no_prediction"} (No prediction is required; This usually means that the -measure extracts some information from the learner state.). ->>>>>>> main +\item \code{"requires_no_prediction"} (No prediction is required; This usually means that the measure extracts some information from the learner state.). }} \item{\code{predict_type}}{(\code{character(1)})\cr diff --git a/man/mlr_learners_classif.debug.Rd b/man/mlr_learners_classif.debug.Rd index 2f7aa5cdb..cc2ad3509 100644 --- a/man/mlr_learners_classif.debug.Rd +++ b/man/mlr_learners_classif.debug.Rd @@ -144,6 +144,8 @@ a ratio in $(0, 1)$, \code{"test"}, or \code{"predefined"}.} \item \href{#method-LearnerClassifDebug-new}{\code{LearnerClassifDebug$new()}} \item \href{#method-LearnerClassifDebug-marshal}{\code{LearnerClassifDebug$marshal()}} \item \href{#method-LearnerClassifDebug-unmarshal}{\code{LearnerClassifDebug$unmarshal()}} +\item \href{#method-LearnerClassifDebug-importance}{\code{LearnerClassifDebug$importance()}} +\item \href{#method-LearnerClassifDebug-selected_features}{\code{LearnerClassifDebug$selected_features()}} \item \href{#method-LearnerClassifDebug-clone}{\code{LearnerClassifDebug$clone()}} } } @@ -208,6 +210,32 @@ Additional arguments passed to \code{\link[=unmarshal_model]{unmarshal_model()}} } } \if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-LearnerClassifDebug-importance}{}}} +\subsection{Method \code{importance()}}{ +Returns 0 for each feature seen in training. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{LearnerClassifDebug$importance()}\if{html}{\out{
}} +} + +\subsection{Returns}{ +Named \code{numeric()}. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-LearnerClassifDebug-selected_features}{}}} +\subsection{Method \code{selected_features()}}{ +Always returns character(0). +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{LearnerClassifDebug$selected_features()}\if{html}{\out{
}} +} + +\subsection{Returns}{ +\code{character()}. +} +} +\if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-LearnerClassifDebug-clone}{}}} \subsection{Method \code{clone()}}{ diff --git a/man/mlr_learners_regr.debug.Rd b/man/mlr_learners_regr.debug.Rd index e214aa4c5..2d589304b 100644 --- a/man/mlr_learners_regr.debug.Rd +++ b/man/mlr_learners_regr.debug.Rd @@ -93,6 +93,8 @@ Other Learner: \subsection{Public methods}{ \itemize{ \item \href{#method-LearnerRegrDebug-new}{\code{LearnerRegrDebug$new()}} +\item \href{#method-LearnerRegrDebug-importance}{\code{LearnerRegrDebug$importance()}} +\item \href{#method-LearnerRegrDebug-selected_features}{\code{LearnerRegrDebug$selected_features()}} \item \href{#method-LearnerRegrDebug-clone}{\code{LearnerRegrDebug$clone()}} } } @@ -119,6 +121,32 @@ Creates a new instance of this \link[R6:R6Class]{R6} class. \if{html}{\out{
}}\preformatted{LearnerRegrDebug$new()}\if{html}{\out{
}} } +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-LearnerRegrDebug-importance}{}}} +\subsection{Method \code{importance()}}{ +Returns 0 for each feature seen in training. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{LearnerRegrDebug$importance()}\if{html}{\out{
}} +} + +\subsection{Returns}{ +Named \code{numeric()}. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-LearnerRegrDebug-selected_features}{}}} +\subsection{Method \code{selected_features()}}{ +Always returns character(0). +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{LearnerRegrDebug$selected_features()}\if{html}{\out{
}} +} + +\subsection{Returns}{ +\code{character()}. +} } \if{html}{\out{
}} \if{html}{\out{}} diff --git a/man/mlr_measures_regr.pinball.Rd b/man/mlr_measures_regr.pinball.Rd index 744ae6970..b836c7fe4 100644 --- a/man/mlr_measures_regr.pinball.Rd +++ b/man/mlr_measures_regr.pinball.Rd @@ -32,8 +32,10 @@ msr("regr.pinball") } \section{Parameters}{ - -Empty ParamSet +\tabular{llll}{ + Id \tab Type \tab Default \tab Levels \cr + use_weights \tab logical \tab FALSE \tab TRUE, FALSE \cr +} } \section{Meta Information}{ diff --git a/tests/testthat/test_mlr_learners_classif_debug.R b/tests/testthat/test_mlr_learners_classif_debug.R index a5d78d676..8af219e4f 100644 --- a/tests/testthat/test_mlr_learners_classif_debug.R +++ b/tests/testthat/test_mlr_learners_classif_debug.R @@ -81,3 +81,13 @@ test_that("marshaling", { p2 = l$marshal()$unmarshal()$predict(task) expect_equal(p1, p2) }) + +test_that("importance and selected features", { + l = lrn("classif.debug") + task = tsk("iris") + l$train(task) + expect_equal(l$selected_features(), character(0)) + expect_equal(l$importance(), set_names(rep(0, task$n_features), task$feature_names)) +}) + + diff --git a/tests/testthat/test_mlr_learners_regr_debug.R b/tests/testthat/test_mlr_learners_regr_debug.R new file mode 100644 index 000000000..55f6c4e85 --- /dev/null +++ b/tests/testthat/test_mlr_learners_regr_debug.R @@ -0,0 +1,25 @@ +# this test / files was missing, only classif.debug was unit-tested +# I added at least a few basic tests when i added methods "importance" and "selected_features" + +test_that("Simple training/predict", { + task = tsk("mtcars") + learner = lrn("regr.debug") + expect_learner(learner, task) + + prediction = learner$train(task)$predict(task) + expect_class(learner$model, "regr.debug_model") + expect_numeric(learner$model$response, len = 1L, any.missing = FALSE) + expect_numeric(prediction$response, any.missing = FALSE) +}) + + +test_that("importance and selected features", { + l = lrn("regr.debug") + task = tsk("mtcars") + l$train(task) + expect_equal(l$selected_features(), character(0)) + expect_equal(l$importance(), set_names(rep(0, task$n_features), task$feature_names)) +}) + + + From 78e102ad7ea11b967a63ee82f9e7f1d0ebbc1c48 Mon Sep 17 00:00:00 2001 From: mb706 Date: Sat, 24 Aug 2024 17:29:39 +0200 Subject: [PATCH 5/8] progress --- R/Learner.R | 42 +++++++++++++++++++++++++++++++++-- R/LearnerClassifRpart.R | 2 +- R/LearnerRegrRpart.R | 2 +- R/Task.R | 4 ++-- R/assertions.R | 10 +++++++++ R/helper.R | 7 ------ tests/testthat/test_weights.R | 16 +++++++++++++ 7 files changed, 70 insertions(+), 13 deletions(-) create mode 100644 tests/testthat/test_weights.R diff --git a/R/Learner.R b/R/Learner.R index 8439c1662..5d730a1d1 100644 --- a/R/Learner.R +++ b/R/Learner.R @@ -233,9 +233,11 @@ Learner = R6Class("Learner", self$man = assert_string(man, na.ok = TRUE) if ("weights" %in% self$properties) { - param_set = c(param_set, ps(use_weights = p_lgl(default = TRUE, tags = "train"))) + self$use_weights = "use" + } else { + self$use_weights = "ignore" } - private$.param_set = assert_param_set(param_set) + private$.param_set = param_set check_packages_installed(packages, msg = sprintf("Package '%%s' required but not installed for Learner '%s'", id)) }, @@ -465,6 +467,25 @@ Learner = R6Class("Learner", ), active = list( + #' @field use_weights (`character(1)`)\cr + #' How to use weights. + #' Settings are `"use"` `"ignore"`, and `"error"`. + #' + #' * `"use"`: use weights, as supported by the underlying `Learner`. + #' * `"ignore"`: do not use weights. + #' * `"error"`: throw an error if weights are present in the training `Task`. + #' + #' For `Learner`s with the property `"weights_learner"`, this is initialized as `"use"`. + #' For `Learner`s that do not support weights, i.e. without the `"weights_learner"` property, this is initialized as `"error"`. + #' This behaviour is to avoid cases where a user erroneously assumes that a `Learner` supports weights when it does not. + use_weights = function(rhs) { + if (!missing(rhs)) { + assert_choice(rhs, c(if ("weights" %in% self$properties) "use", "ignore", "error")) + private$.use_weights = rhs + } + private$.use_weights + }, + #' @field data_formats (`character()`)\cr #' Supported data format. Always `"data.table"`.. #' This is deprecated and will be removed in the future. @@ -622,12 +643,29 @@ Learner = R6Class("Learner", ), private = list( + .use_weights = NULL, .encapsulate = NULL, .fallback = NULL, .predict_type = NULL, .param_set = NULL, .hotstart_stack = NULL, + # retrieve weights from a task, if it has weights and if the user did not + # deactivate weight usage through `self$use_weights`. + # - `task`: Task to retrieve weights from + # - `no_weights_val`: Value to return if no weights are found (default NULL) + # return: Numeric vector of weights or `no_weights_val` (default NULL) + .get_weights = function(task, no_weights_val = NULL) { + if ("weights" %nin% self$properties) { + stop("private$.get_weights should not be used in Learners that do not have the 'weights_learner' property.") + } + if (self$use_weights == "use" && "weights_learner" %in% task$properties) { + task$weights_learner$weight + } else { + no_weights_val + } + }, + deep_clone = function(name, value) { switch(name, .param_set = value$clone(deep = TRUE), diff --git a/R/LearnerClassifRpart.R b/R/LearnerClassifRpart.R index 314917736..d35cffed5 100644 --- a/R/LearnerClassifRpart.R +++ b/R/LearnerClassifRpart.R @@ -78,7 +78,7 @@ LearnerClassifRpart = R6Class("LearnerClassifRpart", inherit = LearnerClassif, .train = function(task) { pv = self$param_set$get_values(tags = "train") names(pv) = replace(names(pv), names(pv) == "keep_model", "model") - pv = get_weights(task$weights_learner$weight, pv) + pv$weights = private$.get_weights(task) invoke(rpart::rpart, formula = task$formula(), data = task$data(), .args = pv, .opts = allow_partial_matching) }, diff --git a/R/LearnerRegrRpart.R b/R/LearnerRegrRpart.R index 5d3d66475..fd9cc344d 100644 --- a/R/LearnerRegrRpart.R +++ b/R/LearnerRegrRpart.R @@ -78,7 +78,7 @@ LearnerRegrRpart = R6Class("LearnerRegrRpart", inherit = LearnerRegr, .train = function(task) { pv = self$param_set$get_values(tags = "train") names(pv) = replace(names(pv), names(pv) == "keep_model", "model") - pv = get_weights(task$weights_learner$weight, pv) + pv$weights = private$.get_weights(task) invoke(rpart::rpart, formula = task$formula(), data = task$data(), .args = pv, .opts = allow_partial_matching) }, diff --git a/R/Task.R b/R/Task.R index 79e6c6787..151fdda4b 100644 --- a/R/Task.R +++ b/R/Task.R @@ -886,7 +886,7 @@ Task = R6Class("Task", #' #' * `"strata"`: The task is resampled using one or more stratification variables (role `"stratum"`). #' * `"groups"`: The task comes with grouping/blocking information (role `"group"`). - #' * `"weights_learnering"`: If the task has observation weights with this role, they are passed to the [Learner] during train. + #' * `"weights_learner"`: If the task has observation weights with this role, they are passed to the [Learner] during train. #' The use of weights can be disabled via by setting the learner's hyperparameter `use_weights` to `FALSE`. #' * `"weights_measure"`: If the task has observation weights with this role, they are passed to the [Measure] for weighted scoring. #' The use of weights can be disabled via by setting the measure's hyperparameter `use_weights` to `FALSE`. @@ -945,7 +945,7 @@ Task = R6Class("Task", #' For each resampling iteration, observations of the same group will be exclusively assigned to be either in the training set or in the test set. #' Not more than a single column can be associated with this role. #' * `"stratum"`: Stratification variables. Multiple discrete columns may have this role. - #' * `"weights_learnering"`: If the task has observation weights with this role, they are passed to the [Learner] during train. + #' * `"weights_learner"`: If the task has observation weights with this role, they are passed to the [Learner] during train. #' The use of weights can be disabled via by setting the learner's hyperparameter `use_weights` to `FALSE`. #' * `"weights_measure"`: If the task has observation weights with this role, they are passed to the [Measure] for weighted scoring. #' The use of weights can be disabled via by setting the measure's hyperparameter `use_weights` to `FALSE`. diff --git a/R/assertions.R b/R/assertions.R index e20a95184..cfc798e30 100644 --- a/R/assertions.R +++ b/R/assertions.R @@ -160,6 +160,16 @@ assert_learnable = function(task, learner) { if (task$task_type == "unsupervised") { stopf("%s cannot be trained with %s", learner$format(), task$format()) } + # we only need to check whether the learner wants to error on weights in training, + # since weights_learner are always ignored during prediction. + if (learner$use_weights == "error" && "weights_learner" %in% task$properties) { + stopf("%s cannot be trained with weights in %s%s", learner$format(), task$format(), + if ("weights_learner" %in% learner$properties) { + " since 'use_weights' was set to 'error'." + } else { + " since the Learner does not support weights.\nYou may set 'use_weights' to 'ignore' if you want the Learner to ignore weights." + }) + } assert_task_learner(task, learner) } diff --git a/R/helper.R b/R/helper.R index e5c930924..dd3d67158 100644 --- a/R/helper.R +++ b/R/helper.R @@ -67,13 +67,6 @@ assert_validate = function(x) { assert_choice(x, c("predefined", "test"), null.ok = TRUE) } -get_weights = function(weights, pv, id = "weights") { - if (!is.null(weights) && pv$use_weights) { - pv[[id]] = weights - } - pv$use_weights = NULL - pv -} get_obs_loss = function(tab, measures) { for (measure in measures) { diff --git a/tests/testthat/test_weights.R b/tests/testthat/test_weights.R new file mode 100644 index 000000000..ab384f7e2 --- /dev/null +++ b/tests/testthat/test_weights.R @@ -0,0 +1,16 @@ + + + +task = TaskRegr$new("foo", as_data_backend(cbind(iris, data.frame(w = rep(c(1, 10, 100), each = 50)))), target = "Sepal.Length") +task$set_col_roles("w", character()) +learner = lrn("regr.rpart", use_weights = "use") + +learner$train(task) +p1 = learner$predict(task) + +task$set_col_roles("w", "weights_learner") +learner$train(task) +p2 = learner$predict(task) + +expect_lt(p1$score(), p2$score()) + From 37baa67a349a59b31f2ef4d5bbc4dc32f01b9310 Mon Sep 17 00:00:00 2001 From: Bernd Bischl Date: Wed, 28 Aug 2024 19:38:28 +0200 Subject: [PATCH 6/8] ... --- R/Learner.R | 2 +- man/Learner.Rd | 13 +++++++++++++ man/Task.Rd | 4 ++-- man/mlr_learners_classif.rpart.Rd | 1 - man/mlr_learners_regr.rpart.Rd | 1 - tests/testthat/test_Learner.R | 14 +++++++++----- tests/testthat/test_benchmark.R | 18 +++++++++--------- .../testthat/test_mlr_learners_classif_rpart.R | 9 +++------ tests/testthat/test_mlr_learners_regr_rpart.R | 15 --------------- 9 files changed, 37 insertions(+), 40 deletions(-) diff --git a/R/Learner.R b/R/Learner.R index 5d730a1d1..4cb365767 100644 --- a/R/Learner.R +++ b/R/Learner.R @@ -235,7 +235,7 @@ Learner = R6Class("Learner", if ("weights" %in% self$properties) { self$use_weights = "use" } else { - self$use_weights = "ignore" + self$use_weights = "error" } private$.param_set = param_set diff --git a/man/Learner.Rd b/man/Learner.Rd index 123d397b6..3802365c5 100644 --- a/man/Learner.Rd +++ b/man/Learner.Rd @@ -262,6 +262,19 @@ Defaults to \code{NA}, but can be set by child classes.} \section{Active bindings}{ \if{html}{\out{
}} \describe{ +\item{\code{use_weights}}{(\code{character(1)})\cr +How to use weights. +Settings are \code{"use"} \code{"ignore"}, and \code{"error"}. +\itemize{ +\item \code{"use"}: use weights, as supported by the underlying \code{Learner}. +\item \code{"ignore"}: do not use weights. +\item \code{"error"}: throw an error if weights are present in the training \code{Task}. +} + +For \code{Learner}s with the property \code{"weights_learner"}, this is initialized as \code{"use"}. +For \code{Learner}s that do not support weights, i.e. without the \code{"weights_learner"} property, this is initialized as \code{"error"}. +This behaviour is to avoid cases where a user erroneously assumes that a \code{Learner} supports weights when it does not.} + \item{\code{data_formats}}{(\code{character()})\cr Supported data format. Always \code{"data.table"}.. This is deprecated and will be removed in the future.} diff --git a/man/Task.Rd b/man/Task.Rd index 1b7ac925c..22db9179c 100644 --- a/man/Task.Rd +++ b/man/Task.Rd @@ -195,7 +195,7 @@ The following properties are currently standardized and understood by tasks in \ \itemize{ \item \code{"strata"}: The task is resampled using one or more stratification variables (role \code{"stratum"}). \item \code{"groups"}: The task comes with grouping/blocking information (role \code{"group"}). -\item \code{"weights_learnering"}: If the task has observation weights with this role, they are passed to the \link{Learner} during train. +\item \code{"weights_learner"}: If the task has observation weights with this role, they are passed to the \link{Learner} during train. The use of weights can be disabled via by setting the learner's hyperparameter \code{use_weights} to \code{FALSE}. \item \code{"weights_measure"}: If the task has observation weights with this role, they are passed to the \link{Measure} for weighted scoring. The use of weights can be disabled via by setting the measure's hyperparameter \code{use_weights} to \code{FALSE}. @@ -227,7 +227,7 @@ Columns must be sortable with \code{\link[=order]{order()}}. For each resampling iteration, observations of the same group will be exclusively assigned to be either in the training set or in the test set. Not more than a single column can be associated with this role. \item \code{"stratum"}: Stratification variables. Multiple discrete columns may have this role. -\item \code{"weights_learnering"}: If the task has observation weights with this role, they are passed to the \link{Learner} during train. +\item \code{"weights_learner"}: If the task has observation weights with this role, they are passed to the \link{Learner} during train. The use of weights can be disabled via by setting the learner's hyperparameter \code{use_weights} to \code{FALSE}. \item \code{"weights_measure"}: If the task has observation weights with this role, they are passed to the \link{Measure} for weighted scoring. The use of weights can be disabled via by setting the measure's hyperparameter \code{use_weights} to \code{FALSE}. diff --git a/man/mlr_learners_classif.rpart.Rd b/man/mlr_learners_classif.rpart.Rd index 776993ed8..9f7232549 100644 --- a/man/mlr_learners_classif.rpart.Rd +++ b/man/mlr_learners_classif.rpart.Rd @@ -55,7 +55,6 @@ lrn("classif.rpart") surrogatestyle \tab integer \tab 0 \tab \tab \eqn{[0, 1]}{[0, 1]} \cr usesurrogate \tab integer \tab 2 \tab \tab \eqn{[0, 2]}{[0, 2]} \cr xval \tab integer \tab 10 \tab \tab \eqn{[0, \infty)}{[0, Inf)} \cr - use_weights \tab logical \tab TRUE \tab TRUE, FALSE \tab - \cr } } diff --git a/man/mlr_learners_regr.rpart.Rd b/man/mlr_learners_regr.rpart.Rd index 0c6283561..81af8a259 100644 --- a/man/mlr_learners_regr.rpart.Rd +++ b/man/mlr_learners_regr.rpart.Rd @@ -55,7 +55,6 @@ lrn("regr.rpart") surrogatestyle \tab integer \tab 0 \tab \tab \eqn{[0, 1]}{[0, 1]} \cr usesurrogate \tab integer \tab 2 \tab \tab \eqn{[0, 2]}{[0, 2]} \cr xval \tab integer \tab 10 \tab \tab \eqn{[0, \infty)}{[0, Inf)} \cr - use_weights \tab logical \tab TRUE \tab TRUE, FALSE \tab - \cr } } diff --git a/tests/testthat/test_Learner.R b/tests/testthat/test_Learner.R index bd98bc901..edd4e1c16 100644 --- a/tests/testthat/test_Learner.R +++ b/tests/testthat/test_Learner.R @@ -269,7 +269,7 @@ test_that("weights", { task = TaskClassif$new("weighted_task", data, "Species") task$set_col_roles("w", "weights_learner") - learner = lrn("classif.rpart", use_weights = TRUE) + learner = lrn("classif.rpart", use_weights = "use") learner$train(task) conf = learner$predict(task)$confusion @@ -625,8 +625,12 @@ test_that("predict time is cumulative", { expect_true(t1 > t2) }) -test_that("weight parameters is added to parameter set", { - learner = lrn("classif.rpart") - expect_subset("weights", learner$properties) - expect_subset("use_weights", learner$param_set$ids()) +test_that("weights properties and defaults", { + ll = lrn("classif.rpart") + expect_true("weights" %in% ll$properties) + expect_equal(ll$use_weights, "use") + + ll = lrn("classif.debug") + expect_true("weights" %nin% ll$properties) + expect_equal(ll$use_weights, "error") }) diff --git a/tests/testthat/test_benchmark.R b/tests/testthat/test_benchmark.R index 1c123fc4f..9584ea660 100644 --- a/tests/testthat/test_benchmark.R +++ b/tests/testthat/test_benchmark.R @@ -180,24 +180,24 @@ test_that("custom resampling (#245)", { expect_data_table(design, nrows = 1) }) -test_that("extract params", { - # some params, some not +test_that("extract params in aggregate and score", { + # set params differently in a few learners lrns = list( - lrn("classif.rpart", id = "rp1", xval = 0, use_weights = FALSE), - lrn("classif.rpart", id = "rp2", xval = 0, use_weights = FALSE, cp = 0.2, minsplit = 2), - lrn("classif.rpart", id = "rp3", xval = 0, use_weights = FALSE, cp = 0.1) + lrn("classif.rpart", id = "rp1", xval = 0), + lrn("classif.rpart", id = "rp2", xval = 0, cp = 0.2, minsplit = 2), + lrn("classif.rpart", id = "rp3", xval = 0, cp = 0.1) ) bmr = benchmark(benchmark_grid(tsk("wine"), lrns, rsmp("cv", folds = 3))) aggr = bmr$aggregate(params = TRUE) setorder(aggr, "learner_id") - expect_list(aggr$params[[1]], names = "unique", len = 2L) - expect_list(aggr$params[[2]], names = "unique", len = 4L) - expect_list(aggr$params[[3]], names = "unique", len = 3L) + expect_list(aggr$params[[1]], names = "unique", len = 1L) + expect_list(aggr$params[[2]], names = "unique", len = 3L) + expect_list(aggr$params[[3]], names = "unique", len = 2L) scores = bmr$score() pvs = map(scores$learner, function(l) l$param_set$values) expect_true(all(sapply(split(lengths(pvs), scores$nr), uniqueN) == 1)) - expect_set_equal(lengths(pvs), 2:4) + expect_set_equal(lengths(pvs), 1:3) # only one params lrns = mlr_learners$mget("classif.featureless") diff --git a/tests/testthat/test_mlr_learners_classif_rpart.R b/tests/testthat/test_mlr_learners_classif_rpart.R index 7adf40d04..11b494408 100644 --- a/tests/testthat/test_mlr_learners_classif_rpart.R +++ b/tests/testthat/test_mlr_learners_classif_rpart.R @@ -36,18 +36,15 @@ test_that("selected_features", { expect_subset(sf, task$feature_names, empty.ok = FALSE) }) -test_that("use_weights parameter works", { +test_that("use_weights actually influences the model", { task = TaskClassif$new("foo", as_data_backend(cbind(iris, data.frame(w = rep(c(1, 10, 100), each = 50)))), target = "Species") task$set_col_roles("w", "weights_learner") - learner = lrn("classif.rpart", use_weights = TRUE) - + learner = lrn("classif.rpart", use_weights = "use") learner$train(task) c1 = learner$predict(task)$confusion - - learner = lrn("classif.rpart", use_weights = FALSE) + learner = lrn("classif.rpart", use_weights = "ignore") learner$train(task) c2 = learner$predict(task)$confusion - expect_false(all(c1 == c2)) }) diff --git a/tests/testthat/test_mlr_learners_regr_rpart.R b/tests/testthat/test_mlr_learners_regr_rpart.R index 4056e7342..edf8a50e4 100644 --- a/tests/testthat/test_mlr_learners_regr_rpart.R +++ b/tests/testthat/test_mlr_learners_regr_rpart.R @@ -36,21 +36,6 @@ test_that("selected_features", { expect_subset(sf, task$feature_names, empty.ok = FALSE) }) -test_that("weights", { - task = TaskRegr$new("foo", as_data_backend(cbind(iris, data.frame(w = rep(c(1, 10, 100), each = 50)))), target = "Sepal.Length") - task$set_col_roles("w", character()) - learner = lrn("regr.rpart", use_weights = TRUE) - - learner$train(task) - p1 = learner$predict(task) - - task$set_col_roles("w", "weights_learner") - learner$train(task) - p2 = learner$predict(task) - - expect_lt(p1$score(), p2$score()) -}) - test_that("default_values on rpart", { learner = lrn("regr.rpart") search_space = ps( From c32bc21fbeeeaec4b2a92d800a25d04186cf9747 Mon Sep 17 00:00:00 2001 From: Bernd Bischl Date: Wed, 28 Aug 2024 19:43:07 +0200 Subject: [PATCH 7/8] ... --- inst/testthat/helper_autotest.R | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/inst/testthat/helper_autotest.R b/inst/testthat/helper_autotest.R index 31eb5ad4b..eb9421322 100644 --- a/inst/testthat/helper_autotest.R +++ b/inst/testthat/helper_autotest.R @@ -236,13 +236,8 @@ run_experiment = function(task, learner, seed = NULL, configure_learner = NULL) # enable weights if ("weights" %in% learner$properties) { - # learner must have flag "use_weights" - msg = checkmate::check_subset("use_weights", learner$param_set$ids(), empty.ok = FALSE) - if (!isTRUE(msg)) { - return(err(msg)) - } - if ("weights_learner" %in% task$properties) { - learner$param_set$values$use_weights = TRUE + if (learner$use_weights != "use") { + return(err("use_weights != 'use' for learner with property 'weights' on init!")) } } From e70f0281d2362abf7ea53593b34a2b7f7de8a9f3 Mon Sep 17 00:00:00 2001 From: Bernd Bischl Date: Wed, 28 Aug 2024 20:27:17 +0200 Subject: [PATCH 8/8] ... --- inst/testthat/helper_autotest.R | 6 ++++++ tests/testthat/test_predict.R | 13 ++++++++----- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/inst/testthat/helper_autotest.R b/inst/testthat/helper_autotest.R index eb9421322..11b2060a3 100644 --- a/inst/testthat/helper_autotest.R +++ b/inst/testthat/helper_autotest.R @@ -235,10 +235,16 @@ run_experiment = function(task, learner, seed = NULL, configure_learner = NULL) stage = "train()" # enable weights + # the next lines are maybe not strictly necessary, but test that the defaults are + # what they should be if ("weights" %in% learner$properties) { if (learner$use_weights != "use") { return(err("use_weights != 'use' for learner with property 'weights' on init!")) } + } else { + if (learner$use_weights != "error") { + return(err("use_weights != 'error' for learner without property 'weights' on init!")) + } } ok = try(learner$train(task), silent = TRUE) diff --git a/tests/testthat/test_predict.R b/tests/testthat/test_predict.R index 9e4aa743b..fedaff89b 100644 --- a/tests/testthat/test_predict.R +++ b/tests/testthat/test_predict.R @@ -50,16 +50,19 @@ test_that("missing predictions are handled gracefully / regr", { test_that("predict_newdata with weights (#519)", { + # we had a problem where predict did not work if weights were present in the task + # especially with the "predict_newdata" function task = tsk("boston_housing") task$set_col_roles("nox", "weights_learner") - learner = lrn("regr.featureless") + learner = lrn("regr.rpart") learner$train(task) - expect_prediction(learner$predict(task)) - # w/o weights + # predict with different API calls + # normal predict on the task + expect_prediction(learner$predict(task)) + # w/o weights in the new-df expect_prediction(learner$predict_newdata(task$data())) - - # w weights + # w weights in the new-df expect_prediction(learner$predict_newdata(task$data(cols = c(task$target_names, task$feature_names, "nox")))) })