Skip to content

Commit

Permalink
Merge pull request #1233 from tidymodels/phase-out-factor-checking-in…
Browse files Browse the repository at this point in the history
…step_dummy
  • Loading branch information
EmilHvitfeldt authored Nov 2, 2023
2 parents fec8b92 + 9f41c62 commit a63ad80
Show file tree
Hide file tree
Showing 7 changed files with 23 additions and 52 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

* Documentation for tidy methods for all steps has been added when missing and improved to describe the return value more accurately. (#936)

* `step_dummy()` will now error if passed character instead of loudly ignoring them. Only applicable when setting `strings_as_factors = FALSE`. (#1233)

* Fixed bug where `tidy.step_cut()` always returned zero row tibbles for trained recipes. (#1229)

* It is now documented that `step_spline_b()` can be made periodic. (#1223)
Expand Down
33 changes: 3 additions & 30 deletions R/dummy.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#' Create traditional dummy variables
#'
#' `step_dummy()` creates a *specification* of a recipe step that will convert
#' nominal data (e.g. characters or factors) into one or more numeric binary
#' model terms corresponding to the levels of the original data.
#' nominal data (e.g. factors) into one or more numeric binary model terms
#' corresponding to the levels of the original data.
#'
#' @inheritParams step_pca
#' @inheritParams step_center
Expand Down Expand Up @@ -171,11 +171,9 @@ step_dummy_new <-
#' @export
prep.step_dummy <- function(x, training, info = NULL, ...) {
col_names <- recipes_eval_select(x$terms, training, info)
check_type(training[, col_names], types = c("string", "factor", "ordered"))
check_type(training[, col_names], types = c("factor", "ordered"))

if (length(col_names) > 0) {
col_names <- check_factor_vars(training, col_names, "step_dummy")

## I hate doing this but currently we are going to have
## to save the terms object from the original (= training)
## data
Expand Down Expand Up @@ -221,31 +219,6 @@ prep.step_dummy <- function(x, training, info = NULL, ...) {
)
}

check_factor_vars <- function(data, col_names, step_name, call = caller_env()) {
fac_check <- vapply(data[, col_names], is.factor, logical(1))
if (any(!fac_check)) {
rlang::warn(
paste0(
"The following variables are not factor vectors and will be ignored: ",
paste0("`", names(fac_check)[!fac_check], "`", collapse = ", ")
)
)
}
col_names <- col_names[fac_check]
if (length(col_names) == 0) {
rlang::abort(
paste0(
"The `terms` argument in `",
step_name,
"` did not select ",
"any factor columns."
),
call = call
)
}
col_names
}

warn_new_levels <- function(dat, lvl, details = NULL) {
ind <- which(!(dat %in% lvl))
if (length(ind) > 0) {
Expand Down
4 changes: 1 addition & 3 deletions R/dummy_extract.R
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
#' \item{columns}{character, names of resulting columns}
#' \item{id}{character, id of this step}
#' }
#'
#'
#' The return value is ordered according to the frequency of `columns` entries in the training data set.
#'
#' @template case-weights-unsupervised
Expand Down Expand Up @@ -168,8 +168,6 @@ prep.step_dummy_extract <- function(x, training, info = NULL, ...) {
}

if (length(col_names) > 0) {
col_names <- check_factor_vars(training, col_names, "step_dummy_extract")

levels <- vector(mode = "list", length = length(col_names))
names(levels) <- col_names
for (col_name in col_names) {
Expand Down
4 changes: 2 additions & 2 deletions man/step_dummy.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 7 additions & 8 deletions tests/testthat/_snaps/dummy.md
Original file line number Diff line number Diff line change
@@ -1,23 +1,22 @@
# dummy variables with non-factor inputs
# dummy variables errors with character inputs

Code
prep(dummy, training = sacr, verbose = FALSE, strings_as_factors = FALSE)
Condition
Warning:
The following variables are not factor vectors and will be ignored: `city`, `zip`
Error in `step_dummy()`:
Caused by error in `prep()`:
! The `terms` argument in `step_dummy` did not select any factor columns.
x All columns selected for the step should be factor or ordered.
* 2 string variables found: `city` and `zip`

---
# check_type() is used

Code
recipe(sqft ~ zip + price + city, data = sacr_fac_ish) %>% step_dummy(city, zip,
price) %>% prep(training = sacr_fac_ish, verbose = FALSE, strings_as_factors = FALSE)
recipe(sqft ~ zip + price + city, data = sacr) %>% step_dummy(city, zip, price) %>%
prep()
Condition
Error in `step_dummy()`:
Caused by error in `prep()`:
x All columns selected for the step should be string, factor, or ordered.
x All columns selected for the step should be factor or ordered.
* 1 integer variable found: `price`

# tests for NA values in factor
Expand Down
2 changes: 1 addition & 1 deletion tests/testthat/_snaps/misc.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
Code
conditionMessage(attr(res, "condition"))
Output
[1] "Error in `step_dummy()`:\nCaused by error in `prep()`:\nx All columns selected for the step should be string, factor, or ordered.\n* 11 double variables found: `mpg`, `cyl`, `disp`, `hp`, ..."
[1] "Error in `step_dummy()`:\nCaused by error in `prep()`:\nx All columns selected for the step should be factor or ordered.\n* 11 double variables found: `mpg`, `cyl`, `disp`, `hp`, ..."

# check_training_set errors are thrown

Expand Down
15 changes: 7 additions & 8 deletions tests/testthat/test-dummy.R
Original file line number Diff line number Diff line change
Expand Up @@ -65,23 +65,22 @@ test_that("dummy variables with factor inputs", {
)
})

test_that("dummy variables with non-factor inputs", {
test_that("dummy variables errors with character inputs", {
rec <- recipe(sqft ~ zip + city, data = sacr)
dummy <- rec %>% step_dummy(city, zip)

expect_snapshot(error = TRUE,
expect_snapshot(
error = TRUE,
prep(dummy, training = sacr, verbose = FALSE, strings_as_factors = FALSE)
)
})

sacr_fac_ish <-
sacr_fac %>%
mutate(city = as.character(city))

test_that("check_type() is used", {
expect_snapshot(
error = TRUE,
recipe(sqft ~ zip + price + city, data = sacr_fac_ish) %>%
recipe(sqft ~ zip + price + city, data = sacr) %>%
step_dummy(city, zip, price) %>%
prep(training = sacr_fac_ish, verbose = FALSE, strings_as_factors = FALSE)
prep()
)
})

Expand Down

0 comments on commit a63ad80

Please sign in to comment.