Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add warning message when scaling steps return NaN #1252

Merged
merged 9 commits into from
Nov 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@

* When errors are thrown about wrongly typed input to steps, the offending variables and their types are now listed. (#1217)

* Added warnings when `step_scale()`, `step_normalize()`, `step_center()` or `step_range()` result in `NaN` columns. (@mastoffel, #1221)

# recipes 1.0.8

## Improvements
Expand Down
7 changes: 7 additions & 0 deletions R/center.R
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,13 @@ prep.step_center <- function(x, training, info = NULL, ...) {

means <- averages(training[, col_names], wts, na_rm = x$na_rm)

inf_cols <- col_names[is.infinite(means)]
if (length(inf_cols) > 0) {
cli::cli_warn(
"Column{?s} {.var {inf_cols}} returned NaN. \\
Consider avoiding `Inf` values before normalising.")
}

step_center_new(
terms = x$terms,
role = x$role,
Expand Down
10 changes: 10 additions & 0 deletions R/normalize.R
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,16 @@ sd_check <- function(x) {
)
x[zero_sd] <- 1
}

na_sd <- which(is.na(x))
if (length(na_sd) > 0) {
cli::cli_warn(
"Column{?s} {.var {names(na_sd)}} returned NaN, because variance \\
cannot be calculated and scaling cannot be used. Consider avoiding \\
`Inf` or `-Inf` values and/or setting `na_rm = TRUE` before \\
normalizing."
)
}
x
}

Expand Down
15 changes: 15 additions & 0 deletions R/range.R
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,21 @@ prep.step_range <- function(x, training, info = NULL, ...) {
vapply(training[, col_names], min, c(min = 0), na.rm = TRUE)
maxs <-
vapply(training[, col_names], max, c(max = 0), na.rm = TRUE)

inf_cols <- col_names[is.infinite(mins) | is.infinite(maxs)]
if (length(inf_cols) > 0) {
cli::cli_warn(
"Column{?s} {.var {inf_cols}} returned NaN. \\
Consider avoiding `Inf` values before normalising.")
}
zero_range_cols <- col_names[maxs - mins == 0]
if (length(zero_range_cols) > 0) {
cli::cli_warn(
"Column{?s} {.var {zero_range_cols}} returned NaN. Consider using \\
`step_zv()` to remove variables containing only a single value."
)
}

step_range_new(
terms = x$terms,
role = x$role,
Expand Down
42 changes: 42 additions & 0 deletions tests/testthat/_snaps/center.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,48 @@
-- Operations
* Centering for: cyl, disp, hp, drat, qsec, vs, ... | Trained, ignored weights

# warns when NaN is returned due to Inf or -Inf

Code
prep(rec)
Condition
Warning:
Column `x` returned NaN. Consider avoiding `Inf` values before normalising.
Message

-- Recipe ----------------------------------------------------------------------

-- Inputs
Number of variables by role
predictor: 1

-- Training information
Training data contained 4 data points and no incomplete rows.

-- Operations
* Centering for: x | Trained

---

Code
prep(rec)
Condition
Warning:
Column `x` returned NaN. Consider avoiding `Inf` values before normalising.
Message

-- Recipe ----------------------------------------------------------------------

-- Inputs
Number of variables by role
predictor: 1

-- Training information
Training data contained 4 data points and no incomplete rows.

-- Operations
* Centering for: x | Trained

# empty printing

Code
Expand Down
51 changes: 51 additions & 0 deletions tests/testthat/_snaps/normalize.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
# na_rm argument works for step_normalize

Code
rec_no_na_rm <- recipe(~., data = mtcars_na) %>% step_normalize(all_predictors(),
na_rm = FALSE) %>% prep()
Condition
Warning:
Columns `mpg`, `cyl`, `disp`, and `hp` returned NaN, because variance cannot be calculated and scaling cannot be used. Consider avoiding `Inf` or `-Inf` values and/or setting `na_rm = TRUE` before normalizing.

# warns on zv

Code
Expand Down Expand Up @@ -61,6 +70,48 @@
* Centering and scaling for: cyl, disp, hp, drat, ... | Trained, ignored
weights

# warns when NaN is returned due to Inf or -Inf

Code
prep(rec)
Condition
Warning:
Column `x` returned NaN, because variance cannot be calculated and scaling cannot be used. Consider avoiding `Inf` or `-Inf` values and/or setting `na_rm = TRUE` before normalizing.
Message

-- Recipe ----------------------------------------------------------------------

-- Inputs
Number of variables by role
predictor: 1

-- Training information
Training data contained 4 data points and no incomplete rows.

-- Operations
* Centering and scaling for: x | Trained

---

Code
prep(rec)
Condition
Warning:
Column `x` returned NaN, because variance cannot be calculated and scaling cannot be used. Consider avoiding `Inf` or `-Inf` values and/or setting `na_rm = TRUE` before normalizing.
Message

-- Recipe ----------------------------------------------------------------------

-- Inputs
Number of variables by role
predictor: 1

-- Training information
Training data contained 4 data points and no incomplete rows.

-- Operations
* Centering and scaling for: x | Trained

# empty printing

Code
Expand Down
63 changes: 63 additions & 0 deletions tests/testthat/_snaps/range.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,66 @@
# warns when NaN is returned due to zero variance

Code
prep(rec)
Condition
Warning:
Column `x` returned NaN. Consider using `step_zv()` to remove variables containing only a single value.
Message

-- Recipe ----------------------------------------------------------------------

-- Inputs
Number of variables by role
predictor: 1

-- Training information
Training data contained 10 data points and no incomplete rows.

-- Operations
* Range scaling to [0,1] for: x | Trained

# warns when NaN is returned due to Inf or -Inf

Code
prep(rec)
Condition
Warning:
Column `x` returned NaN. Consider avoiding `Inf` values before normalising.
Message

-- Recipe ----------------------------------------------------------------------

-- Inputs
Number of variables by role
predictor: 1

-- Training information
Training data contained 4 data points and no incomplete rows.

-- Operations
* Range scaling to [0,1] for: x | Trained

---

Code
prep(rec)
Condition
Warning:
Column `x` returned NaN. Consider avoiding `Inf` values before normalising.
Message

-- Recipe ----------------------------------------------------------------------

-- Inputs
Number of variables by role
predictor: 1

-- Training information
Training data contained 4 data points and no incomplete rows.

-- Operations
* Range scaling to [0,1] for: x | Trained

# empty printing

Code
Expand Down
32 changes: 32 additions & 0 deletions tests/testthat/_snaps/scale.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,15 @@
Warning:
Scaling `factor` should take either a value of 1 or 2

# na_rm argument works for step_scale

Code
rec_no_na_rm <- recipe(~., data = mtcars_na) %>% step_scale(all_predictors(),
na_rm = FALSE) %>% prep()
Condition
Warning:
Columns `mpg`, `cyl`, `disp`, and `hp` returned NaN, because variance cannot be calculated and scaling cannot be used. Consider avoiding `Inf` or `-Inf` values and/or setting `na_rm = TRUE` before normalizing.

# warns on zv

Code
Expand All @@ -29,6 +38,29 @@
-- Operations
* Scaling for: carbon, hydrogen, oxygen, nitrogen, sulfur, ... | Trained

# warns when NaN is returned

Code
prep(rec1)
Condition
Warning:
Column `sulfur` returned NaN, because variance cannot be calculated and scaling cannot be used. Consider avoiding `Inf` or `-Inf` values and/or setting `na_rm = TRUE` before normalizing.
Message

-- Recipe ----------------------------------------------------------------------

-- Inputs
Number of variables by role
outcome: 1
predictor: 5

-- Training information
Training data contained 536 data points and no incomplete rows.

-- Operations
* Log transformation on: sulfur | Trained
* Scaling for: sulfur | Trained

# scaling with case weights

Code
Expand Down
8 changes: 8 additions & 0 deletions tests/testthat/_snaps/update-role-requirements.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,14 @@
! Can't update the `bake` requirement of the "outcome" role.
i The "outcome" role is never required at `bake()` time.

# will still error if a step actually used a role that set `bake = FALSE`

Code
rec <- prep(rec, df)
Condition
Warning:
Column `x` returned NaN, because variance cannot be calculated and scaling cannot be used. Consider avoiding `Inf` or `-Inf` values and/or setting `na_rm = TRUE` before normalizing.

# can update `bake` requirements after prepping

Code
Expand Down
9 changes: 9 additions & 0 deletions tests/testthat/test-center.R
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,15 @@ test_that("centering with case weights", {
expect_snapshot(rec)
})

# Prepping `step_center()` on a column that contains `Inf` or `-Inf` should
# raise the NaN warning; each snapshot records the warning text and the
# printed, trained recipe. `%>%` is used instead of `|>` to match the other
# tests and to keep the file parseable on R versions older than 4.1.
test_that("warns when NaN is returned due to Inf or -Inf", {
  # Positive infinity in the training column.
  rec <- recipe(~., data = data.frame(x = c(2, 3, 4, Inf))) %>%
    step_center(x)
  expect_snapshot(prep(rec))

  # Negative infinity in the training column.
  rec <- recipe(~., data = data.frame(x = c(2, 3, 4, -Inf))) %>%
    step_center(x)
  expect_snapshot(prep(rec))
})
# Infrastructure ---------------------------------------------------------------

test_that("bake method errors when needed non-standard role columns are missing", {
Expand Down
18 changes: 15 additions & 3 deletions tests/testthat/test-normalize.R
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,11 @@ test_that("na_rm argument works for step_normalize", {
mtcars_na <- mtcars
mtcars_na[1, 1:4] <- NA

rec_no_na_rm <- recipe(~., data = mtcars_na) %>%
step_normalize(all_predictors(), na_rm = FALSE) %>%
prep()
expect_snapshot(
rec_no_na_rm <- recipe(~., data = mtcars_na) %>%
step_normalize(all_predictors(), na_rm = FALSE) %>%
prep()
)

rec_na_rm <- recipe(~., data = mtcars_na) %>%
step_normalize(all_predictors(), na_rm = TRUE) %>%
Expand Down Expand Up @@ -140,6 +142,16 @@ test_that("normalizing with case weights", {
expect_snapshot(rec)
})

# Prepping `step_normalize()` on a column that contains `Inf` or `-Inf`
# should raise the NaN warning; each snapshot records the warning text and
# the printed, trained recipe. `%>%` is used instead of `|>` to match the
# other tests in this file and to stay parseable on R versions older than 4.1.
test_that("warns when NaN is returned due to Inf or -Inf", {
  # Positive infinity in the training column.
  rec <- recipe(~., data = data.frame(x = c(2, 3, 4, Inf))) %>%
    step_normalize(x)
  expect_snapshot(prep(rec))

  # Negative infinity in the training column.
  rec <- recipe(~., data = data.frame(x = c(2, 3, 4, -Inf))) %>%
    step_normalize(x)
  expect_snapshot(prep(rec))
})

# Infrastructure ---------------------------------------------------------------

test_that("bake method errors when needed non-standard role columns are missing", {
Expand Down
16 changes: 16 additions & 0 deletions tests/testthat/test-range.R
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,22 @@ test_that("backwards compatibility for before clipping <= 1.0.2 (#1090)", {
expect_equal(exp_pred, obs_pred)
})

# A constant column has max - min == 0, so prepping `step_range()` on it
# should raise the warning that suggests `step_zv()`; the snapshot records
# the warning text and the printed, trained recipe. `%>%` is used instead of
# `|>` to match the other tests and to stay parseable on R older than 4.1.
test_that("warns when NaN is returned due to zero variance", {
  rec <- recipe(~., data = data.frame(x = rep(1, 10))) %>%
    step_range(x)
  expect_snapshot(prep(rec))
})

# Prepping `step_range()` on a column whose min or max is infinite should
# raise the NaN warning; each snapshot records the warning text and the
# printed, trained recipe. `%>%` is used instead of `|>` to match the other
# tests and to stay parseable on R versions older than 4.1.
test_that("warns when NaN is returned due to Inf or -Inf", {
  # Positive infinity makes the column maximum infinite.
  rec <- recipe(~., data = data.frame(x = c(2, 3, 4, Inf))) %>%
    step_range(x)
  expect_snapshot(prep(rec))

  # Negative infinity makes the column minimum infinite.
  rec <- recipe(~., data = data.frame(x = c(2, 3, 4, -Inf))) %>%
    step_range(x)
  expect_snapshot(prep(rec))
})

# Infrastructure ---------------------------------------------------------------

test_that("bake method errors when needed non-standard role columns are missing", {
Expand Down
9 changes: 9 additions & 0 deletions tests/testthat/test-scale.R
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,11 @@ test_that("na_rm argument works for step_scale", {
mtcars_na <- mtcars
mtcars_na[1, 1:4] <- NA

expect_snapshot({
rec_no_na_rm <- recipe(~., data = mtcars_na) %>%
step_scale(all_predictors(), na_rm = FALSE) %>%
prep()
})

rec_na_rm <- recipe(~., data = mtcars_na) %>%
step_scale(all_predictors(), na_rm = TRUE) %>%
Expand All @@ -115,6 +117,13 @@ test_that("warns on zv",{
expect_snapshot(prep(rec1))
})

# Snapshot the warning raised when `step_scale()` runs after `step_log()` on
# `sulfur`. `rec` is defined outside this block.
# NOTE(review): presumably `sulfur` contains zeros, so the log step yields
# `-Inf` and the scale step cannot compute a variance -- confirm against the
# data `rec` is built from.
test_that("warns when NaN is returned",{
rec1 <- rec %>%
step_log(sulfur) %>%
step_scale(sulfur)
# The snapshot stores the literal call text `prep(rec1)`, so the variable
# name must stay in sync with the recorded snapshot.
expect_snapshot(prep(rec1))
})

test_that("scaling with case weights", {
mtcars_freq <- mtcars
mtcars_freq$cyl <- frequency_weights(mtcars_freq$cyl)
Expand Down
Loading
Loading