From 6afc57a2c0db2213ceeef3cf332470edcaafe0b4 Mon Sep 17 00:00:00 2001
From: Mathias Hauser
Date: Mon, 12 Aug 2024 16:06:28 +0200
Subject: [PATCH 1/6] more stable yeo johnson roundtrips

---
 mesmer/stats/_power_transformer.py   | 16 +++++++++-------
 tests/unit/test_power_transformer.py | 20 ++++++++++++++++++--
 2 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/mesmer/stats/_power_transformer.py b/mesmer/stats/_power_transformer.py
index 0f74c146..c824cfc4 100644
--- a/mesmer/stats/_power_transformer.py
+++ b/mesmer/stats/_power_transformer.py
@@ -57,7 +57,7 @@ def _yeo_johnson_transform_np(data, lambdas):

     eps = np.finfo(np.float64).eps

-    transformed = np.zeros_like(data)
+    transformed = np.empty_like(data)
     # get positions of four cases:
     # NOTE: this code is copied from sklearn's PowerTransformer, see
     # https://github.com/scikit-learn/scikit-learn/blob/8721245511de2f225ff5f9aa5f5fadce663cd4a3/sklearn/preprocessing/_data.py#L3396
@@ -69,10 +69,11 @@ def _yeo_johnson_transform_np(data, lambdas):

     # assign values for the four cases
     transformed[sel_a] = np.log1p(data[sel_a])
-    transformed[sel_b] = (np.power(data[sel_b] + 1, lambdas[sel_b]) - 1) / lambdas[
+    transformed[sel_b] = (np.expm1(np.log1p(data[sel_b]) * lambdas[sel_b])) / lambdas[
         sel_b
     ]
-    transformed[sel_c] = -(np.power(-data[sel_c] + 1, 2 - lambdas[sel_c]) - 1) / (
+
+    transformed[sel_c] = -(np.expm1(np.log1p(-data[sel_c]) * (2 - lambdas[sel_c]))) / (
         2 - lambdas[sel_c]
     )
     transformed[sel_d] = -np.log1p(-data[sel_d])
@@ -104,7 +105,7 @@ def _yeo_johnson_inverse_transform_np(data, lambdas):

     eps = np.finfo(np.float64).eps

-    transformed = np.zeros_like(data)
+    transformed = np.empty_like(data)
     # get positions of four cases:
     pos_a = (data >= 0) & (np.abs(lambdas) < eps)
     pos_b = (data >= 0) & (np.abs(lambdas) >= eps)
@@ -114,10 +115,11 @@ def _yeo_johnson_inverse_transform_np(data, lambdas):

     # assign values for the four cases
     transformed[pos_a] = np.exp(data[pos_a]) - 1
     transformed[pos_b] = (
-        np.power(data[pos_b] * lambdas[pos_b] + 1, 1 / lambdas[pos_b]) - 1
+        np.exp(np.log1p(data[pos_b] * lambdas[pos_b]) / lambdas[pos_b]) - 1
     )
-    transformed[pos_c] = 1 - np.power(
-        (lambdas[pos_c] - 2) * data[pos_c] + 1, 1 / (2 - lambdas[pos_c])
+
+    transformed[pos_c] = 1 - np.exp(
+        np.log1p(-(2 - lambdas[pos_c]) * data[pos_c]) / (2 - lambdas[pos_c])
     )
     transformed[pos_d] = 1 - np.exp(-data[pos_d])
diff --git a/tests/unit/test_power_transformer.py b/tests/unit/test_power_transformer.py
index 0b2e3e31..96d23dc2 100644
--- a/tests/unit/test_power_transformer.py
+++ b/tests/unit/test_power_transformer.py
@@ -119,7 +119,7 @@ def test_yeo_johnson_transform_np_sklearn():
     pt_sklearn.lambdas_ = np.array([2.0])
     expected = pt_sklearn.transform(monthly_residuals.reshape(-1, 1))

-    np.testing.assert_equal(result, expected.reshape(-1))
+    np.testing.assert_allclose(result, expected.reshape(-1))


 def test_transform_roundtrip():
@@ -137,6 +137,22 @@ def test_transform_roundtrip():
     np.testing.assert_allclose(result, monthly_residuals, atol=1e-7)


+@pytest.mark.parametrize("value", (-2, -1, 0, 1, 2))
+@pytest.mark.parametrize("lmbda", (0, 1, 2))
+@pytest.mark.parametrize("lambda_delta", (-4e-16, 0, 4e-16))
+def test_transform_roundtrip_special_cases(value, lmbda, lambda_delta):
+    # test yeo_johnson for lambdas close to the boundary points
+
+    x = np.array([value], dtype=float)
+
+    lambdas = np.array([lmbda + lambda_delta], dtype=float)
+
+    transformed = _yeo_johnson_transform_np(x, lambdas)
+    result = _yeo_johnson_inverse_transform_np(transformed, lambdas)
+
+    np.testing.assert_allclose(result, x, atol=1e-7)
+
+
 def test_yeo_johnson_inverse_transform_np_sklearn():
     # test if our inverse power transform is the same as sklearn's for constant lambda
     np.random.seed(0)
@@ -151,7 +167,7 @@ def test_yeo_johnson_inverse_transform_np_sklearn():
     pt_sklearn.lambdas_ = np.array([2.0])
     expected = pt_sklearn.inverse_transform(monthly_residuals.reshape(-1, 1))

-    np.testing.assert_equal(result, expected.reshape(-1))
+    np.testing.assert_allclose(result, expected.reshape(-1))


 def test_yeo_johnson_optimize_lambda_sklearn():
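Note on PATCH 1/6: the rewrite rests on the identity
(1 + x)**lmbda - 1 == expm1(lmbda * log1p(x)). Below is a minimal sketch of
the cancellation this avoids; it is illustrative only and not part of the
patch, with x and lmbda chosen to sit just above eps, mirroring the
lambda_delta values in the new test:

import numpy as np

x = 2.0
lmbda = 4e-16  # > np.finfo(np.float64).eps, so the power branch (sel_b) is taken

# old formulation: (1 + x)**lmbda rounds to a float within a few eps of 1.0,
# and the subsequent "- 1" keeps nothing but that rounding error
naive = (np.power(x + 1, lmbda) - 1) / lmbda

# new formulation: expm1/log1p never form the cancellation-prone intermediate
stable = np.expm1(np.log1p(x) * lmbda) / lmbda

print(naive)   # ~1.11 on a typical float64 setup -- about 1 % high
print(stable)  # ~1.0986... == np.log1p(2), the correct lmbda -> 0 limit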
From c31eaff6a24dd21688951348c81506bef2807bc5 Mon Sep 17 00:00:00 2001
From: Mathias Hauser
Date: Mon, 12 Aug 2024 16:11:33 +0200
Subject: [PATCH 2/6] revert unrelated change

---
 mesmer/stats/_power_transformer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mesmer/stats/_power_transformer.py b/mesmer/stats/_power_transformer.py
index c824cfc4..5bc40c98 100644
--- a/mesmer/stats/_power_transformer.py
+++ b/mesmer/stats/_power_transformer.py
@@ -57,7 +57,7 @@ def _yeo_johnson_transform_np(data, lambdas):

     eps = np.finfo(np.float64).eps

-    transformed = np.empty_like(data)
+    transformed = np.zeros_like(data)
     # get positions of four cases:
     # NOTE: this code is copied from sklearn's PowerTransformer, see
     # https://github.com/scikit-learn/scikit-learn/blob/8721245511de2f225ff5f9aa5f5fadce663cd4a3/sklearn/preprocessing/_data.py#L3396
@@ -105,7 +105,7 @@ def _yeo_johnson_inverse_transform_np(data, lambdas):

     eps = np.finfo(np.float64).eps

-    transformed = np.empty_like(data)
+    transformed = np.zeros_like(data)
     # get positions of four cases:
     pos_a = (data >= 0) & (np.abs(lambdas) < eps)
     pos_b = (data >= 0) & (np.abs(lambdas) >= eps)

From 5da62a60ce80978d6de969da260896f2bcc6d9ac Mon Sep 17 00:00:00 2001
From: Mathias Hauser
Date: Mon, 12 Aug 2024 16:13:23 +0200
Subject: [PATCH 3/6] default allclose tolerance

---
 tests/unit/test_power_transformer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/test_power_transformer.py b/tests/unit/test_power_transformer.py
index 96d23dc2..cfb369ce 100644
--- a/tests/unit/test_power_transformer.py
+++ b/tests/unit/test_power_transformer.py
@@ -150,7 +150,7 @@ def test_transform_roundtrip_special_cases(value, lmbda, lambda_delta):
     transformed = _yeo_johnson_transform_np(x, lambdas)
     result = _yeo_johnson_inverse_transform_np(transformed, lambdas)

-    np.testing.assert_allclose(result, x, atol=1e-7)
+    np.testing.assert_allclose(result, x)


 def test_yeo_johnson_inverse_transform_np_sklearn():

From ef314c368b0955fb8bd781c35655f137db0b9c84 Mon Sep 17 00:00:00 2001
From: Mathias Hauser
Date: Mon, 12 Aug 2024 16:17:08 +0200
Subject: [PATCH 4/6] changelog

---
 CHANGELOG.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index fcd124b9..5f365610 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -90,6 +90,8 @@ Yeo-Johnson power transformer
   `#455 <https://github.com/MESMER-group/mesmer/pull/455>`_)
 - adjust the first guess to assume the data is normally distributed (
   `#429 <https://github.com/MESMER-group/mesmer/pull/429>`_)
+- make (back-) transformations more stable by using `np.expm1` and `np.log1p`
+ (`#494 <https://github.com/MESMER-group/mesmer/pull/494>`_)
 - rewrite power transformer to work with xarray, and refactor from a class
   structure to functions (
   `#442 <https://github.com/MESMER-group/mesmer/pull/442>`_, and `#474 <https://github.com/MESMER-group/mesmer/pull/474>`_)

From 3f8ac3f31e1c516ee3aa2940f0e06b17b4b28573 Mon Sep 17 00:00:00 2001
From: Mathias Hauser
Date: Mon, 12 Aug 2024 16:17:19 +0200
Subject: [PATCH 5/6] comment on change

---
 tests/unit/test_power_transformer.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/unit/test_power_transformer.py b/tests/unit/test_power_transformer.py
index cfb369ce..18f6611f 100644
--- a/tests/unit/test_power_transformer.py
+++ b/tests/unit/test_power_transformer.py
@@ -119,6 +119,7 @@ def test_yeo_johnson_transform_np_sklearn():
     pt_sklearn.lambdas_ = np.array([2.0])
     expected = pt_sklearn.transform(monthly_residuals.reshape(-1, 1))

+    # only approximately the same due to #494
     np.testing.assert_allclose(result, expected.reshape(-1))


@@ -167,6 +168,7 @@ def test_yeo_johnson_inverse_transform_np_sklearn():
     pt_sklearn.lambdas_ = np.array([2.0])
     expected = pt_sklearn.inverse_transform(monthly_residuals.reshape(-1, 1))

+    # only approximately the same due to #494
     np.testing.assert_allclose(result, expected.reshape(-1))


From 3c12f014f69050c4c130bdbce6706055f1f683e0 Mon Sep 17 00:00:00 2001
From: Mathias Hauser
Date: Mon, 12 Aug 2024 17:26:44 +0200
Subject: [PATCH 6/6] changelog: fix indentation

---
 CHANGELOG.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 5f365610..8e583391 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -90,8 +90,8 @@ Yeo-Johnson power transformer
   `#455 <https://github.com/MESMER-group/mesmer/pull/455>`_)
 - adjust the first guess to assume the data is normally distributed (
   `#429 <https://github.com/MESMER-group/mesmer/pull/429>`_)
-- make (back-) transformations more stable by using `np.expm1` and `np.log1p`
- (`#494 <https://github.com/MESMER-group/mesmer/pull/494>`_)
+- make (back-) transformations more stable by using `np.expm1` and `np.log1p`
+  (`#494 <https://github.com/MESMER-group/mesmer/pull/494>`_)
 - rewrite power transformer to work with xarray, and refactor from a class
   structure to functions (
   `#442 <https://github.com/MESMER-group/mesmer/pull/442>`_, and `#474 <https://github.com/MESMER-group/mesmer/pull/474>`_)
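For reference, a round-trip sketch of the case-b formulas used in PATCH 1/6.
This is a standalone re-implementation for illustration, not a call into the
mesmer functions; it mirrors test_transform_roundtrip_special_cases with
value=2, lmbda=0, lambda_delta=4e-16:

import numpy as np

x = 2.0
lmbda = 4e-16  # just outside the |lmbda| < eps window, as in the new test

# forward transform, case sel_b: ((1 + x)**lmbda - 1) / lmbda
y = np.expm1(np.log1p(x) * lmbda) / lmbda

# inverse transform, case pos_b: (y * lmbda + 1)**(1 / lmbda) - 1
x_back = np.exp(np.log1p(y * lmbda) / lmbda) - 1

np.testing.assert_allclose(x_back, x)  # passes at assert_allclose's defaults

With the old np.power formulation the same round trip comes back visibly
wrong (roughly 2.035 instead of 2.0 on a typical float64 setup); the
expm1/log1p forms recover x to near machine precision, which is why PATCH 3/6
can drop the atol=1e-7 in favour of the default tolerances.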