diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index fcd124b9..8e583391 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -90,6 +90,8 @@ Yeo-Johnson power transformer
   `#455 `_)
 - adjust the first guess to assume the data is normally distributed (
   `#429 `_)
+- make (back-) transformations more stable by using `np.expm1` and `np.log1p`
+  (`#494 `_)
 - rewrite power transformer to work with xarray, and refactor from a class
   structure to functions (
   `#442 `_, and `#474 `_)
diff --git a/mesmer/stats/_power_transformer.py b/mesmer/stats/_power_transformer.py
index 0f74c146..5bc40c98 100644
--- a/mesmer/stats/_power_transformer.py
+++ b/mesmer/stats/_power_transformer.py
@@ -69,10 +69,11 @@ def _yeo_johnson_transform_np(data, lambdas):
 
     # assign values for the four cases
     transformed[sel_a] = np.log1p(data[sel_a])
-    transformed[sel_b] = (np.power(data[sel_b] + 1, lambdas[sel_b]) - 1) / lambdas[
+    transformed[sel_b] = (np.expm1(np.log1p(data[sel_b]) * lambdas[sel_b])) / lambdas[
         sel_b
     ]
-    transformed[sel_c] = -(np.power(-data[sel_c] + 1, 2 - lambdas[sel_c]) - 1) / (
+
+    transformed[sel_c] = -(np.expm1(np.log1p(-data[sel_c]) * (2 - lambdas[sel_c]))) / (
         2 - lambdas[sel_c]
     )
     transformed[sel_d] = -np.log1p(-data[sel_d])
@@ -114,10 +115,11 @@ def _yeo_johnson_inverse_transform_np(data, lambdas):
 
     # assign values for the four cases
     transformed[pos_a] = np.exp(data[pos_a]) - 1
     transformed[pos_b] = (
-        np.power(data[pos_b] * lambdas[pos_b] + 1, 1 / lambdas[pos_b]) - 1
+        np.exp(np.log1p(data[pos_b] * lambdas[pos_b]) / lambdas[pos_b]) - 1
     )
-    transformed[pos_c] = 1 - np.power(
-        (lambdas[pos_c] - 2) * data[pos_c] + 1, 1 / (2 - lambdas[pos_c])
+
+    transformed[pos_c] = 1 - np.exp(
+        np.log1p(-(2 - lambdas[pos_c]) * data[pos_c]) / (2 - lambdas[pos_c])
     )
     transformed[pos_d] = 1 - np.exp(-data[pos_d])
diff --git a/tests/unit/test_power_transformer.py b/tests/unit/test_power_transformer.py
index 0b2e3e31..18f6611f 100644
--- a/tests/unit/test_power_transformer.py
+++ b/tests/unit/test_power_transformer.py
@@ -119,7 +119,8 @@ def test_yeo_johnson_transform_np_sklearn():
     pt_sklearn.lambdas_ = np.array([2.0])
     expected = pt_sklearn.transform(monthly_residuals.reshape(-1, 1))
 
-    np.testing.assert_equal(result, expected.reshape(-1))
+    # only approximately the same due to #494
+    np.testing.assert_allclose(result, expected.reshape(-1))
 
 
 def test_transform_roundtrip():
@@ -137,6 +138,22 @@ def test_transform_roundtrip():
     np.testing.assert_allclose(result, monthly_residuals, atol=1e-7)
 
 
+@pytest.mark.parametrize("value", (-2, -1, 0, 1, 2))
+@pytest.mark.parametrize("lmbda", (0, 1, 2))
+@pytest.mark.parametrize("lambda_delta", (-4e-16, 0, 4e-16))
+def test_transform_roundtrip_special_cases(value, lmbda, lambda_delta):
+    # test yeo_johnson for lambdas close to the boundary points
+
+    x = np.array([value], dtype=float)
+
+    lambdas = np.array([lmbda + lambda_delta], dtype=float)
+
+    transformed = _yeo_johnson_transform_np(x, lambdas)
+    result = _yeo_johnson_inverse_transform_np(transformed, lambdas)
+
+    np.testing.assert_allclose(result, x)
+
+
 def test_yeo_johnson_inverse_transform_np_sklearn():
     # test if our inverse power transform is the same as sklearn's for constant lambda
     np.random.seed(0)
@@ -151,7 +168,8 @@ def test_yeo_johnson_inverse_transform_np_sklearn():
     pt_sklearn.lambdas_ = np.array([2.0])
     expected = pt_sklearn.inverse_transform(monthly_residuals.reshape(-1, 1))
 
-    np.testing.assert_equal(result, expected.reshape(-1))
+    # only approximately the same due to #494
+    np.testing.assert_allclose(result, expected.reshape(-1))
 
 
 def test_yeo_johnson_optimize_lambda_sklearn():
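
Note (addendum, not part of the patch): the expm1/log1p rewrite matters because the
direct formula `(x + 1)**lambda - 1` cancels catastrophically when `lambda` is within
a few ulps of the case boundaries (0 and 2), which is exactly the regime the new
`test_transform_roundtrip_special_cases` probes. A minimal standalone sketch of the
effect, assuming only NumPy (the function names below are illustrative, not from the
codebase):

    import numpy as np

    def yj_pos_naive(x, lmbda):
        # direct formula for the x >= 0, lambda != 0 branch
        return (np.power(x + 1, lmbda) - 1) / lmbda

    def yj_pos_stable(x, lmbda):
        # formulation used in the patch: expm1(lambda * log1p(x)) / lambda
        return np.expm1(np.log1p(x) * lmbda) / lmbda

    x, lmbda = 2.0, 4e-16  # lambda one step away from the lambda == 0 branch

    # naive: (1 + ~4.4e-16) - 1 keeps only the last bit of the exponentiation,
    # giving roughly 1.1102 instead of the limit log1p(2) ~= 1.0986 (about 1% off)
    print(yj_pos_naive(x, lmbda))
    print(yj_pos_stable(x, lmbda))  # ~1.0986, matching np.log1p(2.0)

Since sklearn's PowerTransformer still uses the power-based formula, results agree
with it only to within floating-point tolerance, hence the switch from
`assert_equal` to `assert_allclose` in the tests above.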