From d8e84673c815db422f7198fb056215084ef5131e Mon Sep 17 00:00:00 2001 From: veni-vidi-vici-dormivi Date: Thu, 3 Oct 2024 16:45:47 +0200 Subject: [PATCH 1/3] first test for distrib cov --- mesmer/mesmer_x/train_l_distrib_mesmerx.py | 13 +- tests/unit/test_mesmer_x_distrib_cov.py | 144 +++++++++++++++++++++ 2 files changed, 152 insertions(+), 5 deletions(-) create mode 100644 tests/unit/test_mesmer_x_distrib_cov.py diff --git a/mesmer/mesmer_x/train_l_distrib_mesmerx.py b/mesmer/mesmer_x/train_l_distrib_mesmerx.py index 2d6fe767..41c762cd 100644 --- a/mesmer/mesmer_x/train_l_distrib_mesmerx.py +++ b/mesmer/mesmer_x/train_l_distrib_mesmerx.py @@ -37,7 +37,7 @@ def wrapper(*args, **kwargs): return wrapper - +# TODO: would want to switch this, have a distrib class that takes xarrays and have a training func that potentially works on xarrays def xr_train_distrib( predictors, target, @@ -293,11 +293,13 @@ def __init__( ---------- data_targ : numpy array 1D Sample of the target for fit of a conditional distribution + Normally the timeseries of the target at one gridpoint. data_pred : dict of 1D vectors Covariates for the conditional distribution. Each key must be the exact name of the inputs used in 'expr_fit', and the values must be aligned with the values in 'data_targ'. + Normally the timeseries of the global mean predictor. expr_fit : class 'expression' Expression to train. The string provided to the class can be found in @@ -353,7 +355,8 @@ def __init__( * type_fun_optim: string, default: "NLL" If 'NLL', will optimize using the negative log likelihood. If 'fcNLL', will use the full conditional negative log likelihood based on the - stopping rule. + stopping rule. The arguments `threshold_stopping_rule`, `ind_year_thres` + and `exclude_trigger` only apply to 'fcNLL'. * weighted_NLL: boolean, default: False If True, the optimization function will based on the weighted sum of the @@ -614,8 +617,8 @@ def __init__( ): raise ValueError( "Lack of consistency on the options 'type_fun_optim'," - " 'threshold_stopping_rule' and 'ind_year_thres', not sure if the" - " stopping rule will be employed" + " 'threshold_stopping_rule' and 'ind_year_thres', threshold_stopping_rule", + "and 'ind_year_thres' must be used together, and only for 'fcNLL'", ) def get_weights(self, n_bins_density=40): @@ -649,7 +652,7 @@ def _get_weights_nll(self, n_bins_density=40): # interpolating over whole region gmt_hist, edges = np.histogramdd(sample=tmp, bins=bins.T) - gmt_bins_center = [0.5 * (edge[1:] + edges[:-1]) for edge in edges] + gmt_bins_center = [0.5 * (edge[1:] + edge[:-1]) for edge in edges] interp = RegularGridInterpolator(points=gmt_bins_center, values=gmt_hist) weights_driver = 1 / interp(tmp) # inverse of density diff --git a/tests/unit/test_mesmer_x_distrib_cov.py b/tests/unit/test_mesmer_x_distrib_cov.py new file mode 100644 index 00000000..8f258942 --- /dev/null +++ b/tests/unit/test_mesmer_x_distrib_cov.py @@ -0,0 +1,144 @@ +from coverage import data +from networkx.algorithms import threshold +import numpy as np +import pytest +import scipy as sp +from toolz import first +import xarray as xr + +import mesmer +from mesmer.mesmer_x import distrib_cov, Expression + +def test_distrib_cov_init_all_default(): + rng = np.random.default_rng(0) + n = 250 + pred = np.linspace(0, 1, n) + targ = rng.normal(loc=2*pred, scale=0.1, size=n) + + expression = Expression("norm(loc=c1 * __tas__, scale=c2)", expr_name="exp1") + + dist = distrib_cov(targ, {"tas": pred}, expression) + + np.testing.assert_equal(dist.data_targ, targ) + np.testing.assert_equal(dist.data_pred, {"tas": pred}) + np.testing.assert_equal(dist.weights_driver, np.ones(n) / n) + assert dist.n_sample == n + assert dist.expr_fit == expression + assert dist.add_test == False + assert dist.data_targ_addtest is None + assert dist.data_preds_addtest is None + assert dist.threshold_min_proba == 1e-09 + assert dist.boundaries_params == expression.boundaries_parameters + assert dist.boundaries_coeffs == {} + assert dist.first_guess == None + assert dist.func_first_guess == None + assert dist.n_coeffs == 2 + assert dist.scores_fit == ["func_optim", "NLL", "BIC"] + assert dist.xtol_req == 1e-06 + assert dist.ftol_req == 1e-06 + assert dist.maxiter == 1000 * dist.n_coeffs * np.log(dist.n_coeffs) + assert dist.maxfev == 1000 * dist.n_coeffs * np.log(dist.n_coeffs) + assert dist.method_fit == "Powell" + assert dist.name_ftol == "ftol" + assert dist.name_xtol == "xtol" + assert dist.error_failedfit == False + assert dist.fg_with_global_opti == False + assert dist.weighted_NLL == False + assert dist.type_fun_optim == "NLL" + assert dist.threshold_stopping_rule == None + assert dist.exclude_trigger == None + assert dist.ind_year_thres == None + +def test_distrib_cov_init(): + rng = np.random.default_rng(0) + n = 250 + pred = np.linspace(0, 1, n) + targ = rng.normal(loc=2*pred, scale=0.1, size=n) + + expression = Expression("norm(loc=c1 * __tas__, scale=c2)", expr_name="exp1") + + data_targ_addtest = rng.normal(loc=2*pred, scale=0.1, size=n) + data_preds_addtest = {"tas": np.linspace(0, 0.9, n)} + threshold_min_proba = 0.1 + boundaries_params = {"loc": [-10, 10], "scale": [0, 1]} + boundaries_coeffs = {"c1": [0, 5], "c2": [0, 1]} + first_guess = np.array([1, 0.1]) + func_first_guess = None + scores_fit = ["func_optim", "NLL"] + options_optim = {"type_fun_optim": "fcNLL", + "weighted_NLL": True, + "threshold_stopping_rule": 0.1, + "ind_year_thres": 10, + "exclude_trigger": True, + } + options_solver = { + "method_fit": "Nelder-Mead", + "xtol_req": 0.1, + "ftol_req": 0.01, + "maxiter": 10_000, + "maxfev": 12_000, + "error_failedfit": True, + "fg_with_global_opti": True, + } + + dist = distrib_cov(targ, {"tas": pred}, expression, + data_targ_addtest=data_targ_addtest, + data_preds_addtest=data_preds_addtest, + threshold_min_proba=threshold_min_proba, + boundaries_params=boundaries_params, + boundaries_coeffs=boundaries_coeffs, + first_guess=first_guess, + func_first_guess=func_first_guess, + scores_fit=scores_fit, + options_optim=options_optim, + options_solver=options_solver) + + np.testing.assert_equal(dist.data_targ, targ) + np.testing.assert_equal(dist.data_pred, {"tas": pred}) + np.testing.assert_equal(dist.weights_driver, dist.get_weights()) + # np.testing.assert_equal(dist.weights_driver, dist._get_weights_nll()) # WHY NOT??? + np.testing.assert_equal(dist.first_guess, first_guess) + assert dist.n_sample == n + assert dist.expr_fit == expression + assert dist.add_test == True + assert dist.data_targ_addtest is data_targ_addtest + assert dist.data_preds_addtest is data_preds_addtest + assert dist.threshold_min_proba == threshold_min_proba + assert dist.boundaries_params == boundaries_params + assert dist.boundaries_coeffs == boundaries_coeffs + assert dist.func_first_guess == None + assert dist.n_coeffs == 2 + assert dist.scores_fit == scores_fit + assert dist.xtol_req == 0.1 + assert dist.ftol_req == 0.01 + assert dist.maxiter == 10_000 + assert dist.maxfev == 12_000 + assert dist.method_fit == "Nelder-Mead" + assert dist.name_ftol == "fatol" + assert dist.name_xtol == "xatol" + assert dist.error_failedfit == True + assert dist.fg_with_global_opti == True + assert dist.weighted_NLL == True + assert dist.type_fun_optim == "fcNLL" + assert dist.threshold_stopping_rule == 0.1 + assert dist.exclude_trigger == True + assert dist.ind_year_thres == 10 + +def test_distrib_cov_init_errors(): + expression = Expression("norm(loc=c1 * __tas__, scale=c2)", expr_name="exp1") + + with pytest.raises(ValueError, match = "NaN or infinite values in target of fit"): + distrib_cov(np.array([1, 2, np.nan]), {"tas": np.array([1, 2, 3])}, expression) + + with pytest.raises(ValueError, match = "NaN or infinite values in target of fit"): + distrib_cov(np.array([1, 2, np.inf]), {"tas": np.array([1, 2, 3])}, expression) + + with pytest.raises(ValueError, match = "NaN or infinite values in predictors of fit"): + distrib_cov(np.array([1, 2, 3]), {"tas": np.array([1, 2, np.nan])}, expression) + + with pytest.raises(ValueError, match = "NaN or infinite values in predictors of fit"): + distrib_cov(np.array([1, 2, 3]), {"tas": np.array([1, 2, np.inf])}, expression) + + with pytest.raises(ValueError, match = "NaN or infinite values in predictors of fit"): + distrib_cov(np.array([1, 2, 3]), {"tas": np.array([1, 2, np.inf]), + "tas2": np.array([1,2, np.nan])}, expression) \ No newline at end of file From 55d71d3d8c2bee68503fe98d53fb72ef608ee884 Mon Sep 17 00:00:00 2001 From: veni-vidi-vici-dormivi Date: Thu, 3 Oct 2024 16:50:37 +0200 Subject: [PATCH 2/3] linting --- mesmer/mesmer_x/train_l_distrib_mesmerx.py | 1 + tests/unit/test_mesmer_x_distrib_cov.py | 127 +++++++++++---------- 2 files changed, 67 insertions(+), 61 deletions(-) diff --git a/mesmer/mesmer_x/train_l_distrib_mesmerx.py b/mesmer/mesmer_x/train_l_distrib_mesmerx.py index 41c762cd..392c8f72 100644 --- a/mesmer/mesmer_x/train_l_distrib_mesmerx.py +++ b/mesmer/mesmer_x/train_l_distrib_mesmerx.py @@ -37,6 +37,7 @@ def wrapper(*args, **kwargs): return wrapper + # TODO: would want to switch this, have a distrib class that takes xarrays and have a training func that potentially works on xarrays def xr_train_distrib( predictors, diff --git a/tests/unit/test_mesmer_x_distrib_cov.py b/tests/unit/test_mesmer_x_distrib_cov.py index 8f258942..0c48bced 100644 --- a/tests/unit/test_mesmer_x_distrib_cov.py +++ b/tests/unit/test_mesmer_x_distrib_cov.py @@ -1,19 +1,14 @@ -from coverage import data -from networkx.algorithms import threshold import numpy as np import pytest -import scipy as sp -from toolz import first -import xarray as xr -import mesmer -from mesmer.mesmer_x import distrib_cov, Expression +from mesmer.mesmer_x import Expression, distrib_cov + def test_distrib_cov_init_all_default(): rng = np.random.default_rng(0) n = 250 pred = np.linspace(0, 1, n) - targ = rng.normal(loc=2*pred, scale=0.1, size=n) + targ = rng.normal(loc=2 * pred, scale=0.1, size=n) expression = Expression("norm(loc=c1 * __tas__, scale=c2)", expr_name="exp1") @@ -24,14 +19,14 @@ def test_distrib_cov_init_all_default(): np.testing.assert_equal(dist.weights_driver, np.ones(n) / n) assert dist.n_sample == n assert dist.expr_fit == expression - assert dist.add_test == False + assert not dist.add_test assert dist.data_targ_addtest is None assert dist.data_preds_addtest is None assert dist.threshold_min_proba == 1e-09 assert dist.boundaries_params == expression.boundaries_parameters assert dist.boundaries_coeffs == {} - assert dist.first_guess == None - assert dist.func_first_guess == None + assert dist.first_guess is None + assert dist.func_first_guess is None assert dist.n_coeffs == 2 assert dist.scores_fit == ["func_optim", "NLL", "BIC"] assert dist.xtol_req == 1e-06 @@ -41,23 +36,24 @@ def test_distrib_cov_init_all_default(): assert dist.method_fit == "Powell" assert dist.name_ftol == "ftol" assert dist.name_xtol == "xtol" - assert dist.error_failedfit == False - assert dist.fg_with_global_opti == False - assert dist.weighted_NLL == False + assert not dist.error_failedfit + assert not dist.fg_with_global_opti + assert not dist.weighted_NLL assert dist.type_fun_optim == "NLL" - assert dist.threshold_stopping_rule == None - assert dist.exclude_trigger == None - assert dist.ind_year_thres == None + assert dist.threshold_stopping_rule is None + assert dist.exclude_trigger is None + assert dist.ind_year_thres is None + def test_distrib_cov_init(): rng = np.random.default_rng(0) n = 250 pred = np.linspace(0, 1, n) - targ = rng.normal(loc=2*pred, scale=0.1, size=n) + targ = rng.normal(loc=2 * pred, scale=0.1, size=n) expression = Expression("norm(loc=c1 * __tas__, scale=c2)", expr_name="exp1") - data_targ_addtest = rng.normal(loc=2*pred, scale=0.1, size=n) + data_targ_addtest = rng.normal(loc=2 * pred, scale=0.1, size=n) data_preds_addtest = {"tas": np.linspace(0, 0.9, n)} threshold_min_proba = 0.1 boundaries_params = {"loc": [-10, 10], "scale": [0, 1]} @@ -65,48 +61,53 @@ def test_distrib_cov_init(): first_guess = np.array([1, 0.1]) func_first_guess = None scores_fit = ["func_optim", "NLL"] - options_optim = {"type_fun_optim": "fcNLL", - "weighted_NLL": True, - "threshold_stopping_rule": 0.1, - "ind_year_thres": 10, - "exclude_trigger": True, - } + options_optim = { + "type_fun_optim": "fcNLL", + "weighted_NLL": True, + "threshold_stopping_rule": 0.1, + "ind_year_thres": 10, + "exclude_trigger": True, + } options_solver = { - "method_fit": "Nelder-Mead", - "xtol_req": 0.1, - "ftol_req": 0.01, - "maxiter": 10_000, - "maxfev": 12_000, - "error_failedfit": True, - "fg_with_global_opti": True, - } - - dist = distrib_cov(targ, {"tas": pred}, expression, - data_targ_addtest=data_targ_addtest, - data_preds_addtest=data_preds_addtest, - threshold_min_proba=threshold_min_proba, - boundaries_params=boundaries_params, - boundaries_coeffs=boundaries_coeffs, - first_guess=first_guess, - func_first_guess=func_first_guess, - scores_fit=scores_fit, - options_optim=options_optim, - options_solver=options_solver) + "method_fit": "Nelder-Mead", + "xtol_req": 0.1, + "ftol_req": 0.01, + "maxiter": 10_000, + "maxfev": 12_000, + "error_failedfit": True, + "fg_with_global_opti": True, + } + + dist = distrib_cov( + targ, + {"tas": pred}, + expression, + data_targ_addtest=data_targ_addtest, + data_preds_addtest=data_preds_addtest, + threshold_min_proba=threshold_min_proba, + boundaries_params=boundaries_params, + boundaries_coeffs=boundaries_coeffs, + first_guess=first_guess, + func_first_guess=func_first_guess, + scores_fit=scores_fit, + options_optim=options_optim, + options_solver=options_solver, + ) np.testing.assert_equal(dist.data_targ, targ) np.testing.assert_equal(dist.data_pred, {"tas": pred}) np.testing.assert_equal(dist.weights_driver, dist.get_weights()) # np.testing.assert_equal(dist.weights_driver, dist._get_weights_nll()) # WHY NOT??? np.testing.assert_equal(dist.first_guess, first_guess) + np.testing.assert_equal(dist.data_targ_addtest, data_targ_addtest) + np.testing.assert_equal(dist.data_preds_addtest, data_preds_addtest) assert dist.n_sample == n assert dist.expr_fit == expression - assert dist.add_test == True - assert dist.data_targ_addtest is data_targ_addtest - assert dist.data_preds_addtest is data_preds_addtest + assert dist.add_test # is True assert dist.threshold_min_proba == threshold_min_proba assert dist.boundaries_params == boundaries_params assert dist.boundaries_coeffs == boundaries_coeffs - assert dist.func_first_guess == None + assert dist.func_first_guess is None assert dist.n_coeffs == 2 assert dist.scores_fit == scores_fit assert dist.xtol_req == 0.1 @@ -116,29 +117,33 @@ def test_distrib_cov_init(): assert dist.method_fit == "Nelder-Mead" assert dist.name_ftol == "fatol" assert dist.name_xtol == "xatol" - assert dist.error_failedfit == True - assert dist.fg_with_global_opti == True - assert dist.weighted_NLL == True + assert dist.error_failedfit # is True + assert dist.fg_with_global_opti # is True + assert dist.weighted_NLL # is True assert dist.type_fun_optim == "fcNLL" assert dist.threshold_stopping_rule == 0.1 - assert dist.exclude_trigger == True + assert dist.exclude_trigger # is True assert dist.ind_year_thres == 10 + def test_distrib_cov_init_errors(): expression = Expression("norm(loc=c1 * __tas__, scale=c2)", expr_name="exp1") - with pytest.raises(ValueError, match = "NaN or infinite values in target of fit"): + with pytest.raises(ValueError, match="NaN or infinite values in target of fit"): distrib_cov(np.array([1, 2, np.nan]), {"tas": np.array([1, 2, 3])}, expression) - with pytest.raises(ValueError, match = "NaN or infinite values in target of fit"): + with pytest.raises(ValueError, match="NaN or infinite values in target of fit"): distrib_cov(np.array([1, 2, np.inf]), {"tas": np.array([1, 2, 3])}, expression) - - with pytest.raises(ValueError, match = "NaN or infinite values in predictors of fit"): + + with pytest.raises(ValueError, match="NaN or infinite values in predictors of fit"): distrib_cov(np.array([1, 2, 3]), {"tas": np.array([1, 2, np.nan])}, expression) - with pytest.raises(ValueError, match = "NaN or infinite values in predictors of fit"): + with pytest.raises(ValueError, match="NaN or infinite values in predictors of fit"): distrib_cov(np.array([1, 2, 3]), {"tas": np.array([1, 2, np.inf])}, expression) - with pytest.raises(ValueError, match = "NaN or infinite values in predictors of fit"): - distrib_cov(np.array([1, 2, 3]), {"tas": np.array([1, 2, np.inf]), - "tas2": np.array([1,2, np.nan])}, expression) \ No newline at end of file + with pytest.raises(ValueError, match="NaN or infinite values in predictors of fit"): + distrib_cov( + np.array([1, 2, 3]), + {"tas": np.array([1, 2, np.inf]), "tas2": np.array([1, 2, np.nan])}, + expression, + ) From fb8d7c9184795cd39645efadef211c24700f87ce Mon Sep 17 00:00:00 2001 From: veni-vidi-vici-dormivi Date: Thu, 3 Oct 2024 17:26:52 +0200 Subject: [PATCH 3/3] finish tests on init --- mesmer/mesmer_x/train_l_distrib_mesmerx.py | 18 ++--- tests/unit/test_mesmer_x_distrib_cov.py | 85 ++++++++++++++++++++++ 2 files changed, 93 insertions(+), 10 deletions(-) diff --git a/mesmer/mesmer_x/train_l_distrib_mesmerx.py b/mesmer/mesmer_x/train_l_distrib_mesmerx.py index 392c8f72..fdfa7dc9 100644 --- a/mesmer/mesmer_x/train_l_distrib_mesmerx.py +++ b/mesmer/mesmer_x/train_l_distrib_mesmerx.py @@ -544,21 +544,19 @@ def __init__( elif isinstance(options_solver, dict): default_options_solver.update(options_solver) else: - raise ValueError("options_solver must be a dictionary") + raise ValueError("`options_solver` must be a dictionary") self.xtol_req = default_options_solver["xtol_req"] self.ftol_req = default_options_solver["ftol_req"] self.maxiter = default_options_solver["maxiter"] self.maxfev = default_options_solver["maxfev"] self.method_fit = default_options_solver["method_fit"] - if self.method_fit in [ - "dogleg", - "trust-ncg", - "trust-krylov", - "trust-exact", - "COBYLA", - "SLSQP", - "CG", - "Newton-CG", + if self.method_fit not in [ + "BFGS", + "L-BFGS-B", + "Nelder-Mead", + "Powell", + "TNC", + "trust-constr", ]: raise ValueError("method for this fit not prepared, to avoid") else: diff --git a/tests/unit/test_mesmer_x_distrib_cov.py b/tests/unit/test_mesmer_x_distrib_cov.py index 0c48bced..fb14d513 100644 --- a/tests/unit/test_mesmer_x_distrib_cov.py +++ b/tests/unit/test_mesmer_x_distrib_cov.py @@ -147,3 +147,88 @@ def test_distrib_cov_init_errors(): {"tas": np.array([1, 2, np.inf]), "tas2": np.array([1, 2, np.nan])}, expression, ) + + with pytest.raises(ValueError, match="Only one of "): + distrib_cov( + np.array([1, 2, 3]), + {"tas": np.array([1, 2, 3])}, + expression, + data_targ_addtest=np.array([1, 2, 3]), + ) + + with pytest.raises(ValueError, match="Only one of "): + distrib_cov( + np.array([1, 2, 3]), + {"tas": np.array([1, 2, 3])}, + expression, + data_preds_addtest={"tas": np.array([1, 2, 3])}, + ) + + with pytest.raises(ValueError, match="`threshold_min_proba` must be in"): + distrib_cov( + np.array([1, 2, 3]), + {"tas": np.array([1, 2, 3])}, + expression, + threshold_min_proba=-1, + ) + with pytest.raises(ValueError, match="`threshold_min_proba` must be in"): + distrib_cov( + np.array([1, 2, 3]), + {"tas": np.array([1, 2, 3])}, + expression, + threshold_min_proba=2, + ) + + with pytest.raises( + ValueError, match="The provided first guess does not have the correct shape:" + ): + distrib_cov( + np.array([1, 2, 3]), + {"tas": np.array([1, 2, 3])}, + expression, + first_guess=np.array([1, 2, 3]), + ) + + with pytest.raises(ValueError, match="`options_solver` must be a dictionary"): + distrib_cov( + np.array([1, 2, 3]), + {"tas": np.array([1, 2, 3])}, + expression, + options_solver="this is not a dictionary", + ) + + with pytest.raises(ValueError, match="`options_optim` must be a dictionary"): + distrib_cov( + np.array([1, 2, 3]), + {"tas": np.array([1, 2, 3])}, + expression, + options_optim="this is not a dictionary", + ) + + with pytest.raises(ValueError, match="method for this fit not prepared, to avoid"): + distrib_cov( + np.array([1, 2, 3]), + {"tas": np.array([1, 2, 3])}, + expression, + options_solver={"method_fit": "this is not a method"}, + ) + + with pytest.raises( + ValueError, match="Lack of consistency on the options 'type_fun_optim'" + ): + distrib_cov( + np.array([1, 2, 3]), + {"tas": np.array([1, 2, 3])}, + expression, + options_optim={"type_fun_optim": "NLL", "threshold_stopping_rule": 0.1}, + ) + + with pytest.raises( + ValueError, match="Lack of consistency on the options 'type_fun_optim'" + ): + distrib_cov( + np.array([1, 2, 3]), + {"tas": np.array([1, 2, 3])}, + expression, + options_optim={"type_fun_optim": "fcNLL", "threshold_stopping_rule": None}, + )