diff --git a/mesmer/mesmer_x/train_l_distrib_mesmerx.py b/mesmer/mesmer_x/train_l_distrib_mesmerx.py index 2d6fe767..fdfa7dc9 100644 --- a/mesmer/mesmer_x/train_l_distrib_mesmerx.py +++ b/mesmer/mesmer_x/train_l_distrib_mesmerx.py @@ -38,6 +38,7 @@ def wrapper(*args, **kwargs): return wrapper +# TODO: would want to switch this, have a distrib class that takes xarrays and have a training func that potentially works on xarrays def xr_train_distrib( predictors, target, @@ -293,11 +294,13 @@ def __init__( ---------- data_targ : numpy array 1D Sample of the target for fit of a conditional distribution + Normally the timeseries of the target at one gridpoint. data_pred : dict of 1D vectors Covariates for the conditional distribution. Each key must be the exact name of the inputs used in 'expr_fit', and the values must be aligned with the values in 'data_targ'. + Normally the timeseries of the global mean predictor. expr_fit : class 'expression' Expression to train. The string provided to the class can be found in @@ -353,7 +356,8 @@ def __init__( * type_fun_optim: string, default: "NLL" If 'NLL', will optimize using the negative log likelihood. If 'fcNLL', will use the full conditional negative log likelihood based on the - stopping rule. + stopping rule. The arguments `threshold_stopping_rule`, `ind_year_thres` + and `exclude_trigger` only apply to 'fcNLL'. 
* weighted_NLL: boolean, default: False If True, the optimization function will based on the weighted sum of the @@ -540,21 +544,19 @@ def __init__( elif isinstance(options_solver, dict): default_options_solver.update(options_solver) else: - raise ValueError("options_solver must be a dictionary") + raise ValueError("`options_solver` must be a dictionary") self.xtol_req = default_options_solver["xtol_req"] self.ftol_req = default_options_solver["ftol_req"] self.maxiter = default_options_solver["maxiter"] self.maxfev = default_options_solver["maxfev"] self.method_fit = default_options_solver["method_fit"] - if self.method_fit in [ - "dogleg", - "trust-ncg", - "trust-krylov", - "trust-exact", - "COBYLA", - "SLSQP", - "CG", - "Newton-CG", + if self.method_fit not in [ + "BFGS", + "L-BFGS-B", + "Nelder-Mead", + "Powell", + "TNC", + "trust-constr", ]: raise ValueError("method for this fit not prepared, to avoid") else: @@ -614,8 +616,8 @@ def __init__( ): raise ValueError( "Lack of consistency on the options 'type_fun_optim'," - " 'threshold_stopping_rule' and 'ind_year_thres', not sure if the" - " stopping rule will be employed" + " 'threshold_stopping_rule' and 'ind_year_thres': 'threshold_stopping_rule'" + " and 'ind_year_thres' must be used together, and only for 'fcNLL'" ) def get_weights(self, n_bins_density=40): @@ -649,7 +651,7 @@ def _get_weights_nll(self, n_bins_density=40): # interpolating over whole region gmt_hist, edges = np.histogramdd(sample=tmp, bins=bins.T) - gmt_bins_center = [0.5 * (edge[1:] + edges[:-1]) for edge in edges] + gmt_bins_center = [0.5 * (edge[1:] + edge[:-1]) for edge in edges] interp = RegularGridInterpolator(points=gmt_bins_center, values=gmt_hist) weights_driver = 1 / interp(tmp) # inverse of density diff --git a/tests/unit/test_mesmer_x_distrib_cov.py b/tests/unit/test_mesmer_x_distrib_cov.py new file mode 100644 index 00000000..fb14d513 --- /dev/null +++ b/tests/unit/test_mesmer_x_distrib_cov.py @@ -0,0 +1,234 @@ +import numpy as np
import pytest

from mesmer.mesmer_x import Expression, distrib_cov

# Expression string shared by every test in this module.
_EXPR_STRING = "norm(loc=c1 * __tas__, scale=c2)"


def _make_expression():
    """Build the normal-distribution expression used by all tests below."""
    return Expression(_EXPR_STRING, expr_name="exp1")


def _synthetic_sample(rng, size):
    """Return ``(pred, targ)`` for a synthetic fit.

    ``pred`` is evenly spaced on [0, 1]; ``targ`` is drawn from ``rng`` as a
    normal sample centred on ``2 * pred`` with standard deviation 0.1.
    """
    pred = np.linspace(0, 1, size)
    targ = rng.normal(loc=2 * pred, scale=0.1, size=size)
    return pred, targ


def test_distrib_cov_init_all_default():
    """Check the attributes set when only the required arguments are given."""
    n_obs = 250
    pred, targ = _synthetic_sample(np.random.default_rng(0), n_obs)
    expression = _make_expression()

    dist = distrib_cov(targ, {"tas": pred}, expression)

    # Array-valued attributes need numpy-aware comparison.
    np.testing.assert_equal(dist.data_targ, targ)
    np.testing.assert_equal(dist.data_pred, {"tas": pred})
    np.testing.assert_equal(dist.weights_driver, np.ones(n_obs) / n_obs)

    # Expected default for both `maxiter` and `maxfev`.
    default_budget = 1000 * dist.n_coeffs * np.log(dist.n_coeffs)

    expected_values = {
        "n_sample": n_obs,
        "expr_fit": expression,
        "threshold_min_proba": 1e-09,
        "boundaries_params": expression.boundaries_parameters,
        "boundaries_coeffs": {},
        "n_coeffs": 2,
        "scores_fit": ["func_optim", "NLL", "BIC"],
        "xtol_req": 1e-06,
        "ftol_req": 1e-06,
        "maxiter": default_budget,
        "maxfev": default_budget,
        "method_fit": "Powell",
        "name_ftol": "ftol",
        "name_xtol": "xtol",
        "type_fun_optim": "NLL",
    }
    for attr, expected in expected_values.items():
        assert getattr(dist, attr) == expected, attr

    expected_none = (
        "data_targ_addtest",
        "data_preds_addtest",
        "first_guess",
        "func_first_guess",
        "threshold_stopping_rule",
        "exclude_trigger",
        "ind_year_thres",
    )
    for attr in expected_none:
        assert getattr(dist, attr) is None, attr

    expected_falsy = (
        "add_test",
        "error_failedfit",
        "fg_with_global_opti",
        "weighted_NLL",
    )
    for attr in expected_falsy:
        assert not getattr(dist, attr), attr


def test_distrib_cov_init():
    """Check that every optional argument is stored on the instance."""
    rng = np.random.default_rng(0)
    n_obs = 250
    pred, targ = _synthetic_sample(rng, n_obs)
    expression = _make_expression()

    # `data_targ_addtest` is drawn after `targ` from the same generator, so
    # the order of these two draws must not be swapped.
    init_kwargs = {
        "data_targ_addtest": rng.normal(loc=2 * pred, scale=0.1, size=n_obs),
        "data_preds_addtest": {"tas": np.linspace(0, 0.9, n_obs)},
        "threshold_min_proba": 0.1,
        "boundaries_params": {"loc": [-10, 10], "scale": [0, 1]},
        "boundaries_coeffs": {"c1": [0, 5], "c2": [0, 1]},
        "first_guess": np.array([1, 0.1]),
        "func_first_guess": None,
        "scores_fit": ["func_optim", "NLL"],
        "options_optim": {
            "type_fun_optim": "fcNLL",
            "weighted_NLL": True,
            "threshold_stopping_rule": 0.1,
            "ind_year_thres": 10,
            "exclude_trigger": True,
        },
        "options_solver": {
            "method_fit": "Nelder-Mead",
            "xtol_req": 0.1,
            "ftol_req": 0.01,
            "maxiter": 10_000,
            "maxfev": 12_000,
            "error_failedfit": True,
            "fg_with_global_opti": True,
        },
    }

    dist = distrib_cov(targ, {"tas": pred}, expression, **init_kwargs)

    # Array-valued attributes need numpy-aware comparison.
    np.testing.assert_equal(dist.data_targ, targ)
    np.testing.assert_equal(dist.data_pred, {"tas": pred})
    np.testing.assert_equal(dist.weights_driver, dist.get_weights())
    # NOTE(review): a direct comparison against `dist._get_weights_nll()` was
    # left disabled by the author without a documented reason - TODO confirm
    # why the two differ before enabling such a check.
    array_valued = ("first_guess", "data_targ_addtest", "data_preds_addtest")
    for attr in array_valued:
        np.testing.assert_equal(getattr(dist, attr), init_kwargs[attr])

    expected_values = {
        "n_sample": n_obs,
        "expr_fit": expression,
        "threshold_min_proba": init_kwargs["threshold_min_proba"],
        "boundaries_params": init_kwargs["boundaries_params"],
        "boundaries_coeffs": init_kwargs["boundaries_coeffs"],
        "n_coeffs": 2,
        "scores_fit": init_kwargs["scores_fit"],
        "xtol_req": 0.1,
        "ftol_req": 0.01,
        "maxiter": 10_000,
        "maxfev": 12_000,
        "method_fit": "Nelder-Mead",
        "name_ftol": "fatol",
        "name_xtol": "xatol",
        "type_fun_optim": "fcNLL",
        "threshold_stopping_rule": 0.1,
        "ind_year_thres": 10,
    }
    for attr, expected in expected_values.items():
        assert getattr(dist, attr) == expected, attr

    assert dist.func_first_guess is None

    expected_truthy = (
        "add_test",
        "error_failedfit",
        "fg_with_global_opti",
        "weighted_NLL",
        "exclude_trigger",
    )
    for attr in expected_truthy:
        assert getattr(dist, attr), attr


def test_distrib_cov_init_errors():
    """Check that invalid constructor input raises ``ValueError``."""
    expression = _make_expression()

    def valid():
        # Fresh array per call so no case shares data with another.
        return np.array([1, 2, 3])

    def expect_failure(message, targ=None, preds=None, **kwargs):
        # Omitted target / predictors fall back to valid three-element data.
        targ = valid() if targ is None else targ
        preds = {"tas": valid()} if preds is None else preds
        with pytest.raises(ValueError, match=message):
            distrib_cov(targ, preds, expression, **kwargs)

    # Non-finite values in the target.
    bad_target = "NaN or infinite values in target of fit"
    expect_failure(bad_target, targ=np.array([1, 2, np.nan]))
    expect_failure(bad_target, targ=np.array([1, 2, np.inf]))

    # Non-finite values in one or several predictors.
    bad_predictor = "NaN or infinite values in predictors of fit"
    expect_failure(bad_predictor, preds={"tas": np.array([1, 2, np.nan])})
    expect_failure(bad_predictor, preds={"tas": np.array([1, 2, np.inf])})
    expect_failure(
        bad_predictor,
        preds={"tas": np.array([1, 2, np.inf]), "tas2": np.array([1, 2, np.nan])},
    )

    # The additional-test target and predictors must be given together.
    expect_failure("Only one of ", data_targ_addtest=valid())
    expect_failure("Only one of ", data_preds_addtest={"tas": valid()})

    # Out-of-range probability threshold.
    bad_threshold = "`threshold_min_proba` must be in"
    expect_failure(bad_threshold, threshold_min_proba=-1)
    expect_failure(bad_threshold, threshold_min_proba=2)

    # First guess whose length does not match the number of coefficients.
    expect_failure(
        "The provided first guess does not have the correct shape:",
        first_guess=valid(),
    )

    # Option containers of the wrong type.
    expect_failure(
        "`options_solver` must be a dictionary",
        options_solver="this is not a dictionary",
    )
    expect_failure(
        "`options_optim` must be a dictionary",
        options_optim="this is not a dictionary",
    )

    # Unknown solver method.
    expect_failure(
        "method for this fit not prepared, to avoid",
        options_solver={"method_fit": "this is not a method"},
    )

    # Inconsistent combinations of the stopping-rule options.
    inconsistent = "Lack of consistency on the options 'type_fun_optim'"
    expect_failure(
        inconsistent,
        options_optim={"type_fun_optim": "NLL", "threshold_stopping_rule": 0.1},
    )
    expect_failure(
        inconsistent,
        options_optim={"type_fun_optim": "fcNLL", "threshold_stopping_rule": None},
    )