MESMER-group · veni-vidi-vici-dormivi · Oct 3, 2024 · Oct 3, 2024 · Oct 3, 2024 · mathause
diff --git a/mesmer/mesmer_x/train_l_distrib_mesmerx.py b/mesmer/mesmer_x/train_l_distrib_mesmerx.py
@@ -38,6 +38,7 @@ def wrapper(*args, **kwargs):
     return wrapper
 
 
+# TODO: would want to switch this, have a distrib class that takes xarrays and have a training func that potentially works on xarrays
-# TODO: would want to switch this, have a distrib class that takes xarrays and have a training func that potentially works on xarrays
+# TODO: enable distrib class and training func for xarray objs
-# TODO: would want to switch this, have a distrib class that takes xarrays and have a training func that potentially works on xarrays
+# TODO: enable distrib class and training func for xarray objs
 def xr_train_distrib(
     predictors,
     target,
@@ -293,11 +294,13 @@ def __init__(
         ----------
         data_targ : numpy array 1D
             Sample of the target for fit of a conditional distribution
+            Normally the timeseries of the target at one gridpoint.
 
         data_pred : dict of 1D vectors
             Covariates for the conditional distribution. Each key must be the exact name
             of the inputs used in 'expr_fit', and the values must be aligned with the
             values in 'data_targ'.
+            Normally the timeseries of the global mean predictor.
 
         expr_fit : class 'expression'
             Expression to train. The string provided to the class can be found in
@@ -353,7 +356,8 @@ def __init__(
             * type_fun_optim: string, default: "NLL"
                 If 'NLL', will optimize using the negative log likelihood. If 'fcNLL',
                 will use the full conditional negative log likelihood based on the
-                stopping rule.
+                stopping rule. The arguments `threshold_stopping_rule`, `ind_year_thres`
+                and `exclude_trigger` only apply to 'fcNLL'.
 
             * weighted_NLL: boolean, default: False
                 If True, the optimization function will based on the weighted sum of the
@@ -540,21 +544,19 @@ def __init__(
         elif isinstance(options_solver, dict):
             default_options_solver.update(options_solver)
         else:
-            raise ValueError("options_solver must be a dictionary")
+            raise ValueError("`options_solver` must be a dictionary")
         self.xtol_req = default_options_solver["xtol_req"]
         self.ftol_req = default_options_solver["ftol_req"]
         self.maxiter = default_options_solver["maxiter"]
         self.maxfev = default_options_solver["maxfev"]
         self.method_fit = default_options_solver["method_fit"]
-        if self.method_fit in [
-            "dogleg",
-            "trust-ncg",
-            "trust-krylov",
-            "trust-exact",
-            "COBYLA",
-            "SLSQP",
-            "CG",
-            "Newton-CG",
+        if self.method_fit not in [
+            "BFGS",
+            "L-BFGS-B",
+            "Nelder-Mead",
+            "Powell",
+            "TNC",
+            "trust-constr",
         ]:
             raise ValueError("method for this fit not prepared, to avoid")
         else:
@@ -614,8 +616,8 @@ def __init__(
         ):
             raise ValueError(
                 "Lack of consistency on the options 'type_fun_optim',"
-                " 'threshold_stopping_rule' and 'ind_year_thres', not sure if the"
-                " stopping rule will be employed"
+                " 'threshold_stopping_rule' and 'ind_year_thres', threshold_stopping_rule",
+                "and 'ind_year_thres' must be used together, and only for 'fcNLL'",
-                " 'threshold_stopping_rule' and 'ind_year_thres', threshold_stopping_rule",
-                "and 'ind_year_thres' must be used together, and only for 'fcNLL'",
+                " 'threshold_stopping_rule' and 'ind_year_thres', threshold_stopping_rule,"
+                " and 'ind_year_thres' must be used together, and only for 'fcNLL'",
-                " 'threshold_stopping_rule' and 'ind_year_thres', threshold_stopping_rule",
-                "and 'ind_year_thres' must be used together, and only for 'fcNLL'",
+                " 'threshold_stopping_rule' and 'ind_year_thres', threshold_stopping_rule,"
+                " and 'ind_year_thres' must be used together, and only for 'fcNLL'",
             )
 
     def get_weights(self, n_bins_density=40):
@@ -649,7 +651,7 @@ def _get_weights_nll(self, n_bins_density=40):
         # interpolating over whole region
         gmt_hist, edges = np.histogramdd(sample=tmp, bins=bins.T)
 
-        gmt_bins_center = [0.5 * (edge[1:] + edges[:-1]) for edge in edges]
+        gmt_bins_center = [0.5 * (edge[1:] + edge[:-1]) for edge in edges]
         interp = RegularGridInterpolator(points=gmt_bins_center, values=gmt_hist)
         weights_driver = 1 / interp(tmp)  # inverse of density
 

diff --git a/tests/unit/test_mesmer_x_distrib_cov.py b/tests/unit/test_mesmer_x_distrib_cov.py
@@ -0,0 +1,234 @@
+import numpy as np
+import pytest
+
+from mesmer.mesmer_x import Expression, distrib_cov
+
+
+def test_distrib_cov_init_all_default():
+    # Construct `distrib_cov` with only the three positional arguments and check
+    # that every attribute inspected below holds the expected default value.
+
+    # synthetic sample: linearly increasing predictor and a normally distributed
+    # target whose location follows 2 * pred (seeded for reproducibility)
+    rng = np.random.default_rng(0)
+    n = 250
+    pred = np.linspace(0, 1, n)
+    targ = rng.normal(loc=2 * pred, scale=0.1, size=n)
+
+    expression = Expression("norm(loc=c1 * __tas__, scale=c2)", expr_name="exp1")
+
+    dist = distrib_cov(targ, {"tas": pred}, expression)
+
+    # inputs are stored as passed; default weights are uniform and sum to one
+    np.testing.assert_equal(dist.data_targ, targ)
+    np.testing.assert_equal(dist.data_pred, {"tas": pred})
+    np.testing.assert_equal(dist.weights_driver, np.ones(n) / n)
+    assert dist.n_sample == n
+    assert dist.expr_fit == expression
-    assert dist.expr_fit == expression
+    assert dist.expr_fit is expression
-    assert dist.expr_fit == expression
+    assert dist.expr_fit is expression
+    # no additional test data was given
+    assert not dist.add_test
+    assert dist.data_targ_addtest is None
+    assert dist.data_preds_addtest is None
+    assert dist.threshold_min_proba == 1e-09
+    # parameter boundaries fall back to the ones carried by the expression
+    assert dist.boundaries_params == expression.boundaries_parameters
+    assert dist.boundaries_coeffs == {}
+    assert dist.first_guess is None
+    assert dist.func_first_guess is None
+    # the expression above has the two coefficients c1 and c2
+    assert dist.n_coeffs == 2
+    assert dist.scores_fit == ["func_optim", "NLL", "BIC"]
+    # solver defaults
+    assert dist.xtol_req == 1e-06
+    assert dist.ftol_req == 1e-06
+    assert dist.maxiter == 1000 * dist.n_coeffs * np.log(dist.n_coeffs)
+    assert dist.maxfev == 1000 * dist.n_coeffs * np.log(dist.n_coeffs)
+    assert dist.method_fit == "Powell"
+    assert dist.name_ftol == "ftol"
+    assert dist.name_xtol == "xtol"
+    assert not dist.error_failedfit
+    assert not dist.fg_with_global_opti
+    # optimisation defaults: plain NLL, no weighting, no stopping rule
+    assert not dist.weighted_NLL
+    assert dist.type_fun_optim == "NLL"
+    assert dist.threshold_stopping_rule is None
+    assert dist.exclude_trigger is None
+    assert dist.ind_year_thres is None
+
+
+def test_distrib_cov_init():
+    # Construct `distrib_cov` with every optional argument given explicitly and
+    # check that each value ends up on the corresponding attribute.
+
+    # synthetic sample: linearly increasing predictor and a normally distributed
+    # target whose location follows 2 * pred (seeded for reproducibility)
+    rng = np.random.default_rng(0)
+    n = 250
+    pred = np.linspace(0, 1, n)
+    targ = rng.normal(loc=2 * pred, scale=0.1, size=n)
+
+    expression = Expression("norm(loc=c1 * __tas__, scale=c2)", expr_name="exp1")
+
+    # non-default values for all optional arguments
+    data_targ_addtest = rng.normal(loc=2 * pred, scale=0.1, size=n)
+    data_preds_addtest = {"tas": np.linspace(0, 0.9, n)}
+    threshold_min_proba = 0.1
+    boundaries_params = {"loc": [-10, 10], "scale": [0, 1]}
-    boundaries_params = {"loc": [-10, 10], "scale": [0, 1]}
+    boundaries_params = {"loc": [-10, 10], "scale": [-1, 1]}
-    boundaries_params = {"loc": [-10, 10], "scale": [0, 1]}
+    boundaries_params = {"loc": [-10, 10], "scale": [-1, 1]}
+    boundaries_coeffs = {"c1": [0, 5], "c2": [0, 1]}
+    first_guess = np.array([1, 0.1])
+    func_first_guess = None
+    scores_fit = ["func_optim", "NLL"]
+    # 'fcNLL' together with both stopping-rule options
+    options_optim = {
+        "type_fun_optim": "fcNLL",
+        "weighted_NLL": True,
+        "threshold_stopping_rule": 0.1,
+        "ind_year_thres": 10,
+        "exclude_trigger": True,
+    }
+    options_solver = {
+        "method_fit": "Nelder-Mead",
+        "xtol_req": 0.1,
+        "ftol_req": 0.01,
+        "maxiter": 10_000,
+        "maxfev": 12_000,
+        "error_failedfit": True,
+        "fg_with_global_opti": True,
+    }
+
+    dist = distrib_cov(
+        targ,
+        {"tas": pred},
+        expression,
+        data_targ_addtest=data_targ_addtest,
+        data_preds_addtest=data_preds_addtest,
+        threshold_min_proba=threshold_min_proba,
+        boundaries_params=boundaries_params,
+        boundaries_coeffs=boundaries_coeffs,
+        first_guess=first_guess,
+        func_first_guess=func_first_guess,
+        scores_fit=scores_fit,
+        options_optim=options_optim,
+        options_solver=options_solver,
+    )
+
+    # array-valued attributes are compared with numpy's testing helper
+    np.testing.assert_equal(dist.data_targ, targ)
+    np.testing.assert_equal(dist.data_pred, {"tas": pred})
+    # with weighted_NLL=True the stored weights must match `get_weights()`
+    np.testing.assert_equal(dist.weights_driver, dist.get_weights())
+    # NOTE(review): the direct comparison against `_get_weights_nll()` below is
+    # disabled and the reason was not recorded - TODO investigate.
+    # np.testing.assert_equal(dist.weights_driver, dist._get_weights_nll())
+    np.testing.assert_equal(dist.first_guess, first_guess)
+    np.testing.assert_equal(dist.data_targ_addtest, data_targ_addtest)
+    np.testing.assert_equal(dist.data_preds_addtest, data_preds_addtest)
+    assert dist.n_sample == n
+    assert dist.expr_fit == expression
-    assert dist.expr_fit == expression
+    assert dist.expr_fit is expression
-    assert dist.expr_fit == expression
+    assert dist.expr_fit is expression
+    assert dist.add_test  # is True
+    assert dist.threshold_min_proba == threshold_min_proba
+    assert dist.boundaries_params == boundaries_params
+    assert dist.boundaries_coeffs == boundaries_coeffs
+    assert dist.func_first_guess is None
+    assert dist.n_coeffs == 2
+    assert dist.scores_fit == scores_fit
+    # solver options are taken over from `options_solver`
+    assert dist.xtol_req == 0.1
+    assert dist.ftol_req == 0.01
+    assert dist.maxiter == 10_000
+    assert dist.maxfev == 12_000
+    assert dist.method_fit == "Nelder-Mead"
+    # for "Nelder-Mead" the tolerance option names are expected to be the
+    # absolute variants
+    assert dist.name_ftol == "fatol"
+    assert dist.name_xtol == "xatol"
+    assert dist.error_failedfit  # is True
+    assert dist.fg_with_global_opti  # is True
+    # optimisation options are taken over from `options_optim`
+    assert dist.weighted_NLL  # is True
+    assert dist.type_fun_optim == "fcNLL"
+    assert dist.threshold_stopping_rule == 0.1
+    assert dist.exclude_trigger  # is True
+    assert dist.ind_year_thres == 10
+
+
+def test_distrib_cov_init_errors():
+    # Check that invalid constructor input raises a ValueError whose message
+    # matches the expected pattern, one `pytest.raises` block per case.
+    expression = Expression("norm(loc=c1 * __tas__, scale=c2)", expr_name="exp1")
+
+    # NaN in the target
+    with pytest.raises(ValueError, match="NaN or infinite values in target of fit"):
+        distrib_cov(np.array([1, 2, np.nan]), {"tas": np.array([1, 2, 3])}, expression)
+
+    # inf in the target
+    with pytest.raises(ValueError, match="NaN or infinite values in target of fit"):
+        distrib_cov(np.array([1, 2, np.inf]), {"tas": np.array([1, 2, 3])}, expression)
+
+    # NaN in the predictor
+    with pytest.raises(ValueError, match="NaN or infinite values in predictors of fit"):
+        distrib_cov(np.array([1, 2, 3]), {"tas": np.array([1, 2, np.nan])}, expression)
+
+    # inf in the predictor
+    with pytest.raises(ValueError, match="NaN or infinite values in predictors of fit"):
+        distrib_cov(np.array([1, 2, 3]), {"tas": np.array([1, 2, np.inf])}, expression)
+
+    # two predictors, one containing inf and the other NaN
+    with pytest.raises(ValueError, match="NaN or infinite values in predictors of fit"):
+        distrib_cov(
+            np.array([1, 2, 3]),
+            {"tas": np.array([1, 2, np.inf]), "tas2": np.array([1, 2, np.nan])},
+            expression,
+        )
+
+    # additional test target given without additional test predictors
+    with pytest.raises(ValueError, match="Only one of "):
+        distrib_cov(
+            np.array([1, 2, 3]),
+            {"tas": np.array([1, 2, 3])},
+            expression,
+            data_targ_addtest=np.array([1, 2, 3]),
+        )
+
+    # additional test predictors given without additional test target
+    with pytest.raises(ValueError, match="Only one of "):
+        distrib_cov(
+            np.array([1, 2, 3]),
+            {"tas": np.array([1, 2, 3])},
+            expression,
+            data_preds_addtest={"tas": np.array([1, 2, 3])},
+        )
+
+    # negative `threshold_min_proba`
+    with pytest.raises(ValueError, match="`threshold_min_proba` must be in"):
+        distrib_cov(
+            np.array([1, 2, 3]),
+            {"tas": np.array([1, 2, 3])},
+            expression,
+            threshold_min_proba=-1,
+        )
+    # `threshold_min_proba` larger than one
+    with pytest.raises(ValueError, match="`threshold_min_proba` must be in"):
+        distrib_cov(
+            np.array([1, 2, 3]),
+            {"tas": np.array([1, 2, 3])},
+            expression,
+            threshold_min_proba=2,
+        )
+
+    # first guess with three entries for an expression with coefficients c1, c2
+    with pytest.raises(
+        ValueError, match="The provided first guess does not have the correct shape:"
+    ):
+        distrib_cov(
+            np.array([1, 2, 3]),
+            {"tas": np.array([1, 2, 3])},
+            expression,
+            first_guess=np.array([1, 2, 3]),
+        )
+
+    # `options_solver` given as a string instead of a dict
+    with pytest.raises(ValueError, match="`options_solver` must be a dictionary"):
+        distrib_cov(
+            np.array([1, 2, 3]),
+            {"tas": np.array([1, 2, 3])},
+            expression,
+            options_solver="this is not a dictionary",
+        )
+
+    # `options_optim` given as a string instead of a dict
+    with pytest.raises(ValueError, match="`options_optim` must be a dictionary"):
+        distrib_cov(
+            np.array([1, 2, 3]),
+            {"tas": np.array([1, 2, 3])},
+            expression,
+            options_optim="this is not a dictionary",
+        )
+
+    # unknown solver method name
+    with pytest.raises(ValueError, match="method for this fit not prepared, to avoid"):
+        distrib_cov(
+            np.array([1, 2, 3]),
+            {"tas": np.array([1, 2, 3])},
+            expression,
+            options_solver={"method_fit": "this is not a method"},
+        )
+
+    # stopping-rule threshold combined with plain 'NLL'
+    with pytest.raises(
+        ValueError, match="Lack of consistency on the options 'type_fun_optim'"
+    ):
+        distrib_cov(
+            np.array([1, 2, 3]),
+            {"tas": np.array([1, 2, 3])},
+            expression,
+            options_optim={"type_fun_optim": "NLL", "threshold_stopping_rule": 0.1},
+        )
+
+    # 'fcNLL' requested without a stopping-rule threshold
+    with pytest.raises(
+        ValueError, match="Lack of consistency on the options 'type_fun_optim'"
+    ):
+        distrib_cov(
+            np.array([1, 2, 3]),
+            {"tas": np.array([1, 2, 3])},
+            expression,
+            options_optim={"type_fun_optim": "fcNLL", "threshold_stopping_rule": None},
+        )