MESMER-group · veni-vidi-vici-dormivi · Aug 19, 2024 · Aug 19, 2024 · Aug 20, 2024 · Aug 20, 2024
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -86,6 +86,7 @@ that this led to some numerical changes compared to the MESMER-M publication
 - move the harmonic model and power transformer functionalities to the stats module (
   `#484 <https://github.com/MESMER-group/mesmer/pull/484>`_).
 - add example script for MESMER-M workflow (`#491 <https://github.com/MESMER-group/mesmer/pull/491>`_)
+- add integration tests for MESMER-M (`#501 <https://github.com/MESMER-group/mesmer/pull/501>`_)
 
 Auto-Regression
 ~~~~~~~~~~~~~~~

diff --git a/examples/example_mesmer_m.ipynb b/examples/example_mesmer_m.ipynb
@@ -271,7 +271,7 @@
    "metadata": {},
    "source": [
     "### time coordinate\n",
-    "We need to get the original time coordinate to be able to validate our results later on. If it is not needed to align the final emulations with the original data, this can be omitted, the time coordinates are then given by to forcing data set or can later be generated for example with \n",
+    "We need to get the original time coordinate to be able to validate our results later on. If it is not needed to align the final emulations with the original data, this can be omitted, the time coordinates can later be generated for example with \n",
     "\n",
     "\n",
     "```python\n",

diff --git a/mesmer/stats/_auto_regression.py b/mesmer/stats/_auto_regression.py
@@ -675,7 +675,7 @@ def fit_auto_regression_monthly(monthly_data, time_dim="time"):
             vectorize=True,
         )
 
-        ar_params.append(xr.Dataset({"slope": slope, "intercept": intercept}))
+        ar_params.append(xr.Dataset({"ar1_slope": slope, "ar1_intercept": intercept}))
         residuals.append(resids.assign_coords({time_dim: cur_month[time_dim]}))
 
     month = xr.Variable("month", np.arange(1, 13))
@@ -742,8 +742,8 @@ def draw_auto_regression_monthly(
         Dataset containing the estimated parameters of the AR1 process. Must contain the
         following DataArray objects:
 
-        - intercept
-        - slope
+        - ar1_intercept
+        - ar1_slope
 
         both of shape (12, n_gridpoints).
     covariance : xr.DataArray of shape (12, n_gridpoints, n_gridpoints)
@@ -774,25 +774,30 @@ def draw_auto_regression_monthly(
 
     """
     # check input
-    _check_dataset_form(ar_params, "ar_params", required_vars=("intercept", "slope"))
-    month_dim, gridcell_dim = ar_params.intercept.dims
-    n_months, size = ar_params.intercept.shape
+    _check_dataset_form(
+        ar_params, "ar_params", required_vars=("ar1_intercept", "ar1_slope")
+    )
+    month_dim, gridcell_dim = ar_params.ar1_intercept.dims
+    n_months, size = ar_params.ar1_intercept.shape
     _check_dataarray_form(
-        ar_params.intercept,
+        ar_params.ar1_intercept,
         "intercept",
         ndim=2,
         required_dims=(month_dim, gridcell_dim),
     )
     _check_dataarray_form(
-        ar_params.slope, "slope", ndim=2, required_dims=(month_dim, gridcell_dim)
+        ar_params.ar1_slope,
+        "ar1_slope",
+        ndim=2,
+        required_dims=(month_dim, gridcell_dim),
     )
     _check_dataarray_form(
         covariance, "covariance", ndim=3, shape=(n_months, size, size)
     )
 
     result = _draw_ar_corr_monthly_xr_internal(
-        intercept=ar_params.intercept,
-        slope=ar_params.slope,
+        intercept=ar_params.ar1_intercept,
+        slope=ar_params.ar1_slope,
         covariance=covariance,
         time=time,
         realisation=n_realisations,

diff --git a/mesmer/stats/_harmonic_model.py b/mesmer/stats/_harmonic_model.py
@@ -147,14 +147,16 @@ def _fit_fourier_coeffs_np(yearly_predictor, monthly_target, first_guess):
 
     """
 
-    def func(coeffs, yearly_predictor, mon_target):
+    def residuals_from_fourier_series(coeffs, yearly_predictor, mon_target):
         return _generate_fourier_series_np(yearly_predictor, coeffs) - mon_target
 
+    # use least_squares to optimize the coefficients
     minimize_result = sp.optimize.least_squares(
-        func,
+        residuals_from_fourier_series,
         first_guess,
         args=(yearly_predictor, monthly_target),
         loss="linear",
+        method="lm",
     )
 
     coeffs = minimize_result.x
@@ -296,12 +298,14 @@ def fit_harmonic_model(yearly_predictor, monthly_target, max_order=6, time_dim="
         kwargs={"max_order": max_order},
     )
 
+    coeffs = coeffs.assign_coords({"coeff": np.arange(coeffs.sizes["coeff"])})
+
     preds = yearly_predictor + preds
 
     data_vars = {
-        "selected_order": selected_order,
-        "coeffs": coeffs,
-        "predictions": preds.transpose(time_dim, ...),
+        "hm_selected_order": selected_order,
+        "hm_coeffs": coeffs,
+        "hm_predictions": preds.transpose(time_dim, ...),
     }
 
     return xr.Dataset(data_vars)
diff --git a/mesmer/stats/_power_transformer.py b/mesmer/stats/_power_transformer.py
@@ -193,9 +193,9 @@ def get_lambdas_from_covariates(coeffs, yearly_pred):
 
     Parameters
     ----------
-    lambda_coeffs : ``xr.Dataset``
+    lambda_coeffs : ``xr.DataArray``
         The parameters of the power transformation for each gridcell and month,
-        containing ``lambda_coeffs`` with dims (months, n_gridcells, coeff) calculated
+        containing ``lambda_coeffs`` with dims (months, coeff, n_gridcells) calculated
         using ``fit_yeo_johnson_transform``.
     yearly_pred : ``xr.DataArray``
         yearly values used as predictors for the lambdas. Must have shape shape
@@ -207,15 +207,15 @@ def get_lambdas_from_covariates(coeffs, yearly_pred):
         The parameters of the power transformation for each gridcell month and year
 
     """
-    if not isinstance(coeffs, xr.Dataset):
-        raise TypeError(f"Expected a `xr.Dataset`, got {type(coeffs)}")
+    if not isinstance(coeffs, xr.DataArray):
+        raise TypeError(f"Expected a `xr.DataArray`, got {type(coeffs)}")
 
     if not isinstance(yearly_pred, xr.DataArray):
         raise TypeError(f"Expected a `xr.DataArray`, got {type(yearly_pred)}")
 
     lambdas = xr.apply_ufunc(
         lambda_function,
-        coeffs.lambda_coeffs,
+        coeffs,
         yearly_pred,
         input_core_dims=[("coeff",), []],
         output_core_dims=[[]],
@@ -248,10 +248,9 @@ def fit_yeo_johnson_transform(monthly_residuals, yearly_pred, time_dim="time"):
 
     Returns
     -------
-    :obj:`xr.Dataset`
-        Dataset containing the estimated coefficients xi_0 and xi_1 needed to estimate
-        lambda with dimensions (months, n_gridcells) and the lambdas themselves with
-        dimensions (months, n_gridcells, n_years).
+    :obj:`xr.DataArray`
+        Dataset containing the estimated coefficients needed to estimate
+        lambda with dimensions (months, coeff, n_gridcells).
 
     """
     # TODO allow passing func instead of our fixed lambda_function?
@@ -279,7 +278,7 @@ def fit_yeo_johnson_transform(monthly_residuals, yearly_pred, time_dim="time"):
             output_dtypes=[float],
             vectorize=True,
         )
-
+        res = res.assign_coords({"coeff": np.arange(len(res.coeff))})
         coeffs.append(xr.Dataset({"lambda_coeffs": res}))
 
     return xr.concat(coeffs, dim="month")
@@ -295,9 +294,9 @@ def yeo_johnson_transform(monthly_residuals, coeffs, yearly_pred):
     monthly_residuals : ``xr.DataArray`` of shape (n_years*12, n_gridcells)
         Monthly residuals after removing harmonic model fits, used to fit for the
         optimal transformation parameters (lambdas).
-    coeffs : ``xr.Dataset``
+    coeffs : ``xr.DataArray``
         The parameters of the power transformation containing ``lambda_coeffs`` of shape
-        (months, n_gridcells, coeff) for each gridcell, calculated using
+        (months, coeff, n_gridcells) for each gridcell, calculated using
         :func:`lambda_function <mesmer.stats.lambda_function>`.
     yearly_pred : ``xr.DataArray`` of shape (n_years, n_gridcells)
         yearly values used as predictors for the lambdas.
@@ -330,8 +329,8 @@ def yeo_johnson_transform(monthly_residuals, coeffs, yearly_pred):
     if not isinstance(yearly_pred, xr.DataArray):
         raise TypeError(f"Expected a `xr.DataArray`, got {type(yearly_pred)}")
 
-    if not isinstance(coeffs, xr.Dataset):
-        raise TypeError(f"Expected a `xr.Dataset`, got {type(monthly_residuals)}")
+    if not isinstance(coeffs, xr.DataArray):
+        raise TypeError(f"Expected a `xr.DataArray`, got {type(monthly_residuals)}")
 
     lambdas = get_lambdas_from_covariates(coeffs, yearly_pred).rename({"time": "year"})
     lambdas_stacked = lambdas.stack(stack=["year", "month"])
@@ -356,9 +355,9 @@ def inverse_yeo_johnson_transform(monthly_residuals, coeffs, yearly_pred):
     ----------
     monthly_residuals : ``xr.DataArray`` of shape (n_years, n_gridcells)
         The data to be transformed back to the original scale.
-    coeffs : ``xr.Dataset``
+    coeffs : ``xr.DataArray``
         The parameters of the power transformation containing ``lambda_coeffs`` of shape
-        (months, n_gridcells, coeff) for each gridcell, calculated using
+        (months, coeff, n_gridcells) for each gridcell, calculated using
         :func:`lambda_function <mesmer.stats.lambda_function>`.
     yearly_pred : ``xr.DataArray`` of shape (n_years, n_gridcells)
         yearly values used as predictors for the lambdas.
@@ -390,8 +389,8 @@ def inverse_yeo_johnson_transform(monthly_residuals, coeffs, yearly_pred):
     if not isinstance(yearly_pred, xr.DataArray):
         raise TypeError(f"Expected a `xr.DataArray`, got {type(yearly_pred)}")
 
-    if not isinstance(coeffs, xr.Dataset):
-        raise TypeError(f"Expected a `xr.Dataset`, got {type(monthly_residuals)}")
+    if not isinstance(coeffs, xr.DataArray):
+        raise TypeError(f"Expected a `xr.DataArray`, got {type(monthly_residuals)}")
 
     lambdas = get_lambdas_from_covariates(coeffs, yearly_pred).rename({"time": "year"})
     lambdas_stacked = lambdas.stack(stack=["year", "month"])

diff --git a/tests/integration/test_calibrate_mesmer_m.py b/tests/integration/test_calibrate_mesmer_m.py
@@ -0,0 +1,175 @@
+import importlib
+
+import pytest
+import xarray as xr
+
+import mesmer
+
+
+def test_calibrate_mesmer_m(update_expected_files=False):
+    # define config values
+    THRESHOLD_LAND = 1 / 3
+
+    REFERENCE_PERIOD = slice("1850", "1900")
+
+    LOCALISATION_RADII = list(range(1250, 6251, 250)) + list(range(6500, 8501, 500))
+
+    esm = "IPSL-CM6A-LR"
+
+    # define paths and load data
+    TEST_DATA_PATH = importlib.resources.files("mesmer").parent / "tests" / "test-data"
+    TEST_PATH = TEST_DATA_PATH / "output" / "tas" / "mon"
+    cmip6_data_path = TEST_DATA_PATH / "calibrate-coarse-grid" / "cmip6-ng"
+
+    path_tas_ann = cmip6_data_path / "tas" / "ann" / "g025"
+    fN_hist_ann = path_tas_ann / f"tas_ann_{esm}_historical_r1i1p1f1_g025.nc"
+    fN_proj_ann = path_tas_ann / f"tas_ann_{esm}_ssp585_r1i1p1f1_g025.nc"
+
+    tas_y = xr.open_mfdataset(
+        [fN_hist_ann, fN_proj_ann],
+        combine="by_coords",
+        use_cftime=True,
+        combine_attrs="override",
+        data_vars="minimal",
+        compat="override",
+        coords="minimal",
+        drop_variables=["height", "file_qf"],
+    ).load()
+
+    path_tas_mon = cmip6_data_path / "tas" / "mon" / "g025"
+    fN_hist_mon = path_tas_mon / f"tas_mon_{esm}_historical_r1i1p1f1_g025.nc"
+    fN_proj_mon = path_tas_mon / f"tas_mon_{esm}_ssp585_r1i1p1f1_g025.nc"
+    tas_m = xr.open_mfdataset(
+        [fN_hist_mon, fN_proj_mon],
+        combine="by_coords",
+        use_cftime=True,
+        combine_attrs="override",
+        data_vars="minimal",
+        compat="override",
+        coords="minimal",
+        drop_variables=["height", "file_qf"],
+    ).load()
+
+    # data preprocessing
+    ref_y = tas_y.sel(time=REFERENCE_PERIOD).mean("time", keep_attrs=True)
+    ref_m = tas_m.sel(time=REFERENCE_PERIOD).mean("time", keep_attrs=True)
+
+    tas_y = tas_y - ref_y
+    tas_m = tas_m - ref_m
+
+    # create local gridded tas data
+    def mask_and_stack(ds, threshold_land):
+        ds = mesmer.mask.mask_ocean_fraction(ds, threshold_land)
+        ds = mesmer.mask.mask_antarctica(ds)
+        ds = mesmer.grid.stack_lat_lon(ds)
+        return ds
+
+    tas_stacked_y = mask_and_stack(tas_y, threshold_land=THRESHOLD_LAND)
+    tas_stacked_m = mask_and_stack(tas_m, threshold_land=THRESHOLD_LAND)
+
+    # we need to get the original time coordinate to be able to validate our results
+    m_time = tas_stacked_m.time
+
+    # fit harmonic model
+    harmonic_model_fit = mesmer.stats.fit_harmonic_model(
+        tas_stacked_y.tas, tas_stacked_m.tas
+    )
+
+    # train power transformer
+    resids_after_hm = tas_stacked_m - harmonic_model_fit.hm_predictions
+    pt_coefficients = mesmer.stats.fit_yeo_johnson_transform(
+        resids_after_hm.tas, tas_stacked_y.tas
+    )
+    transformed_hm_resids = mesmer.stats.yeo_johnson_transform(
+        resids_after_hm.tas, pt_coefficients.lambda_coeffs, tas_stacked_y.tas
+    )
+
+    # fit cyclo-stationary AR(1) process
+    AR1_fit = mesmer.stats.fit_auto_regression_monthly(
+        transformed_hm_resids.transformed, time_dim="time"
+    )
+
+    # work out covariance matrix
+    geodist = mesmer.geospatial.geodist_exact(tas_stacked_y.lon, tas_stacked_y.lat)
+
+    phi_gc_localizer = mesmer.stats.gaspari_cohn_correlation_matrices(
+        geodist, localisation_radii=LOCALISATION_RADII
+    )
+
+    weights = xr.ones_like(AR1_fit.residuals.isel(gridcell=0))
+    weights.name = "weights"
+
+    localized_ecov = mesmer.stats.find_localized_empirical_covariance_monthly(
+        AR1_fit.residuals, weights, phi_gc_localizer, "time", 30
+    )
+
+    # merge into one dataset
+    harmonic_model_fit = harmonic_model_fit.drop_vars("hm_predictions")
+    AR1_fit = AR1_fit.drop_vars("residuals")
+    localized_ecov = localized_ecov.drop_vars("covariance")
+    m_time = m_time.rename("monthly_time")
+    calibrated_params = xr.merge(
+        [harmonic_model_fit, pt_coefficients, AR1_fit, localized_ecov, m_time]
+    )
+
+    # save params
+    if update_expected_files:
+        calibrated_params.to_netcdf(TEST_PATH / "test-mesmer_m-params.nc")
+        pytest.skip("Updated test-mesmer_m-params.nc")
+
+    # testing
+    else:
+        # load expected values
+        expected_params = xr.open_dataset(
+            TEST_PATH / "test-mesmer_m-params.nc", use_cftime=True
+        )
+
+        # the following parameters should be exactly the same
+        exact_exp_params = xr.merge(
+            [
+                expected_params.hm_selected_order,
+                expected_params.localization_radius,
+                expected_params.monthly_time,
+            ]
+        )
+        exact_cal_params = xr.merge(
+            [
+                calibrated_params.hm_selected_order,
+                calibrated_params.localization_radius,
+                calibrated_params.monthly_time,
+            ]
+        )
+
+        xr.testing.assert_equal(exact_exp_params, exact_cal_params)
+
+        # compare the rest
+        # using numpy because it outputs the differences and how many values are off
+        import numpy as np
+
+        # the tols are set to the best we can do
+        # NOTE: it is always rather few values that are off
+        np.testing.assert_allclose(
+            expected_params.hm_coeffs,
+            calibrated_params.hm_coeffs,
+            atol=1e-5,
+            rtol=1 / 3,
+        )
+        # NOTE: would have to be atol is 1e12 here - not doing that
+        np.testing.assert_allclose(
+            expected_params.lambda_coeffs, calibrated_params.lambda_coeffs, rtol=0.6
+        )
+        np.testing.assert_allclose(
+            expected_params.ar1_slope, calibrated_params.ar1_slope, atol=1e-5, rtol=1e-4
+        )
+        np.testing.assert_allclose(
+            expected_params.ar1_intercept,
+            calibrated_params.ar1_intercept,
+            atol=1e-4,
+            rtol=1e-2 / 3,
+        )
+        np.testing.assert_allclose(
+            expected_params.localized_covariance,
+            calibrated_params.localized_covariance,
+            atol=1e-4,
+            rtol=1e-2,
+        )