From f8141713307d85bcc5cd20e30f421121ab52b026 Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Sun, 14 Apr 2024 13:02:07 +0200 Subject: [PATCH 01/44] start colocation_setup.py --- pyaerocom/colocation_setup.py | 515 ++++++++++++++++++++++++++++++++++ 1 file changed, 515 insertions(+) create mode 100644 pyaerocom/colocation_setup.py diff --git a/pyaerocom/colocation_setup.py b/pyaerocom/colocation_setup.py new file mode 100644 index 000000000..bcf0df56e --- /dev/null +++ b/pyaerocom/colocation_setup.py @@ -0,0 +1,515 @@ +import logging +import os +from pathlib import Path + +import numpy as np +from pydantic import BaseModel + +from pyaerocom import const +from pyaerocom._lowlevel_helpers import chk_make_subdir +from pyaerocom.config import ALL_REGION_NAME +from pyaerocom.helpers import start_stop +from pyaerocom.io.pyaro.pyaro_config import PyaroConfig + +logger = logging.getLogger(__name__) + + +class ColocationSetup(BaseModel): + """ + Setup class for high-level model / obs co-location. + + An instance of this setup class can be used to run a colocation analysis + between a model and an observation network and will create a number of + :class:`pya.ColocatedData` instances, which can be saved automatically + as NetCDF files. + + Apart from co-location, this class also handles reading of the input data + for co-location. Supported co-location options are: + + 1. gridded vs. ungridded data + For instance 3D model data (instance of :class:`GriddedData`) with lat, + lon and time dimension that is co-located with station based observations + which are represented in pyaerocom through :class:`UngriddedData` objects. + The co-location function used is + :func:`pyaerocom.colocation.colocated_gridded_ungridded`. For this type of + co-location, the output co-located data object will be 3-dimensional, + with dimensions `data_source` (index 0: obs, index 1: model), `time` and + `station_name`. + + 2. gridded vs. 
gridded data + For instance 3D model data that is co-located with 3D satellite data + (both instances of :class:`GriddedData`), both objects with lat, + lon and time dimensions. The co-location function used + is :func:`pyaerocom.colocation.colocated_gridded_gridded`. + For this type of co-location, the output co-located data object will be + 4-dimensional, with dimensions `data_source` (index 0: obs, index 1: + model), `time` and `latitude` and `longitude`. + + + + Attributes + ---------- + model_id : str + ID of model to be used. + + obs_config: PyaroConfig + In the case Pyaro is used, a config must be provided. In that case obs_id(see below) + is ignored and only the config is used. + obs_id : str + ID of observation network to be used. + obs_vars : list + Variables to be analysed (need to be available in input obs dataset). + Variables that are not available in the model data output will be + skipped. Alternatively, model variables to be used for a given obs + variable can also be specified via attributes :attr:`model_use_vars` + and :attr:`model_add_vars`. + ts_type : str + String specifying colocation output frequency. + start + Start time of colocation. Input can be integer denoting the year or + anything that can be converted into :class:`pandas.Timestamp` using + :func:`pyaerocom.helpers.to_pandas_timestamp`. If None, than the first + available date in the model data is used. + stop + stop time of colocation. int or anything that can be converted into + :class:`pandas.Timestamp` using + :func:`pyaerocom.helpers.to_pandas_timestamp` or None. If None and if + ``start`` is on resolution of year (e.g. ``start=2010``) then ``stop`` + will be automatically set to the end of that year. Else, it will be + set to the last available timestamp in the model data. + filter_name : str + name of filter to be applied. 
If None, no filter is used + (to be precise, if None, then + :attr:`pyaerocom.const.DEFAULT_REG_FILTER` is used which should + default to `ALL-wMOUNTAINS`, that is, no filtering). + basedir_coldata : str + Base directory for storing of colocated data files. + save_coldata : bool + if True, colocated data objects are saved as NetCDF file. + obs_name : str, optional + if provided, this string will be used in colocated data filename to + specify obsnetwork, else obs_id will be used. + obs_data_dir : str, optional + location of obs data. If None, attempt to infer obs location based on + obs ID. + obs_use_climatology : bool + BETA if True, pyaerocom default climatology is computed from observation + stations (so far only possible for unrgidded / gridded colocation). + obs_vert_type : str + AeroCom vertical code encoded in the model filenames (only AeroCom 3 + and later). Specifies which model file should be read in case there are + multiple options (e.g. surface level data can be read from a + *Surface*.nc file as well as from a *ModelLevel*.nc file). If input is + string (e.g. 'Surface'), then the corresponding vertical type code is + used for reading of all variables that are colocated (i.e. that are + specified in :attr:`obs_vars`). + obs_ts_type_read : str or dict, optional + may be specified to explicitly define the reading frequency of the + observation data (so far, this does only apply to gridded obsdata such + as satellites), either as str (same for all obs variables) or variable + specific as dict. For ungridded reading, the frequency may be specified + via :attr:`obs_id`, where applicable (e.g. AeronetSunV3Lev2.daily). + Not to be confused with :attr:`ts_type`, which specifies the + frequency used for colocation. Can be specified variable specific in + form of dictionary. + obs_filters : dict + filters applied to the observational dataset before co-location. 
+ In case of gridded / gridded, these are filters that can be passed to + :func:`pyaerocom.io.ReadGridded.read_var`, for instance, `flex_ts_type`, + or `constraints`. In case the obsdata is ungridded (gridded / ungridded + co-locations) these are filters that are handled through keyword + `filter_post` in :func:`pyaerocom.io.ReadUngridded.read`. These filters + are applied to the :class:`UngriddedData` objects after reading and + caching the data, so changing them, will not invalidate the latest + cache of the :class:`UngriddedData`. + read_opts_ungridded : dict, optional + dictionary that specifies reading constraints for ungridded reading, + and are passed as `**kwargs` to :func:`pyaerocom.io.ReadUngridded.read`. + Note that - other than for `obs_filters` these filters are applied + during the reading of the :class:`UngriddedData` objects and specifying + them will deactivate caching. + model_name : str, optional + if provided, this string will be used in colocated data filename to + specify model, else obs_id will be used. + model_data_dir : str, optional + Location of model data. If None, attempt to infer model location based + on model ID. + model_read_opts : dict, optional + options for model reading (passed as keyword args to + :func:`pyaerocom.io.ReadUngridded.read`). + model_use_vars : dict, optional + dictionary that specifies mapping of model variables. Keys are + observation variables, values are the corresponding model variables + (e.g. model_use_vars=dict(od550aer='od550csaer')). Example: your + observation has var *od550aer* but your model model uses a different + variable name for that variable, say *od550*. Then, you can specify + this via `model_use_vars = {'od550aer' : 'od550'}`. NOTE: in this case, + a model variable *od550aer* will be ignored, even if it exists + (cf :attr:`model_add_vars`). 
+ model_rename_vars : dict, optional + rename certain model variables **after** co-location, before storing + the associated :class:`ColocatedData` object on disk. Keys are model + variables, values are new names + (e.g. `model_rename_vars={'od550aer':'MyAOD'}`). + Note: this does not impact which variables are read from the model. + model_add_vars : dict, optional + additional model variables to be processed for one obs variable. E.g. + `model_add_vars={'od550aer': ['od550so4', 'od550gt1aer']}` would + co-locate both model SO4 AOD (od550so4) and model coarse mode AOD + (od550gt1aer) with total AOD (od550aer) from obs (in addition to + od550aer vs od550aer if applicable). + model_to_stp : bool + ALPHA (please do not use): convert model data values to STP conditions + after co-location. Note: this only works for very particular settings + at the moment and needs revision, as it relies on access to + meteorological data. + model_ts_type_read : str or dict, optional + may be specified to explicitly define the reading frequency of the + model data, either as str (same for all obs variables) or variable + specific as dict. Not to be confused with :attr:`ts_type`, which + specifies the output frequency of the co-located data. + model_read_aux : dict, optional + may be used to specify additional computation methods of variables from + models. Keys are variables to be computed, values are dictionaries with + keys `vars_required` (list of required variables for computation of var + and `fun` (method that takes list of read data objects and computes + and returns var). + model_use_climatology : bool + if True, attempt to use climatological model data field. Note: this + only works if model data is in AeroCom conventions (climatological + fields are indicated with 9999 as year in the filename) and if this is + active, only single year analysis are supported (i.e. provide int to + :attr:`start` to specify the year and leave :attr:`stop` empty). 
+ gridded_reader_id : dict + BETA: dictionary specifying which gridded reader is supposed to be used + for model (and gridded obs) reading. Note: this is a workaround + solution and will likely be removed in the future when the gridded + reading API is more harmonised + (see https://github.com/metno/pyaerocom/issues/174). + flex_ts_type : bool + Bboolean specifying whether reading frequency of gridded data is + allowed to be flexible. This includes all gridded data, whether it is + model or gridded observation (e.g. satellites). Defaults to True. + min_num_obs : dict or int, optional + time resampling constraints applied, defaults to None, in which case + no constraints are applied. For instance, say your input is in daily + resolution and you want output in monthly and you want to make sure to + have roughly 50% daily coverage for the monthly averages. Then you may + specify `min_num_obs=15` which will ensure that at least 15 daily + averages are available to compute a monthly average. However, you may + also define a hierarchical scheme that first goes from daily to + weekly and then from weekly to monthly, via a dict. E.g. + `min_num_obs=dict(monthly=dict(weekly=4), weekly=dict(daily=3))` would + ensure that each week has at least 3 daily values, as well as that each + month has at least 4 weekly values. + resample_how : str or dict, optional + string specifying how data should be aggregated when resampling in time. + Default is "mean". Can also be a nested dictionary, e.g. + `resample_how={'conco3': 'daily': {'hourly' : 'max'}}` would use the + maximum value to aggregate from hourly to daily for variable conco3, + rather than the mean. + obs_remove_outliers : bool + if True, outliers are removed from obs data before colocation, + else not. Default is False. + Custom outlier ranges for each variable can be specified via + :attr:`obs_outlier_ranges`, and for all other variables, the pyaerocom + default outlier ranges are used. 
The latter are specified in + `variables.ini` file via `minimum` and `maximum` attributes and can + also be accessed through :attr:`pyaerocom.variable.Variable.minimum` + and :attr:`pyaerocom.variable.Variable.maximum`, respectively. + model_remove_outliers : bool + if True, outliers are removed from model data (normally this should be + set to False, as the models are supposed to be assessed, including + outlier cases). Default is False. + Custom outlier ranges for each variable can be specified via + :attr:`model_outlier_ranges`, and for all other variables, the pyaerocom + default outlier ranges are used. The latter are specified in + `variables.ini` file via `minimum` and `maximum` attributes and can + also be accessed through :attr:`pyaerocom.variable.Variable.minimum` + and :attr:`pyaerocom.variable.Variable.maximum`, respectively. + obs_outlier_ranges : dict, optional + dictionary specifying outlier ranges for individual obs variables. + (e.g. dict(od550aer = [-0.05, 10], ang4487aer=[0,4])). Only relevant + if :attr:`obs_remove_outliers` is True. + model_outlier_ranges : dict, optional + like :attr:`obs_outlier_ranges` but for model variables. Only relevant + if :attr:`model_remove_outliers` is True. + zeros_to_nan : bool + If True, zero's in output co-located data object will be converted to + NaN. Default is False. + harmonise_units : bool + if True, units are attempted to be harmonised during co-location + (note: raises Exception if True and in case units cannot be harmonised). + regrid_res_deg : int, optional + resolution in degrees for regridding of model grid (done before + co-location). Default is None. + colocate_time : bool + if True and if obs and model sampling frequency (e.g. daily) are higher + than output colocation frequency (e.g. monthly), then the datasets are + first colocated in time (e.g. on a daily basis), before the monthly + averages are calculated. Default is False. 
+ reanalyse_existing : bool + if True, always redo co-location, even if there is already an existing + co-located NetCDF file (under the output location specified by + :attr:`basedir_coldata` ) for the given variable combination to be + co-located. If False and output already exists, then co-location is + skipped for the associated variable. Default is True. + raise_exceptions : bool + if True, Exceptions that may occur for individual variables to be + processed, are raised, else the analysis is skipped for such cases. + keep_data : bool + if True, then all colocated data objects computed when running + :func:`run` will be stored in :attr:`data`. Defaults to True. + add_meta : dict + additional metadata that is supposed to be added to each output + :class:`ColocatedData` object. + """ + + #: Dictionary specifying alternative vertical types that may be used to + #: read model data. E.g. consider the variable is ec550aer, + #: obs_vert_type='Surface' and obs_vert_type_alt=dict(Surface='ModelLevel'). + #: Now, if a model that is used for the analysis does not contain a data + #: file for ec550aer at the surface ('*ec550aer*Surface*.nc'), then, the + #: colocation routine will look for '*ec550aer*ModelLevel*.nc' and if this + #: exists, it will load it and extract the surface level. 
+ OBS_VERT_TYPES_ALT: dict[str, str] = {"Surface": "ModelLevel", "2D": "2D"} + + #: do not raise Exception if invalid item is attempted to be assigned + #: (Overwritten from base class) + CRASH_ON_INVALID: bool = False + + FORBIDDEN_KEYS: list[str] = [ + "var_outlier_ranges", # deprecated since v0.12.0 + "var_ref_outlier_ranges", # deprecated since v0.12.0 + "remove_outliers", # deprecated since v0.12.0 + ] + + ts_type: str = "monthly" + obs_vars: list[str] + + def __init__( + self, + model_id=None, + obs_config: PyaroConfig | None = None, + obs_id=None, + obs_vars=None, + ts_type=None, + start=None, + stop=None, + basedir_coldata=None, + save_coldata=False, + **kwargs, + ): + self.model_id = model_id + self._obs_id = None + self._obs_config = None + + self.obs_id = obs_id + self.obs_config = obs_config + + self.obs_vars = obs_vars + + self.ts_type = ts_type + self.start = start + self.stop = stop + + # crashes if input filter name is invalid + self.filter_name = f"{ALL_REGION_NAME}-wMOUNTAINS" + + if basedir_coldata is not None: + basedir_coldata = self._check_input_basedir_coldata(basedir_coldata) + else: + basedir_coldata = const.COLOCATEDDATADIR + self.basedir_coldata = basedir_coldata + self.save_coldata = save_coldata + + # END OF ASSIGNMENT OF MOST COMMON PARAMETERS - BELOW ARE FURTHER + # CONFIG ATTRIBUTES, THAT ARE OPTIONAL AND LESS FREQUENTLY USED + + # Options related to obs reading and processing + self.obs_name = None + self.obs_data_dir = None + + self.obs_use_climatology = False + + self._obs_cache_only = False # only relevant if obs is ungridded + self.obs_vert_type = None + self.obs_ts_type_read = None + self.obs_filters = {} + self._obs_is_vertical_profile = False + self.colocation_layer_limits = None + self.profile_layer_limits = None + + self.read_opts_ungridded = {} + + # Attributes related to model data + self.model_name = None + self.model_data_dir = None + + self.model_read_opts = {} + + self.model_use_vars = {} + self.model_rename_vars = {} 
+ self.model_add_vars = {} + self.model_to_stp = False + + self.model_ts_type_read = None + self.model_read_aux = {} + self.model_use_climatology = False + + self.gridded_reader_id = {"model": "ReadGridded", "obs": "ReadGridded"} + + self.flex_ts_type = True + + # Options related to time resampling + self.min_num_obs = None + self.resample_how = "mean" + + # Options related to outlier removal + self.obs_remove_outliers = False + self.model_remove_outliers = False + + # Custom outlier ranges for model and obs + self.obs_outlier_ranges = {} + self.model_outlier_ranges = {} + + self.zeros_to_nan = False + self.harmonise_units = False + self.regrid_res_deg = None + self.colocate_time = False + + self.reanalyse_existing = True + self.raise_exceptions = False + self.keep_data = True + + self.add_meta = {} + self.update(**kwargs) + + def _check_input_basedir_coldata(self, basedir_coldata): + """ + Make sure input basedir_coldata is str and exists + + Parameters + ---------- + basedir_coldata : str or Path + basic output directory for colocated data + + Raises + ------ + ValueError + If input is invalid. + + Returns + ------- + str + valid output directory + + """ + if isinstance(basedir_coldata, Path): + basedir_coldata = str(basedir_coldata) + if isinstance(basedir_coldata, str): + if not os.path.exists(basedir_coldata): + os.mkdir(basedir_coldata) + return basedir_coldata + raise ValueError(f"Invalid input for basedir_coldata: {basedir_coldata}") + + def _check_basedir_coldata(self): + """ + Make sure output directory for colocated data files exists + + Raises + ------ + FileNotFoundError + If :attr:`basedir_coldata` does not exist and cannot be created. 
+ + Returns + ------- + str + current value of :attr:`basedir_coldata` + + """ + basedir_coldata = self.basedir_coldata + if basedir_coldata is None: + basedir_coldata = const.COLOCATEDDATADIR + if not os.path.exists(basedir_coldata): + logger.info(f"Creating directory: {basedir_coldata}") + os.mkdir(basedir_coldata) + elif isinstance(basedir_coldata, Path): + basedir_coldata = str(basedir_coldata) + if isinstance(basedir_coldata, str) and not os.path.exists(basedir_coldata): + os.mkdir(basedir_coldata) + if not os.path.exists(basedir_coldata): + raise FileNotFoundError( + f"Output directory for colocated data files {basedir_coldata} does not exist" + ) + self.basedir_coldata = basedir_coldata + return basedir_coldata + + @property + def basedir_logfiles(self): + """Base directory for storing logfiles""" + p = chk_make_subdir(self.basedir_coldata, "logfiles") + return p + + @property + def obs_id(self) -> str: + return self._obs_id + + @obs_id.setter + def obs_id(self, val: str | None) -> None: + if self.obs_config is not None and val != self.obs_config.name: + logger.info( + f"Data ID in Pyaro config {self.obs_config.name} does not match obs_id {val}. Setting Pyaro config to None!" + ) + self.obs_config = None + + self._obs_id = val + + @property + def obs_config(self) -> PyaroConfig: + return self._obs_config + + @obs_config.setter + def obs_config(self, val: PyaroConfig | None) -> None: + if val is not None: + if isinstance(val, dict): + logger.info("Obs config was given as dict. Will try to convert to PyaroConfig") + val = PyaroConfig(**val) + if self.obs_id is not None and val.name != self.obs_id: + logger.info( + f"Data ID in Pyaro config {val.name} does not match obs_id {self.obs_id}. Setting Obs ID to match Pyaro Config!" 
+ ) + self.obs_id = val.name + if self.obs_id is None: + self.obs_id = val.name + self._obs_config = val + + def add_glob_meta(self, **kwargs): + """ + Add global metadata to :attr:`add_meta` + + Parameters + ---------- + kwargs + metadata to be added + + Returns + ------- + None + + """ + self.add_meta.update(**kwargs) + + def __setitem__(self, key, val): + if key == "basedir_coldata": + val = self._check_input_basedir_coldata(val) + super().__setitem__(key, val) + + def _period_from_start_stop(self) -> str: + start, stop = start_stop(self.start, self.stop, stop_sub_sec=False) + y0, y1 = start.year, stop.year + assert y0 <= y1 + if y0 == y1: + return str(y0) + else: + return f"{y0}-{y1}" From b7c3e6a2cc67df495818847ae99d7f1bd28058ad Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Sun, 14 Apr 2024 13:31:55 +0200 Subject: [PATCH 02/44] first run through old init attrs --- pyaerocom/colocation_setup.py | 192 ++++++++++++++++------------------ 1 file changed, 93 insertions(+), 99 deletions(-) diff --git a/pyaerocom/colocation_setup.py b/pyaerocom/colocation_setup.py index bcf0df56e..9594675b0 100644 --- a/pyaerocom/colocation_setup.py +++ b/pyaerocom/colocation_setup.py @@ -1,9 +1,10 @@ import logging import os from pathlib import Path +from typing import Literal -import numpy as np -from pydantic import BaseModel +import pandas as pd +from pydantic import BaseModel, ConfigDict, Field, field_validator from pyaerocom import const from pyaerocom._lowlevel_helpers import chk_make_subdir @@ -265,6 +266,11 @@ class ColocationSetup(BaseModel): :class:`ColocatedData` object. """ + ########################## + # Pydantic ConfigDict + ########################## + model_config = ConfigDict(arbitrary_types_allowed=True, allow="extra") + #: Dictionary specifying alternative vertical types that may be used to #: read model data. E.g. consider the variable is ec550aer, #: obs_vert_type='Surface' and obs_vert_type_alt=dict(Surface='ModelLevel'). 
@@ -287,103 +293,91 @@ class ColocationSetup(BaseModel): ts_type: str = "monthly" obs_vars: list[str] - def __init__( - self, - model_id=None, - obs_config: PyaroConfig | None = None, - obs_id=None, - obs_vars=None, - ts_type=None, - start=None, - stop=None, - basedir_coldata=None, - save_coldata=False, - **kwargs, - ): - self.model_id = model_id - self._obs_id = None - self._obs_config = None - - self.obs_id = obs_id - self.obs_config = obs_config - - self.obs_vars = obs_vars - - self.ts_type = ts_type - self.start = start - self.stop = stop - - # crashes if input filter name is invalid - self.filter_name = f"{ALL_REGION_NAME}-wMOUNTAINS" - - if basedir_coldata is not None: - basedir_coldata = self._check_input_basedir_coldata(basedir_coldata) - else: - basedir_coldata = const.COLOCATEDDATADIR - self.basedir_coldata = basedir_coldata - self.save_coldata = save_coldata - - # END OF ASSIGNMENT OF MOST COMMON PARAMETERS - BELOW ARE FURTHER - # CONFIG ATTRIBUTES, THAT ARE OPTIONAL AND LESS FREQUENTLY USED - - # Options related to obs reading and processing - self.obs_name = None - self.obs_data_dir = None - - self.obs_use_climatology = False - - self._obs_cache_only = False # only relevant if obs is ungridded - self.obs_vert_type = None - self.obs_ts_type_read = None - self.obs_filters = {} - self._obs_is_vertical_profile = False - self.colocation_layer_limits = None - self.profile_layer_limits = None - - self.read_opts_ungridded = {} - - # Attributes related to model data - self.model_name = None - self.model_data_dir = None - - self.model_read_opts = {} - - self.model_use_vars = {} - self.model_rename_vars = {} - self.model_add_vars = {} - self.model_to_stp = False - - self.model_ts_type_read = None - self.model_read_aux = {} - self.model_use_climatology = False - - self.gridded_reader_id = {"model": "ReadGridded", "obs": "ReadGridded"} - - self.flex_ts_type = True - - # Options related to time resampling - self.min_num_obs = None - self.resample_how = "mean" - - # 
Options related to outlier removal - self.obs_remove_outliers = False - self.model_remove_outliers = False - - # Custom outlier ranges for model and obs - self.obs_outlier_ranges = {} - self.model_outlier_ranges = {} - - self.zeros_to_nan = False - self.harmonise_units = False - self.regrid_res_deg = None - self.colocate_time = False - - self.reanalyse_existing = True - self.raise_exceptions = False - self.keep_data = True - - self.add_meta = {} - self.update(**kwargs) + model_id: str + # _obs_id : str | None = None + # _obs_config: PyaroConfig | None = None + obs_id: str + obs_config: PyaroConfig | None + ts_type: str + start: pd.Timestamp + stop: pd.Timestamp + + # crashes if input filter name is invalid + filter_name: str = f"{ALL_REGION_NAME}-wMOUNTAINS" + + basedir_coldata: Path | str = Field(default=const.COLOCATEDDATADIR, validate_default=True) + + @field_validator("basedir_coldata") + @classmethod + def validate_basedirs(cls, v): + if not os.path.exists(v): + tmp = Path(v) if isinstance(v, str) else v + tmp.mkdir(parents=True, exist_ok=True) + return v + + save_coldata: bool = False + + # END OF ASSIGNMENT OF MOST COMMON PARAMETERS - BELOW ARE FURTHER + # CONFIG ATTRIBUTES, THAT ARE OPTIONAL AND LESS FREQUENTLY USED + + # Options related to obs reading and processing + obs_name: str | None = None + obs_data_dir: str | None = None + + obs_use_climatology: bool = False + + _obs_cache_only: bool = False # only relevant if obs is ungridded + obs_vert_type: str | None = None + obs_ts_type_read: str | dict | None = None + obs_filters: dict = {} + _obs_is_vertical_profile: bool = False + colocation_layer_limits: dict[str:float] | None = None + profile_layer_limits: dict | None = None + read_opts_ungridded: dict | None = {} + + # Attributes related to model data + model_name: str | None = None + model_data_dir: Path | str = None + + model_read_opts: dict | None = {} + + model_use_vars: dict[str, str] | None = {} + model_rename_vars: dict[str, str] | None = {} + 
model_add_vars: dict[str, list[str]] | None = {} + model_to_stp: bool = False + + model_ts_type_read = None + # LB: need to check this declaration + model_read_aux: dict[str, dict[Literal["vars_required", "fun"], list[str]]] | None = {} + model_use_climatology: bool = False + + # LB: check this as well + gridded_reader_id: dict[str, str] = {"model": "ReadGridded", "obs": "ReadGridded"} + + flex_ts_type: bool = True + + # Options related to time resampling + min_num_obs: int | None = None + resample_how: str | dict | None = "mean" + + # Options related to outlier removal + obs_remove_outliers: bool = False + model_remove_outliers: bool = False + + # Custom outlier ranges for model and obs + obs_outlier_ranges = {} + model_outlier_ranges = {} + zeros_to_nan: bool = False + harmonise_units: bool = False + regrid_res_deg: float | None = None + colocate_time: bool = False + reanalyse_existing: bool = True + raise_exceptions: bool = False + keep_data: bool = True + add_meta: dict | None = {} + + # TODO: implelent field validators + # self.update(**kwargs) def _check_input_basedir_coldata(self, basedir_coldata): """ From f214498633f9a88ec2fe88ddf2ca64afaab35d99 Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Sun, 14 Apr 2024 16:41:58 +0200 Subject: [PATCH 03/44] TODO --- pyaerocom/colocation_setup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyaerocom/colocation_setup.py b/pyaerocom/colocation_setup.py index 9594675b0..b95c48d5e 100644 --- a/pyaerocom/colocation_setup.py +++ b/pyaerocom/colocation_setup.py @@ -379,6 +379,8 @@ def validate_basedirs(cls, v): # TODO: implelent field validators # self.update(**kwargs) + # TODO: validator for extra arguments. what are they? 
+ def _check_input_basedir_coldata(self, basedir_coldata): """ Make sure input basedir_coldata is str and exists From 84829cf57b8321fe2a79092c6dc3a73a00927550 Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Sun, 14 Apr 2024 18:08:58 +0200 Subject: [PATCH 04/44] clean up and validators --- pyaerocom/colocation_setup.py | 156 ++++++++++------------------------ 1 file changed, 47 insertions(+), 109 deletions(-) diff --git a/pyaerocom/colocation_setup.py b/pyaerocom/colocation_setup.py index b95c48d5e..bf0703da9 100644 --- a/pyaerocom/colocation_setup.py +++ b/pyaerocom/colocation_setup.py @@ -1,5 +1,6 @@ import logging import os +from functools import cached_property from pathlib import Path from typing import Literal @@ -7,7 +8,6 @@ from pydantic import BaseModel, ConfigDict, Field, field_validator from pyaerocom import const -from pyaerocom._lowlevel_helpers import chk_make_subdir from pyaerocom.config import ALL_REGION_NAME from pyaerocom.helpers import start_stop from pyaerocom.io.pyaro.pyaro_config import PyaroConfig @@ -271,6 +271,24 @@ class ColocationSetup(BaseModel): ########################## model_config = ConfigDict(arbitrary_types_allowed=True, allow="extra") + ######################### + # Required Input + ######################### + + # LB: remains to be seen if this can actually be required without chaning the code elsewhere + model_id: str + obs_id: str + obs_vars: list[str] + ts_type: str + start: pd.Timestamp | int + stop: pd.Timestamp | int + + ############################### + # Attributes with defaults + ############################### + + obs_config: PyaroConfig | None = None + #: Dictionary specifying alternative vertical types that may be used to #: read model data. E.g. consider the variable is ec550aer, #: obs_vert_type='Surface' and obs_vert_type_alt=dict(Surface='ModelLevel'). 
@@ -291,21 +309,11 @@ class ColocationSetup(BaseModel): ] ts_type: str = "monthly" - obs_vars: list[str] - - model_id: str - # _obs_id : str | None = None - # _obs_config: PyaroConfig | None = None - obs_id: str - obs_config: PyaroConfig | None - ts_type: str - start: pd.Timestamp - stop: pd.Timestamp # crashes if input filter name is invalid filter_name: str = f"{ALL_REGION_NAME}-wMOUNTAINS" - basedir_coldata: Path | str = Field(default=const.COLOCATEDDATADIR, validate_default=True) + basedir_coldata: str = Field(default=const.COLOCATEDDATADIR, validate_default=True) @field_validator("basedir_coldata") @classmethod @@ -381,104 +389,39 @@ def validate_basedirs(cls, v): # TODO: validator for extra arguments. what are they? - def _check_input_basedir_coldata(self, basedir_coldata): - """ - Make sure input basedir_coldata is str and exists - - Parameters - ---------- - basedir_coldata : str or Path - basic output directory for colocated data - - Raises - ------ - ValueError - If input is invalid. - - Returns - ------- - str - valid output directory - - """ - if isinstance(basedir_coldata, Path): - basedir_coldata = str(basedir_coldata) - if isinstance(basedir_coldata, str): - if not os.path.exists(basedir_coldata): - os.mkdir(basedir_coldata) - return basedir_coldata - raise ValueError(f"Invalid input for basedir_coldata: {basedir_coldata}") - - def _check_basedir_coldata(self): - """ - Make sure output directory for colocated data files exists - - Raises - ------ - FileNotFoundError - If :attr:`basedir_coldata` does not exist and cannot be created. 
- - Returns - ------- - str - current value of :attr:`basedir_coldata` - - """ - basedir_coldata = self.basedir_coldata - if basedir_coldata is None: - basedir_coldata = const.COLOCATEDDATADIR - if not os.path.exists(basedir_coldata): - logger.info(f"Creating directory: {basedir_coldata}") - os.mkdir(basedir_coldata) - elif isinstance(basedir_coldata, Path): - basedir_coldata = str(basedir_coldata) - if isinstance(basedir_coldata, str) and not os.path.exists(basedir_coldata): - os.mkdir(basedir_coldata) - if not os.path.exists(basedir_coldata): - raise FileNotFoundError( - f"Output directory for colocated data files {basedir_coldata} does not exist" - ) - self.basedir_coldata = basedir_coldata - return basedir_coldata - - @property + @cached_property def basedir_logfiles(self): - """Base directory for storing logfiles""" - p = chk_make_subdir(self.basedir_coldata, "logfiles") - return p - - @property - def obs_id(self) -> str: - return self._obs_id - - @obs_id.setter - def obs_id(self, val: str | None) -> None: - if self.obs_config is not None and val != self.obs_config.name: + p = Path(self.basedir_coldata) / "logfiles" + if not p.exists(): + p.mkdir(parents=True, exist_ok=True) + return str(p) # LB: not sure why pyaerocom insists these be strings as this point + + @field_validator("obs_id") + def validate_obs_id(cls, v: str): + if cls.obs_config is not None and v != cls.obs.config.name: + logger + + # LB: Think we need a validator on the PyaroConfig, not the obs_id. + # Combining the validation logic from those two things here. needs testing. + @field_validator("obs_config") + def validate_obs_config(cls, v: PyaroConfig): + if cls.obs_config is not None and cls.obs.config.name != cls.obs_id: logger.info( - f"Data ID in Pyaro config {self.obs_config.name} does not match obs_id {val}. Setting Pyaro config to None!" + f"Data ID in Pyaro config {cls.obs_config.name} does not match obs_id {cls.obs_id}. Setting Pyaro config to None!" 
) - self.obs_config = None - - self._obs_id = val - - @property - def obs_config(self) -> PyaroConfig: - return self._obs_config - - @obs_config.setter - def obs_config(self, val: PyaroConfig | None) -> None: - if val is not None: - if isinstance(val, dict): + cls.obs_config = None + if v is not None: + if isinstance(v, dict): logger.info("Obs config was given as dict. Will try to convert to PyaroConfig") - val = PyaroConfig(**val) - if self.obs_id is not None and val.name != self.obs_id: + v = PyaroConfig(**v) + if v.name != cls.obs_id: logger.info( - f"Data ID in Pyaro config {val.name} does not match obs_id {self.obs_id}. Setting Obs ID to match Pyaro Config!" + f"Data ID in Pyaro config {v.name} does not match obs_id {cls.obs_id}. Setting Obs ID to match Pyaro Config!" ) - self.obs_id = val.name - if self.obs_id is None: - self.obs_id = val.name - self._obs_config = val + cls.obs_id = v.name + if cls.obs_id is None: + cls.obs_id = v.name + return v def add_glob_meta(self, **kwargs): """ @@ -496,11 +439,6 @@ def add_glob_meta(self, **kwargs): """ self.add_meta.update(**kwargs) - def __setitem__(self, key, val): - if key == "basedir_coldata": - val = self._check_input_basedir_coldata(val) - super().__setitem__(key, val) - def _period_from_start_stop(self) -> str: start, stop = start_stop(self.start, self.stop, stop_sub_sec=False) y0, y1 = start.year, stop.year From 5d3e3622e2a03e830caa4b6a9498da9e136dff58 Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Sun, 14 Apr 2024 18:24:06 +0200 Subject: [PATCH 05/44] imports are completely broken. 
will revisit --- pyaerocom/__init__.py | 3 +- pyaerocom/colocation_auto.py | 998 +++++++++++++++++----------------- pyaerocom/colocation_setup.py | 6 +- 3 files changed, 504 insertions(+), 503 deletions(-) diff --git a/pyaerocom/__init__.py b/pyaerocom/__init__.py index 4397e400b..bae25e093 100644 --- a/pyaerocom/__init__.py +++ b/pyaerocom/__init__.py @@ -48,7 +48,8 @@ from .ungriddeddata import UngriddedData from .filter import Filter from .colocateddata import ColocatedData -from .colocation_auto import ColocationSetup, Colocator +from .colocation_setup import ColocationSetup +from .colocation_auto import Colocator from .tstype import TsType from .time_resampler import TimeResampler from .io.helpers import search_data_dir_aerocom diff --git a/pyaerocom/colocation_auto.py b/pyaerocom/colocation_auto.py index d03f967c3..bc96d8312 100644 --- a/pyaerocom/colocation_auto.py +++ b/pyaerocom/colocation_auto.py @@ -22,6 +22,7 @@ correct_model_stp_coldata, ) from pyaerocom.colocation_3d import ColocatedDataLists, colocate_vertical_profile_gridded +from pyaerocom.colocation_setup import ColocationSetup from pyaerocom.config import ALL_REGION_NAME from pyaerocom.exceptions import ColocationError, ColocationSetupError, DataCoverageError from pyaerocom.helpers import ( @@ -38,505 +39,504 @@ logger = logging.getLogger(__name__) -class ColocationSetup(BrowseDict): - """ - Setup class for high-level model / obs co-location. - - An instance of this setup class can be used to run a colocation analysis - between a model and an observation network and will create a number of - :class:`pya.ColocatedData` instances, which can be saved automatically - as NetCDF files. - - Apart from co-location, this class also handles reading of the input data - for co-location. Supported co-location options are: - - 1. gridded vs. 
ungridded data - For instance 3D model data (instance of :class:`GriddedData`) with lat, - lon and time dimension that is co-located with station based observations - which are represented in pyaerocom through :class:`UngriddedData` objects. - The co-location function used is - :func:`pyaerocom.colocation.colocated_gridded_ungridded`. For this type of - co-location, the output co-located data object will be 3-dimensional, - with dimensions `data_source` (index 0: obs, index 1: model), `time` and - `station_name`. - - 2. gridded vs. gridded data - For instance 3D model data that is co-located with 3D satellite data - (both instances of :class:`GriddedData`), both objects with lat, - lon and time dimensions. The co-location function used - is :func:`pyaerocom.colocation.colocated_gridded_gridded`. - For this type of co-location, the output co-located data object will be - 4-dimensional, with dimensions `data_source` (index 0: obs, index 1: - model), `time` and `latitude` and `longitude`. - - - - Attributes - ---------- - model_id : str - ID of model to be used. - - obs_config: PyaroConfig - In the case Pyaro is used, a config must be provided. In that case obs_id(see below) - is ignored and only the config is used. - obs_id : str - ID of observation network to be used. - obs_vars : list - Variables to be analysed (need to be available in input obs dataset). - Variables that are not available in the model data output will be - skipped. Alternatively, model variables to be used for a given obs - variable can also be specified via attributes :attr:`model_use_vars` - and :attr:`model_add_vars`. - ts_type : str - String specifying colocation output frequency. - start - Start time of colocation. Input can be integer denoting the year or - anything that can be converted into :class:`pandas.Timestamp` using - :func:`pyaerocom.helpers.to_pandas_timestamp`. If None, than the first - available date in the model data is used. - stop - stop time of colocation. 
int or anything that can be converted into - :class:`pandas.Timestamp` using - :func:`pyaerocom.helpers.to_pandas_timestamp` or None. If None and if - ``start`` is on resolution of year (e.g. ``start=2010``) then ``stop`` - will be automatically set to the end of that year. Else, it will be - set to the last available timestamp in the model data. - filter_name : str - name of filter to be applied. If None, no filter is used - (to be precise, if None, then - :attr:`pyaerocom.const.DEFAULT_REG_FILTER` is used which should - default to `ALL-wMOUNTAINS`, that is, no filtering). - basedir_coldata : str - Base directory for storing of colocated data files. - save_coldata : bool - if True, colocated data objects are saved as NetCDF file. - obs_name : str, optional - if provided, this string will be used in colocated data filename to - specify obsnetwork, else obs_id will be used. - obs_data_dir : str, optional - location of obs data. If None, attempt to infer obs location based on - obs ID. - obs_use_climatology : bool - BETA if True, pyaerocom default climatology is computed from observation - stations (so far only possible for unrgidded / gridded colocation). - obs_vert_type : str - AeroCom vertical code encoded in the model filenames (only AeroCom 3 - and later). Specifies which model file should be read in case there are - multiple options (e.g. surface level data can be read from a - *Surface*.nc file as well as from a *ModelLevel*.nc file). If input is - string (e.g. 'Surface'), then the corresponding vertical type code is - used for reading of all variables that are colocated (i.e. that are - specified in :attr:`obs_vars`). - obs_ts_type_read : str or dict, optional - may be specified to explicitly define the reading frequency of the - observation data (so far, this does only apply to gridded obsdata such - as satellites), either as str (same for all obs variables) or variable - specific as dict. 
For ungridded reading, the frequency may be specified - via :attr:`obs_id`, where applicable (e.g. AeronetSunV3Lev2.daily). - Not to be confused with :attr:`ts_type`, which specifies the - frequency used for colocation. Can be specified variable specific in - form of dictionary. - obs_filters : dict - filters applied to the observational dataset before co-location. - In case of gridded / gridded, these are filters that can be passed to - :func:`pyaerocom.io.ReadGridded.read_var`, for instance, `flex_ts_type`, - or `constraints`. In case the obsdata is ungridded (gridded / ungridded - co-locations) these are filters that are handled through keyword - `filter_post` in :func:`pyaerocom.io.ReadUngridded.read`. These filters - are applied to the :class:`UngriddedData` objects after reading and - caching the data, so changing them, will not invalidate the latest - cache of the :class:`UngriddedData`. - read_opts_ungridded : dict, optional - dictionary that specifies reading constraints for ungridded reading, - and are passed as `**kwargs` to :func:`pyaerocom.io.ReadUngridded.read`. - Note that - other than for `obs_filters` these filters are applied - during the reading of the :class:`UngriddedData` objects and specifying - them will deactivate caching. - model_name : str, optional - if provided, this string will be used in colocated data filename to - specify model, else obs_id will be used. - model_data_dir : str, optional - Location of model data. If None, attempt to infer model location based - on model ID. - model_read_opts : dict, optional - options for model reading (passed as keyword args to - :func:`pyaerocom.io.ReadUngridded.read`). - model_use_vars : dict, optional - dictionary that specifies mapping of model variables. Keys are - observation variables, values are the corresponding model variables - (e.g. model_use_vars=dict(od550aer='od550csaer')). 
Example: your - observation has var *od550aer* but your model model uses a different - variable name for that variable, say *od550*. Then, you can specify - this via `model_use_vars = {'od550aer' : 'od550'}`. NOTE: in this case, - a model variable *od550aer* will be ignored, even if it exists - (cf :attr:`model_add_vars`). - model_rename_vars : dict, optional - rename certain model variables **after** co-location, before storing - the associated :class:`ColocatedData` object on disk. Keys are model - variables, values are new names - (e.g. `model_rename_vars={'od550aer':'MyAOD'}`). - Note: this does not impact which variables are read from the model. - model_add_vars : dict, optional - additional model variables to be processed for one obs variable. E.g. - `model_add_vars={'od550aer': ['od550so4', 'od550gt1aer']}` would - co-locate both model SO4 AOD (od550so4) and model coarse mode AOD - (od550gt1aer) with total AOD (od550aer) from obs (in addition to - od550aer vs od550aer if applicable). - model_to_stp : bool - ALPHA (please do not use): convert model data values to STP conditions - after co-location. Note: this only works for very particular settings - at the moment and needs revision, as it relies on access to - meteorological data. - model_ts_type_read : str or dict, optional - may be specified to explicitly define the reading frequency of the - model data, either as str (same for all obs variables) or variable - specific as dict. Not to be confused with :attr:`ts_type`, which - specifies the output frequency of the co-located data. - model_read_aux : dict, optional - may be used to specify additional computation methods of variables from - models. Keys are variables to be computed, values are dictionaries with - keys `vars_required` (list of required variables for computation of var - and `fun` (method that takes list of read data objects and computes - and returns var). 
- model_use_climatology : bool - if True, attempt to use climatological model data field. Note: this - only works if model data is in AeroCom conventions (climatological - fields are indicated with 9999 as year in the filename) and if this is - active, only single year analysis are supported (i.e. provide int to - :attr:`start` to specify the year and leave :attr:`stop` empty). - gridded_reader_id : dict - BETA: dictionary specifying which gridded reader is supposed to be used - for model (and gridded obs) reading. Note: this is a workaround - solution and will likely be removed in the future when the gridded - reading API is more harmonised - (see https://github.com/metno/pyaerocom/issues/174). - flex_ts_type : bool - Bboolean specifying whether reading frequency of gridded data is - allowed to be flexible. This includes all gridded data, whether it is - model or gridded observation (e.g. satellites). Defaults to True. - min_num_obs : dict or int, optional - time resampling constraints applied, defaults to None, in which case - no constraints are applied. For instance, say your input is in daily - resolution and you want output in monthly and you want to make sure to - have roughly 50% daily coverage for the monthly averages. Then you may - specify `min_num_obs=15` which will ensure that at least 15 daily - averages are available to compute a monthly average. However, you may - also define a hierarchical scheme that first goes from daily to - weekly and then from weekly to monthly, via a dict. E.g. - `min_num_obs=dict(monthly=dict(weekly=4), weekly=dict(daily=3))` would - ensure that each week has at least 3 daily values, as well as that each - month has at least 4 weekly values. - resample_how : str or dict, optional - string specifying how data should be aggregated when resampling in time. - Default is "mean". Can also be a nested dictionary, e.g. 
- `resample_how={'conco3': 'daily': {'hourly' : 'max'}}` would use the - maximum value to aggregate from hourly to daily for variable conco3, - rather than the mean. - obs_remove_outliers : bool - if True, outliers are removed from obs data before colocation, - else not. Default is False. - Custom outlier ranges for each variable can be specified via - :attr:`obs_outlier_ranges`, and for all other variables, the pyaerocom - default outlier ranges are used. The latter are specified in - `variables.ini` file via `minimum` and `maximum` attributes and can - also be accessed through :attr:`pyaerocom.variable.Variable.minimum` - and :attr:`pyaerocom.variable.Variable.maximum`, respectively. - model_remove_outliers : bool - if True, outliers are removed from model data (normally this should be - set to False, as the models are supposed to be assessed, including - outlier cases). Default is False. - Custom outlier ranges for each variable can be specified via - :attr:`model_outlier_ranges`, and for all other variables, the pyaerocom - default outlier ranges are used. The latter are specified in - `variables.ini` file via `minimum` and `maximum` attributes and can - also be accessed through :attr:`pyaerocom.variable.Variable.minimum` - and :attr:`pyaerocom.variable.Variable.maximum`, respectively. - obs_outlier_ranges : dict, optional - dictionary specifying outlier ranges for individual obs variables. - (e.g. dict(od550aer = [-0.05, 10], ang4487aer=[0,4])). Only relevant - if :attr:`obs_remove_outliers` is True. - model_outlier_ranges : dict, optional - like :attr:`obs_outlier_ranges` but for model variables. Only relevant - if :attr:`model_remove_outliers` is True. - zeros_to_nan : bool - If True, zero's in output co-located data object will be converted to - NaN. Default is False. - harmonise_units : bool - if True, units are attempted to be harmonised during co-location - (note: raises Exception if True and in case units cannot be harmonised). 
- regrid_res_deg : int, optional - resolution in degrees for regridding of model grid (done before - co-location). Default is None. - colocate_time : bool - if True and if obs and model sampling frequency (e.g. daily) are higher - than output colocation frequency (e.g. monthly), then the datasets are - first colocated in time (e.g. on a daily basis), before the monthly - averages are calculated. Default is False. - reanalyse_existing : bool - if True, always redo co-location, even if there is already an existing - co-located NetCDF file (under the output location specified by - :attr:`basedir_coldata` ) for the given variable combination to be - co-located. If False and output already exists, then co-location is - skipped for the associated variable. Default is True. - raise_exceptions : bool - if True, Exceptions that may occur for individual variables to be - processed, are raised, else the analysis is skipped for such cases. - keep_data : bool - if True, then all colocated data objects computed when running - :func:`run` will be stored in :attr:`data`. Defaults to True. - add_meta : dict - additional metadata that is supposed to be added to each output - :class:`ColocatedData` object. - """ - - #: Dictionary specifying alternative vertical types that may be used to - #: read model data. E.g. consider the variable is ec550aer, - #: obs_vert_type='Surface' and obs_vert_type_alt=dict(Surface='ModelLevel'). - #: Now, if a model that is used for the analysis does not contain a data - #: file for ec550aer at the surface ('*ec550aer*Surface*.nc'), then, the - #: colocation routine will look for '*ec550aer*ModelLevel*.nc' and if this - #: exists, it will load it and extract the surface level. 
- OBS_VERT_TYPES_ALT = {"Surface": "ModelLevel", "2D": "2D"} - - #: do not raise Exception if invalid item is attempted to be assigned - #: (Overwritten from base class) - CRASH_ON_INVALID = False - - FORBIDDEN_KEYS = [ - "var_outlier_ranges", # deprecated since v0.12.0 - "var_ref_outlier_ranges", # deprecated since v0.12.0 - "remove_outliers", # deprecated since v0.12.0 - ] - - ts_type = StrWithDefault("monthly") - obs_vars = ListOfStrings() - - def __init__( - self, - model_id=None, - obs_config: Optional[PyaroConfig] = None, - obs_id=None, - obs_vars=None, - ts_type=None, - start=None, - stop=None, - basedir_coldata=None, - save_coldata=False, - **kwargs, - ): - self.model_id = model_id - self._obs_id = None - self._obs_config = None - - self.obs_id = obs_id - self.obs_config = obs_config - - self.obs_vars = obs_vars - - self.ts_type = ts_type - self.start = start - self.stop = stop - - # crashes if input filter name is invalid - self.filter_name = f"{ALL_REGION_NAME}-wMOUNTAINS" - - if basedir_coldata is not None: - basedir_coldata = self._check_input_basedir_coldata(basedir_coldata) - else: - basedir_coldata = const.COLOCATEDDATADIR - self.basedir_coldata = basedir_coldata - self.save_coldata = save_coldata - - # END OF ASSIGNMENT OF MOST COMMON PARAMETERS - BELOW ARE FURTHER - # CONFIG ATTRIBUTES, THAT ARE OPTIONAL AND LESS FREQUENTLY USED - - # Options related to obs reading and processing - self.obs_name = None - self.obs_data_dir = None - - self.obs_use_climatology = False - - self._obs_cache_only = False # only relevant if obs is ungridded - self.obs_vert_type = None - self.obs_ts_type_read = None - self.obs_filters = {} - self._obs_is_vertical_profile = False - self.colocation_layer_limits = None - self.profile_layer_limits = None - - self.read_opts_ungridded = {} - - # Attributes related to model data - self.model_name = None - self.model_data_dir = None - - self.model_read_opts = {} - - self.model_use_vars = {} - self.model_rename_vars = {} - 
self.model_add_vars = {} - self.model_to_stp = False - - self.model_ts_type_read = None - self.model_read_aux = {} - self.model_use_climatology = False - - self.gridded_reader_id = {"model": "ReadGridded", "obs": "ReadGridded"} - - self.flex_ts_type = True - - # Options related to time resampling - self.min_num_obs = None - self.resample_how = "mean" - - # Options related to outlier removal - self.obs_remove_outliers = False - self.model_remove_outliers = False - - # Custom outlier ranges for model and obs - self.obs_outlier_ranges = {} - self.model_outlier_ranges = {} - - self.zeros_to_nan = False - self.harmonise_units = False - self.regrid_res_deg = None - self.colocate_time = False - - self.reanalyse_existing = True - self.raise_exceptions = False - self.keep_data = True - - self.add_meta = {} - self.update(**kwargs) - - def _check_input_basedir_coldata(self, basedir_coldata): - """ - Make sure input basedir_coldata is str and exists - - Parameters - ---------- - basedir_coldata : str or Path - basic output directory for colocated data - - Raises - ------ - ValueError - If input is invalid. - - Returns - ------- - str - valid output directory - - """ - if isinstance(basedir_coldata, Path): - basedir_coldata = str(basedir_coldata) - if isinstance(basedir_coldata, str): - if not os.path.exists(basedir_coldata): - os.mkdir(basedir_coldata) - return basedir_coldata - raise ValueError(f"Invalid input for basedir_coldata: {basedir_coldata}") - - def _check_basedir_coldata(self): - """ - Make sure output directory for colocated data files exists - - Raises - ------ - FileNotFoundError - If :attr:`basedir_coldata` does not exist and cannot be created. 
- - Returns - ------- - str - current value of :attr:`basedir_coldata` - - """ - basedir_coldata = self.basedir_coldata - if basedir_coldata is None: - basedir_coldata = const.COLOCATEDDATADIR - if not os.path.exists(basedir_coldata): - logger.info(f"Creating directory: {basedir_coldata}") - os.mkdir(basedir_coldata) - elif isinstance(basedir_coldata, Path): - basedir_coldata = str(basedir_coldata) - if isinstance(basedir_coldata, str) and not os.path.exists(basedir_coldata): - os.mkdir(basedir_coldata) - if not os.path.exists(basedir_coldata): - raise FileNotFoundError( - f"Output directory for colocated data files {basedir_coldata} does not exist" - ) - self.basedir_coldata = basedir_coldata - return basedir_coldata - - @property - def basedir_logfiles(self): - """Base directory for storing logfiles""" - p = chk_make_subdir(self.basedir_coldata, "logfiles") - return p - - @property - def obs_id(self) -> str: - return self._obs_id - - @obs_id.setter - def obs_id(self, val: Optional[str]) -> None: - if self.obs_config is not None and val != self.obs_config.name: - logger.info( - f"Data ID in Pyaro config {self.obs_config.name} does not match obs_id {val}. Setting Pyaro config to None!" - ) - self.obs_config = None - - self._obs_id = val - - @property - def obs_config(self) -> PyaroConfig: - return self._obs_config - - @obs_config.setter - def obs_config(self, val: Optional[PyaroConfig]) -> None: - if val is not None: - if isinstance(val, dict): - logger.info(f"Obs config was given as dict. Will try to convert to PyaroConfig") - val = PyaroConfig(**val) - if self.obs_id is not None and val.name != self.obs_id: - logger.info( - f"Data ID in Pyaro config {val.name} does not match obs_id {self.obs_id}. Setting Obs ID to match Pyaro Config!" 
- ) - self.obs_id = val.name - if self.obs_id is None: - self.obs_id = val.name - self._obs_config = val - - def add_glob_meta(self, **kwargs): - """ - Add global metadata to :attr:`add_meta` - - Parameters - ---------- - kwargs - metadata to be added - - Returns - ------- - None - - """ - self.add_meta.update(**kwargs) - - def __setitem__(self, key, val): - if key == "basedir_coldata": - val = self._check_input_basedir_coldata(val) - super().__setitem__(key, val) - - def _period_from_start_stop(self) -> str: - start, stop = start_stop(self.start, self.stop, stop_sub_sec=False) - y0, y1 = start.year, stop.year - assert y0 <= y1 - if y0 == y1: - return str(y0) - else: - return f"{y0}-{y1}" +# class ColocationSetup(BrowseDict): +# """ +# Setup class for high-level model / obs co-location. + +# An instance of this setup class can be used to run a colocation analysis +# between a model and an observation network and will create a number of +# :class:`pya.ColocatedData` instances, which can be saved automatically +# as NetCDF files. + +# Apart from co-location, this class also handles reading of the input data +# for co-location. Supported co-location options are: + +# 1. gridded vs. ungridded data +# For instance 3D model data (instance of :class:`GriddedData`) with lat, +# lon and time dimension that is co-located with station based observations +# which are represented in pyaerocom through :class:`UngriddedData` objects. +# The co-location function used is +# :func:`pyaerocom.colocation.colocated_gridded_ungridded`. For this type of +# co-location, the output co-located data object will be 3-dimensional, +# with dimensions `data_source` (index 0: obs, index 1: model), `time` and +# `station_name`. + +# 2. gridded vs. gridded data +# For instance 3D model data that is co-located with 3D satellite data +# (both instances of :class:`GriddedData`), both objects with lat, +# lon and time dimensions. 
The co-location function used +# is :func:`pyaerocom.colocation.colocated_gridded_gridded`. +# For this type of co-location, the output co-located data object will be +# 4-dimensional, with dimensions `data_source` (index 0: obs, index 1: +# model), `time` and `latitude` and `longitude`. + + +# Attributes +# ---------- +# model_id : str +# ID of model to be used. + +# obs_config: PyaroConfig +# In the case Pyaro is used, a config must be provided. In that case obs_id(see below) +# is ignored and only the config is used. +# obs_id : str +# ID of observation network to be used. +# obs_vars : list +# Variables to be analysed (need to be available in input obs dataset). +# Variables that are not available in the model data output will be +# skipped. Alternatively, model variables to be used for a given obs +# variable can also be specified via attributes :attr:`model_use_vars` +# and :attr:`model_add_vars`. +# ts_type : str +# String specifying colocation output frequency. +# start +# Start time of colocation. Input can be integer denoting the year or +# anything that can be converted into :class:`pandas.Timestamp` using +# :func:`pyaerocom.helpers.to_pandas_timestamp`. If None, than the first +# available date in the model data is used. +# stop +# stop time of colocation. int or anything that can be converted into +# :class:`pandas.Timestamp` using +# :func:`pyaerocom.helpers.to_pandas_timestamp` or None. If None and if +# ``start`` is on resolution of year (e.g. ``start=2010``) then ``stop`` +# will be automatically set to the end of that year. Else, it will be +# set to the last available timestamp in the model data. +# filter_name : str +# name of filter to be applied. If None, no filter is used +# (to be precise, if None, then +# :attr:`pyaerocom.const.DEFAULT_REG_FILTER` is used which should +# default to `ALL-wMOUNTAINS`, that is, no filtering). +# basedir_coldata : str +# Base directory for storing of colocated data files. 
+# save_coldata : bool +# if True, colocated data objects are saved as NetCDF file. +# obs_name : str, optional +# if provided, this string will be used in colocated data filename to +# specify obsnetwork, else obs_id will be used. +# obs_data_dir : str, optional +# location of obs data. If None, attempt to infer obs location based on +# obs ID. +# obs_use_climatology : bool +# BETA if True, pyaerocom default climatology is computed from observation +# stations (so far only possible for unrgidded / gridded colocation). +# obs_vert_type : str +# AeroCom vertical code encoded in the model filenames (only AeroCom 3 +# and later). Specifies which model file should be read in case there are +# multiple options (e.g. surface level data can be read from a +# *Surface*.nc file as well as from a *ModelLevel*.nc file). If input is +# string (e.g. 'Surface'), then the corresponding vertical type code is +# used for reading of all variables that are colocated (i.e. that are +# specified in :attr:`obs_vars`). +# obs_ts_type_read : str or dict, optional +# may be specified to explicitly define the reading frequency of the +# observation data (so far, this does only apply to gridded obsdata such +# as satellites), either as str (same for all obs variables) or variable +# specific as dict. For ungridded reading, the frequency may be specified +# via :attr:`obs_id`, where applicable (e.g. AeronetSunV3Lev2.daily). +# Not to be confused with :attr:`ts_type`, which specifies the +# frequency used for colocation. Can be specified variable specific in +# form of dictionary. +# obs_filters : dict +# filters applied to the observational dataset before co-location. +# In case of gridded / gridded, these are filters that can be passed to +# :func:`pyaerocom.io.ReadGridded.read_var`, for instance, `flex_ts_type`, +# or `constraints`. 
In case the obsdata is ungridded (gridded / ungridded +# co-locations) these are filters that are handled through keyword +# `filter_post` in :func:`pyaerocom.io.ReadUngridded.read`. These filters +# are applied to the :class:`UngriddedData` objects after reading and +# caching the data, so changing them, will not invalidate the latest +# cache of the :class:`UngriddedData`. +# read_opts_ungridded : dict, optional +# dictionary that specifies reading constraints for ungridded reading, +# and are passed as `**kwargs` to :func:`pyaerocom.io.ReadUngridded.read`. +# Note that - other than for `obs_filters` these filters are applied +# during the reading of the :class:`UngriddedData` objects and specifying +# them will deactivate caching. +# model_name : str, optional +# if provided, this string will be used in colocated data filename to +# specify model, else obs_id will be used. +# model_data_dir : str, optional +# Location of model data. If None, attempt to infer model location based +# on model ID. +# model_read_opts : dict, optional +# options for model reading (passed as keyword args to +# :func:`pyaerocom.io.ReadUngridded.read`). +# model_use_vars : dict, optional +# dictionary that specifies mapping of model variables. Keys are +# observation variables, values are the corresponding model variables +# (e.g. model_use_vars=dict(od550aer='od550csaer')). Example: your +# observation has var *od550aer* but your model model uses a different +# variable name for that variable, say *od550*. Then, you can specify +# this via `model_use_vars = {'od550aer' : 'od550'}`. NOTE: in this case, +# a model variable *od550aer* will be ignored, even if it exists +# (cf :attr:`model_add_vars`). +# model_rename_vars : dict, optional +# rename certain model variables **after** co-location, before storing +# the associated :class:`ColocatedData` object on disk. Keys are model +# variables, values are new names +# (e.g. `model_rename_vars={'od550aer':'MyAOD'}`). 
+# Note: this does not impact which variables are read from the model. +# model_add_vars : dict, optional +# additional model variables to be processed for one obs variable. E.g. +# `model_add_vars={'od550aer': ['od550so4', 'od550gt1aer']}` would +# co-locate both model SO4 AOD (od550so4) and model coarse mode AOD +# (od550gt1aer) with total AOD (od550aer) from obs (in addition to +# od550aer vs od550aer if applicable). +# model_to_stp : bool +# ALPHA (please do not use): convert model data values to STP conditions +# after co-location. Note: this only works for very particular settings +# at the moment and needs revision, as it relies on access to +# meteorological data. +# model_ts_type_read : str or dict, optional +# may be specified to explicitly define the reading frequency of the +# model data, either as str (same for all obs variables) or variable +# specific as dict. Not to be confused with :attr:`ts_type`, which +# specifies the output frequency of the co-located data. +# model_read_aux : dict, optional +# may be used to specify additional computation methods of variables from +# models. Keys are variables to be computed, values are dictionaries with +# keys `vars_required` (list of required variables for computation of var) +# and `fun` (method that takes list of read data objects and computes +# and returns var). +# model_use_climatology : bool +# if True, attempt to use climatological model data field. Note: this +# only works if model data is in AeroCom conventions (climatological +# fields are indicated with 9999 as year in the filename) and if this is +# active, only single-year analyses are supported (i.e. provide int to +# :attr:`start` to specify the year and leave :attr:`stop` empty). +# gridded_reader_id : dict +# BETA: dictionary specifying which gridded reader is supposed to be used +# for model (and gridded obs) reading.
Note: this is a workaround +# solution and will likely be removed in the future when the gridded +# reading API is more harmonised +# (see https://github.com/metno/pyaerocom/issues/174). +# flex_ts_type : bool +# Boolean specifying whether reading frequency of gridded data is +# allowed to be flexible. This includes all gridded data, whether it is +# model or gridded observation (e.g. satellites). Defaults to True. +# min_num_obs : dict or int, optional +# time resampling constraints applied, defaults to None, in which case +# no constraints are applied. For instance, say your input is in daily +# resolution and you want output in monthly and you want to make sure to +# have roughly 50% daily coverage for the monthly averages. Then you may +# specify `min_num_obs=15` which will ensure that at least 15 daily +# averages are available to compute a monthly average. However, you may +# also define a hierarchical scheme that first goes from daily to +# weekly and then from weekly to monthly, via a dict. E.g. +# `min_num_obs=dict(monthly=dict(weekly=4), weekly=dict(daily=3))` would +# ensure that each week has at least 3 daily values, as well as that each +# month has at least 4 weekly values. +# resample_how : str or dict, optional +# string specifying how data should be aggregated when resampling in time. +# Default is "mean". Can also be a nested dictionary, e.g. +# `resample_how={'conco3': {'daily': {'hourly' : 'max'}}}` would use the +# maximum value to aggregate from hourly to daily for variable conco3, +# rather than the mean. +# obs_remove_outliers : bool +# if True, outliers are removed from obs data before colocation, +# else not. Default is False. +# Custom outlier ranges for each variable can be specified via +# :attr:`obs_outlier_ranges`, and for all other variables, the pyaerocom +# default outlier ranges are used.
The latter are specified in +# `variables.ini` file via `minimum` and `maximum` attributes and can +# also be accessed through :attr:`pyaerocom.variable.Variable.minimum` +# and :attr:`pyaerocom.variable.Variable.maximum`, respectively. +# model_remove_outliers : bool +# if True, outliers are removed from model data (normally this should be +# set to False, as the models are supposed to be assessed, including +# outlier cases). Default is False. +# Custom outlier ranges for each variable can be specified via +# :attr:`model_outlier_ranges`, and for all other variables, the pyaerocom +# default outlier ranges are used. The latter are specified in +# `variables.ini` file via `minimum` and `maximum` attributes and can +# also be accessed through :attr:`pyaerocom.variable.Variable.minimum` +# and :attr:`pyaerocom.variable.Variable.maximum`, respectively. +# obs_outlier_ranges : dict, optional +# dictionary specifying outlier ranges for individual obs variables +# (e.g. dict(od550aer = [-0.05, 10], ang4487aer=[0,4])). Only relevant +# if :attr:`obs_remove_outliers` is True. +# model_outlier_ranges : dict, optional +# like :attr:`obs_outlier_ranges` but for model variables. Only relevant +# if :attr:`model_remove_outliers` is True. +# zeros_to_nan : bool +# If True, zeros in output co-located data object will be converted to +# NaN. Default is False. +# harmonise_units : bool +# if True, units are attempted to be harmonised during co-location +# (note: raises Exception if True and in case units cannot be harmonised). +# regrid_res_deg : int, optional +# resolution in degrees for regridding of model grid (done before +# co-location). Default is None. +# colocate_time : bool +# if True and if obs and model sampling frequency (e.g. daily) are higher +# than output colocation frequency (e.g. monthly), then the datasets are +# first colocated in time (e.g. on a daily basis), before the monthly +# averages are calculated. Default is False.
+# reanalyse_existing : bool +# if True, always redo co-location, even if there is already an existing +# co-located NetCDF file (under the output location specified by +# :attr:`basedir_coldata` ) for the given variable combination to be +# co-located. If False and output already exists, then co-location is +# skipped for the associated variable. Default is True. +# raise_exceptions : bool +# if True, Exceptions that may occur for individual variables to be +# processed, are raised, else the analysis is skipped for such cases. +# keep_data : bool +# if True, then all colocated data objects computed when running +# :func:`run` will be stored in :attr:`data`. Defaults to True. +# add_meta : dict +# additional metadata that is supposed to be added to each output +# :class:`ColocatedData` object. +# """ + +# #: Dictionary specifying alternative vertical types that may be used to +# #: read model data. E.g. consider the variable is ec550aer, +# #: obs_vert_type='Surface' and obs_vert_type_alt=dict(Surface='ModelLevel'). +# #: Now, if a model that is used for the analysis does not contain a data +# #: file for ec550aer at the surface ('*ec550aer*Surface*.nc'), then, the +# #: colocation routine will look for '*ec550aer*ModelLevel*.nc' and if this +# #: exists, it will load it and extract the surface level. 
+# OBS_VERT_TYPES_ALT = {"Surface": "ModelLevel", "2D": "2D"} + +# #: do not raise Exception if invalid item is attempted to be assigned +# #: (Overwritten from base class) +# CRASH_ON_INVALID = False + +# FORBIDDEN_KEYS = [ +# "var_outlier_ranges", # deprecated since v0.12.0 +# "var_ref_outlier_ranges", # deprecated since v0.12.0 +# "remove_outliers", # deprecated since v0.12.0 +# ] + +# ts_type = StrWithDefault("monthly") +# obs_vars = ListOfStrings() + +# def __init__( +# self, +# model_id=None, +# obs_config: Optional[PyaroConfig] = None, +# obs_id=None, +# obs_vars=None, +# ts_type=None, +# start=None, +# stop=None, +# basedir_coldata=None, +# save_coldata=False, +# **kwargs, +# ): +# self.model_id = model_id +# self._obs_id = None +# self._obs_config = None + +# self.obs_id = obs_id +# self.obs_config = obs_config + +# self.obs_vars = obs_vars + +# self.ts_type = ts_type +# self.start = start +# self.stop = stop + +# # crashes if input filter name is invalid +# self.filter_name = f"{ALL_REGION_NAME}-wMOUNTAINS" + +# if basedir_coldata is not None: +# basedir_coldata = self._check_input_basedir_coldata(basedir_coldata) +# else: +# basedir_coldata = const.COLOCATEDDATADIR +# self.basedir_coldata = basedir_coldata +# self.save_coldata = save_coldata + +# # END OF ASSIGNMENT OF MOST COMMON PARAMETERS - BELOW ARE FURTHER +# # CONFIG ATTRIBUTES, THAT ARE OPTIONAL AND LESS FREQUENTLY USED + +# # Options related to obs reading and processing +# self.obs_name = None +# self.obs_data_dir = None + +# self.obs_use_climatology = False + +# self._obs_cache_only = False # only relevant if obs is ungridded +# self.obs_vert_type = None +# self.obs_ts_type_read = None +# self.obs_filters = {} +# self._obs_is_vertical_profile = False +# self.colocation_layer_limits = None +# self.profile_layer_limits = None + +# self.read_opts_ungridded = {} + +# # Attributes related to model data +# self.model_name = None +# self.model_data_dir = None + +# self.model_read_opts = {} + +# 
self.model_use_vars = {} +# self.model_rename_vars = {} +# self.model_add_vars = {} +# self.model_to_stp = False + +# self.model_ts_type_read = None +# self.model_read_aux = {} +# self.model_use_climatology = False + +# self.gridded_reader_id = {"model": "ReadGridded", "obs": "ReadGridded"} + +# self.flex_ts_type = True + +# # Options related to time resampling +# self.min_num_obs = None +# self.resample_how = "mean" + +# # Options related to outlier removal +# self.obs_remove_outliers = False +# self.model_remove_outliers = False + +# # Custom outlier ranges for model and obs +# self.obs_outlier_ranges = {} +# self.model_outlier_ranges = {} + +# self.zeros_to_nan = False +# self.harmonise_units = False +# self.regrid_res_deg = None +# self.colocate_time = False + +# self.reanalyse_existing = True +# self.raise_exceptions = False +# self.keep_data = True + +# self.add_meta = {} +# self.update(**kwargs) + +# def _check_input_basedir_coldata(self, basedir_coldata): +# """ +# Make sure input basedir_coldata is str and exists + +# Parameters +# ---------- +# basedir_coldata : str or Path +# basic output directory for colocated data + +# Raises +# ------ +# ValueError +# If input is invalid. + +# Returns +# ------- +# str +# valid output directory + +# """ +# if isinstance(basedir_coldata, Path): +# basedir_coldata = str(basedir_coldata) +# if isinstance(basedir_coldata, str): +# if not os.path.exists(basedir_coldata): +# os.mkdir(basedir_coldata) +# return basedir_coldata +# raise ValueError(f"Invalid input for basedir_coldata: {basedir_coldata}") + +# def _check_basedir_coldata(self): +# """ +# Make sure output directory for colocated data files exists + +# Raises +# ------ +# FileNotFoundError +# If :attr:`basedir_coldata` does not exist and cannot be created. 
+ +# Returns +# ------- +# str +# current value of :attr:`basedir_coldata` + +# """ +# basedir_coldata = self.basedir_coldata +# if basedir_coldata is None: +# basedir_coldata = const.COLOCATEDDATADIR +# if not os.path.exists(basedir_coldata): +# logger.info(f"Creating directory: {basedir_coldata}") +# os.mkdir(basedir_coldata) +# elif isinstance(basedir_coldata, Path): +# basedir_coldata = str(basedir_coldata) +# if isinstance(basedir_coldata, str) and not os.path.exists(basedir_coldata): +# os.mkdir(basedir_coldata) +# if not os.path.exists(basedir_coldata): +# raise FileNotFoundError( +# f"Output directory for colocated data files {basedir_coldata} does not exist" +# ) +# self.basedir_coldata = basedir_coldata +# return basedir_coldata + +# @property +# def basedir_logfiles(self): +# """Base directory for storing logfiles""" +# p = chk_make_subdir(self.basedir_coldata, "logfiles") +# return p + +# @property +# def obs_id(self) -> str: +# return self._obs_id + +# @obs_id.setter +# def obs_id(self, val: Optional[str]) -> None: +# if self.obs_config is not None and val != self.obs_config.name: +# logger.info( +# f"Data ID in Pyaro config {self.obs_config.name} does not match obs_id {val}. Setting Pyaro config to None!" +# ) +# self.obs_config = None + +# self._obs_id = val + +# @property +# def obs_config(self) -> PyaroConfig: +# return self._obs_config + +# @obs_config.setter +# def obs_config(self, val: Optional[PyaroConfig]) -> None: +# if val is not None: +# if isinstance(val, dict): +# logger.info(f"Obs config was given as dict. Will try to convert to PyaroConfig") +# val = PyaroConfig(**val) +# if self.obs_id is not None and val.name != self.obs_id: +# logger.info( +# f"Data ID in Pyaro config {val.name} does not match obs_id {self.obs_id}. Setting Obs ID to match Pyaro Config!" 
+# ) +# self.obs_id = val.name +# if self.obs_id is None: +# self.obs_id = val.name +# self._obs_config = val + +# def add_glob_meta(self, **kwargs): +# """ +# Add global metadata to :attr:`add_meta` + +# Parameters +# ---------- +# kwargs +# metadata to be added + +# Returns +# ------- +# None + +# """ +# self.add_meta.update(**kwargs) + +# def __setitem__(self, key, val): +# if key == "basedir_coldata": +# val = self._check_input_basedir_coldata(val) +# super().__setitem__(key, val) + +# def _period_from_start_stop(self) -> str: +# start, stop = start_stop(self.start, self.stop, stop_sub_sec=False) +# y0, y1 = start.year, stop.year +# assert y0 <= y1 +# if y0 == y1: +# return str(y0) +# else: +# return f"{y0}-{y1}" class Colocator(ColocationSetup): diff --git a/pyaerocom/colocation_setup.py b/pyaerocom/colocation_setup.py index bf0703da9..e4d7aa9f1 100644 --- a/pyaerocom/colocation_setup.py +++ b/pyaerocom/colocation_setup.py @@ -354,7 +354,7 @@ def validate_basedirs(cls, v): model_add_vars: dict[str, list[str]] | None = {} model_to_stp: bool = False - model_ts_type_read = None + model_ts_type_read: str | dict | None = None # LB: need to check this declaration model_read_aux: dict[str, dict[Literal["vars_required", "fun"], list[str]]] | None = {} model_use_climatology: bool = False @@ -373,8 +373,8 @@ def validate_basedirs(cls, v): model_remove_outliers: bool = False # Custom outlier ranges for model and obs - obs_outlier_ranges = {} - model_outlier_ranges = {} + obs_outlier_ranges: dict[str : tuple[float, float]] | None = {} + model_outlier_ranges: dict[str : tuple[float, float]] | None = {} zeros_to_nan: bool = False harmonise_units: bool = False regrid_res_deg: float | None = None From 77b8dec1f2f1b56dbba7858d33fd515a7e9356c2 Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Sun, 14 Apr 2024 21:15:43 +0200 Subject: [PATCH 06/44] back up and try basic tests on ColocationSetup --- pyaerocom/__init__.py | 5 +- pyaerocom/colocation_auto.py | 996 
++++++++++++++++----------------- tests/test_colocation_setup.py | 60 ++ 3 files changed, 561 insertions(+), 500 deletions(-) create mode 100644 tests/test_colocation_setup.py diff --git a/pyaerocom/__init__.py b/pyaerocom/__init__.py index bae25e093..252ec7911 100644 --- a/pyaerocom/__init__.py +++ b/pyaerocom/__init__.py @@ -48,8 +48,9 @@ from .ungriddeddata import UngriddedData from .filter import Filter from .colocateddata import ColocatedData -from .colocation_setup import ColocationSetup -from .colocation_auto import Colocator + +# from .colocation_setup import ColocationSetup +from .colocation_auto import ColocationSetup, Colocator from .tstype import TsType from .time_resampler import TimeResampler from .io.helpers import search_data_dir_aerocom diff --git a/pyaerocom/colocation_auto.py b/pyaerocom/colocation_auto.py index bc96d8312..2217b3b0b 100644 --- a/pyaerocom/colocation_auto.py +++ b/pyaerocom/colocation_auto.py @@ -39,504 +39,504 @@ logger = logging.getLogger(__name__) -# class ColocationSetup(BrowseDict): -# """ -# Setup class for high-level model / obs co-location. - -# An instance of this setup class can be used to run a colocation analysis -# between a model and an observation network and will create a number of -# :class:`pya.ColocatedData` instances, which can be saved automatically -# as NetCDF files. - -# Apart from co-location, this class also handles reading of the input data -# for co-location. Supported co-location options are: - -# 1. gridded vs. ungridded data -# For instance 3D model data (instance of :class:`GriddedData`) with lat, -# lon and time dimension that is co-located with station based observations -# which are represented in pyaerocom through :class:`UngriddedData` objects. -# The co-location function used is -# :func:`pyaerocom.colocation.colocated_gridded_ungridded`. 
For this type of -# co-location, the output co-located data object will be 3-dimensional, -# with dimensions `data_source` (index 0: obs, index 1: model), `time` and -# `station_name`. - -# 2. gridded vs. gridded data -# For instance 3D model data that is co-located with 3D satellite data -# (both instances of :class:`GriddedData`), both objects with lat, -# lon and time dimensions. The co-location function used -# is :func:`pyaerocom.colocation.colocated_gridded_gridded`. -# For this type of co-location, the output co-located data object will be -# 4-dimensional, with dimensions `data_source` (index 0: obs, index 1: -# model), `time` and `latitude` and `longitude`. - - -# Attributes -# ---------- -# model_id : str -# ID of model to be used. - -# obs_config: PyaroConfig -# In the case Pyaro is used, a config must be provided. In that case obs_id(see below) -# is ignored and only the config is used. -# obs_id : str -# ID of observation network to be used. -# obs_vars : list -# Variables to be analysed (need to be available in input obs dataset). -# Variables that are not available in the model data output will be -# skipped. Alternatively, model variables to be used for a given obs -# variable can also be specified via attributes :attr:`model_use_vars` -# and :attr:`model_add_vars`. -# ts_type : str -# String specifying colocation output frequency. -# start -# Start time of colocation. Input can be integer denoting the year or -# anything that can be converted into :class:`pandas.Timestamp` using -# :func:`pyaerocom.helpers.to_pandas_timestamp`. If None, than the first -# available date in the model data is used. -# stop -# stop time of colocation. int or anything that can be converted into -# :class:`pandas.Timestamp` using -# :func:`pyaerocom.helpers.to_pandas_timestamp` or None. If None and if -# ``start`` is on resolution of year (e.g. ``start=2010``) then ``stop`` -# will be automatically set to the end of that year. 
Else, it will be -# set to the last available timestamp in the model data. -# filter_name : str -# name of filter to be applied. If None, no filter is used -# (to be precise, if None, then -# :attr:`pyaerocom.const.DEFAULT_REG_FILTER` is used which should -# default to `ALL-wMOUNTAINS`, that is, no filtering). -# basedir_coldata : str -# Base directory for storing of colocated data files. -# save_coldata : bool -# if True, colocated data objects are saved as NetCDF file. -# obs_name : str, optional -# if provided, this string will be used in colocated data filename to -# specify obsnetwork, else obs_id will be used. -# obs_data_dir : str, optional -# location of obs data. If None, attempt to infer obs location based on -# obs ID. -# obs_use_climatology : bool -# BETA if True, pyaerocom default climatology is computed from observation -# stations (so far only possible for unrgidded / gridded colocation). -# obs_vert_type : str -# AeroCom vertical code encoded in the model filenames (only AeroCom 3 -# and later). Specifies which model file should be read in case there are -# multiple options (e.g. surface level data can be read from a -# *Surface*.nc file as well as from a *ModelLevel*.nc file). If input is -# string (e.g. 'Surface'), then the corresponding vertical type code is -# used for reading of all variables that are colocated (i.e. that are -# specified in :attr:`obs_vars`). -# obs_ts_type_read : str or dict, optional -# may be specified to explicitly define the reading frequency of the -# observation data (so far, this does only apply to gridded obsdata such -# as satellites), either as str (same for all obs variables) or variable -# specific as dict. For ungridded reading, the frequency may be specified -# via :attr:`obs_id`, where applicable (e.g. AeronetSunV3Lev2.daily). -# Not to be confused with :attr:`ts_type`, which specifies the -# frequency used for colocation. Can be specified variable specific in -# form of dictionary. 
-# obs_filters : dict -# filters applied to the observational dataset before co-location. -# In case of gridded / gridded, these are filters that can be passed to -# :func:`pyaerocom.io.ReadGridded.read_var`, for instance, `flex_ts_type`, -# or `constraints`. In case the obsdata is ungridded (gridded / ungridded -# co-locations) these are filters that are handled through keyword -# `filter_post` in :func:`pyaerocom.io.ReadUngridded.read`. These filters -# are applied to the :class:`UngriddedData` objects after reading and -# caching the data, so changing them, will not invalidate the latest -# cache of the :class:`UngriddedData`. -# read_opts_ungridded : dict, optional -# dictionary that specifies reading constraints for ungridded reading, -# and are passed as `**kwargs` to :func:`pyaerocom.io.ReadUngridded.read`. -# Note that - other than for `obs_filters` these filters are applied -# during the reading of the :class:`UngriddedData` objects and specifying -# them will deactivate caching. -# model_name : str, optional -# if provided, this string will be used in colocated data filename to -# specify model, else obs_id will be used. -# model_data_dir : str, optional -# Location of model data. If None, attempt to infer model location based -# on model ID. -# model_read_opts : dict, optional -# options for model reading (passed as keyword args to -# :func:`pyaerocom.io.ReadUngridded.read`). -# model_use_vars : dict, optional -# dictionary that specifies mapping of model variables. Keys are -# observation variables, values are the corresponding model variables -# (e.g. model_use_vars=dict(od550aer='od550csaer')). Example: your -# observation has var *od550aer* but your model model uses a different -# variable name for that variable, say *od550*. Then, you can specify -# this via `model_use_vars = {'od550aer' : 'od550'}`. NOTE: in this case, -# a model variable *od550aer* will be ignored, even if it exists -# (cf :attr:`model_add_vars`). 
-# model_rename_vars : dict, optional -# rename certain model variables **after** co-location, before storing -# the associated :class:`ColocatedData` object on disk. Keys are model -# variables, values are new names -# (e.g. `model_rename_vars={'od550aer':'MyAOD'}`). -# Note: this does not impact which variables are read from the model. -# model_add_vars : dict, optional -# additional model variables to be processed for one obs variable. E.g. -# `model_add_vars={'od550aer': ['od550so4', 'od550gt1aer']}` would -# co-locate both model SO4 AOD (od550so4) and model coarse mode AOD -# (od550gt1aer) with total AOD (od550aer) from obs (in addition to -# od550aer vs od550aer if applicable). -# model_to_stp : bool -# ALPHA (please do not use): convert model data values to STP conditions -# after co-location. Note: this only works for very particular settings -# at the moment and needs revision, as it relies on access to -# meteorological data. -# model_ts_type_read : str or dict, optional -# may be specified to explicitly define the reading frequency of the -# model data, either as str (same for all obs variables) or variable -# specific as dict. Not to be confused with :attr:`ts_type`, which -# specifies the output frequency of the co-located data. -# model_read_aux : dict, optional -# may be used to specify additional computation methods of variables from -# models. Keys are variables to be computed, values are dictionaries with -# keys `vars_required` (list of required variables for computation of var -# and `fun` (method that takes list of read data objects and computes -# and returns var). -# model_use_climatology : bool -# if True, attempt to use climatological model data field. Note: this -# only works if model data is in AeroCom conventions (climatological -# fields are indicated with 9999 as year in the filename) and if this is -# active, only single year analysis are supported (i.e. 
provide int to -# :attr:`start` to specify the year and leave :attr:`stop` empty). -# gridded_reader_id : dict -# BETA: dictionary specifying which gridded reader is supposed to be used -# for model (and gridded obs) reading. Note: this is a workaround -# solution and will likely be removed in the future when the gridded -# reading API is more harmonised -# (see https://github.com/metno/pyaerocom/issues/174). -# flex_ts_type : bool -# Bboolean specifying whether reading frequency of gridded data is -# allowed to be flexible. This includes all gridded data, whether it is -# model or gridded observation (e.g. satellites). Defaults to True. -# min_num_obs : dict or int, optional -# time resampling constraints applied, defaults to None, in which case -# no constraints are applied. For instance, say your input is in daily -# resolution and you want output in monthly and you want to make sure to -# have roughly 50% daily coverage for the monthly averages. Then you may -# specify `min_num_obs=15` which will ensure that at least 15 daily -# averages are available to compute a monthly average. However, you may -# also define a hierarchical scheme that first goes from daily to -# weekly and then from weekly to monthly, via a dict. E.g. -# `min_num_obs=dict(monthly=dict(weekly=4), weekly=dict(daily=3))` would -# ensure that each week has at least 3 daily values, as well as that each -# month has at least 4 weekly values. -# resample_how : str or dict, optional -# string specifying how data should be aggregated when resampling in time. -# Default is "mean". Can also be a nested dictionary, e.g. -# `resample_how={'conco3': 'daily': {'hourly' : 'max'}}` would use the -# maximum value to aggregate from hourly to daily for variable conco3, -# rather than the mean. -# obs_remove_outliers : bool -# if True, outliers are removed from obs data before colocation, -# else not. Default is False. 
-# Custom outlier ranges for each variable can be specified via -# :attr:`obs_outlier_ranges`, and for all other variables, the pyaerocom -# default outlier ranges are used. The latter are specified in -# `variables.ini` file via `minimum` and `maximum` attributes and can -# also be accessed through :attr:`pyaerocom.variable.Variable.minimum` -# and :attr:`pyaerocom.variable.Variable.maximum`, respectively. -# model_remove_outliers : bool -# if True, outliers are removed from model data (normally this should be -# set to False, as the models are supposed to be assessed, including -# outlier cases). Default is False. -# Custom outlier ranges for each variable can be specified via -# :attr:`model_outlier_ranges`, and for all other variables, the pyaerocom -# default outlier ranges are used. The latter are specified in -# `variables.ini` file via `minimum` and `maximum` attributes and can -# also be accessed through :attr:`pyaerocom.variable.Variable.minimum` -# and :attr:`pyaerocom.variable.Variable.maximum`, respectively. -# obs_outlier_ranges : dict, optional -# dictionary specifying outlier ranges for individual obs variables. -# (e.g. dict(od550aer = [-0.05, 10], ang4487aer=[0,4])). Only relevant -# if :attr:`obs_remove_outliers` is True. -# model_outlier_ranges : dict, optional -# like :attr:`obs_outlier_ranges` but for model variables. Only relevant -# if :attr:`model_remove_outliers` is True. -# zeros_to_nan : bool -# If True, zero's in output co-located data object will be converted to -# NaN. Default is False. -# harmonise_units : bool -# if True, units are attempted to be harmonised during co-location -# (note: raises Exception if True and in case units cannot be harmonised). -# regrid_res_deg : int, optional -# resolution in degrees for regridding of model grid (done before -# co-location). Default is None. -# colocate_time : bool -# if True and if obs and model sampling frequency (e.g. daily) are higher -# than output colocation frequency (e.g. 
monthly), then the datasets are -# first colocated in time (e.g. on a daily basis), before the monthly -# averages are calculated. Default is False. -# reanalyse_existing : bool -# if True, always redo co-location, even if there is already an existing -# co-located NetCDF file (under the output location specified by -# :attr:`basedir_coldata` ) for the given variable combination to be -# co-located. If False and output already exists, then co-location is -# skipped for the associated variable. Default is True. -# raise_exceptions : bool -# if True, Exceptions that may occur for individual variables to be -# processed, are raised, else the analysis is skipped for such cases. -# keep_data : bool -# if True, then all colocated data objects computed when running -# :func:`run` will be stored in :attr:`data`. Defaults to True. -# add_meta : dict -# additional metadata that is supposed to be added to each output -# :class:`ColocatedData` object. -# """ - -# #: Dictionary specifying alternative vertical types that may be used to -# #: read model data. E.g. consider the variable is ec550aer, -# #: obs_vert_type='Surface' and obs_vert_type_alt=dict(Surface='ModelLevel'). -# #: Now, if a model that is used for the analysis does not contain a data -# #: file for ec550aer at the surface ('*ec550aer*Surface*.nc'), then, the -# #: colocation routine will look for '*ec550aer*ModelLevel*.nc' and if this -# #: exists, it will load it and extract the surface level. 
-# OBS_VERT_TYPES_ALT = {"Surface": "ModelLevel", "2D": "2D"} - -# #: do not raise Exception if invalid item is attempted to be assigned -# #: (Overwritten from base class) -# CRASH_ON_INVALID = False - -# FORBIDDEN_KEYS = [ -# "var_outlier_ranges", # deprecated since v0.12.0 -# "var_ref_outlier_ranges", # deprecated since v0.12.0 -# "remove_outliers", # deprecated since v0.12.0 -# ] - -# ts_type = StrWithDefault("monthly") -# obs_vars = ListOfStrings() - -# def __init__( -# self, -# model_id=None, -# obs_config: Optional[PyaroConfig] = None, -# obs_id=None, -# obs_vars=None, -# ts_type=None, -# start=None, -# stop=None, -# basedir_coldata=None, -# save_coldata=False, -# **kwargs, -# ): -# self.model_id = model_id -# self._obs_id = None -# self._obs_config = None - -# self.obs_id = obs_id -# self.obs_config = obs_config - -# self.obs_vars = obs_vars - -# self.ts_type = ts_type -# self.start = start -# self.stop = stop - -# # crashes if input filter name is invalid -# self.filter_name = f"{ALL_REGION_NAME}-wMOUNTAINS" - -# if basedir_coldata is not None: -# basedir_coldata = self._check_input_basedir_coldata(basedir_coldata) -# else: -# basedir_coldata = const.COLOCATEDDATADIR -# self.basedir_coldata = basedir_coldata -# self.save_coldata = save_coldata - -# # END OF ASSIGNMENT OF MOST COMMON PARAMETERS - BELOW ARE FURTHER -# # CONFIG ATTRIBUTES, THAT ARE OPTIONAL AND LESS FREQUENTLY USED - -# # Options related to obs reading and processing -# self.obs_name = None -# self.obs_data_dir = None - -# self.obs_use_climatology = False - -# self._obs_cache_only = False # only relevant if obs is ungridded -# self.obs_vert_type = None -# self.obs_ts_type_read = None -# self.obs_filters = {} -# self._obs_is_vertical_profile = False -# self.colocation_layer_limits = None -# self.profile_layer_limits = None - -# self.read_opts_ungridded = {} - -# # Attributes related to model data -# self.model_name = None -# self.model_data_dir = None - -# self.model_read_opts = {} - -# 
self.model_use_vars = {} -# self.model_rename_vars = {} -# self.model_add_vars = {} -# self.model_to_stp = False - -# self.model_ts_type_read = None -# self.model_read_aux = {} -# self.model_use_climatology = False - -# self.gridded_reader_id = {"model": "ReadGridded", "obs": "ReadGridded"} - -# self.flex_ts_type = True - -# # Options related to time resampling -# self.min_num_obs = None -# self.resample_how = "mean" - -# # Options related to outlier removal -# self.obs_remove_outliers = False -# self.model_remove_outliers = False - -# # Custom outlier ranges for model and obs -# self.obs_outlier_ranges = {} -# self.model_outlier_ranges = {} - -# self.zeros_to_nan = False -# self.harmonise_units = False -# self.regrid_res_deg = None -# self.colocate_time = False - -# self.reanalyse_existing = True -# self.raise_exceptions = False -# self.keep_data = True - -# self.add_meta = {} -# self.update(**kwargs) - -# def _check_input_basedir_coldata(self, basedir_coldata): -# """ -# Make sure input basedir_coldata is str and exists - -# Parameters -# ---------- -# basedir_coldata : str or Path -# basic output directory for colocated data - -# Raises -# ------ -# ValueError -# If input is invalid. - -# Returns -# ------- -# str -# valid output directory - -# """ -# if isinstance(basedir_coldata, Path): -# basedir_coldata = str(basedir_coldata) -# if isinstance(basedir_coldata, str): -# if not os.path.exists(basedir_coldata): -# os.mkdir(basedir_coldata) -# return basedir_coldata -# raise ValueError(f"Invalid input for basedir_coldata: {basedir_coldata}") - -# def _check_basedir_coldata(self): -# """ -# Make sure output directory for colocated data files exists - -# Raises -# ------ -# FileNotFoundError -# If :attr:`basedir_coldata` does not exist and cannot be created. 
- -# Returns -# ------- -# str -# current value of :attr:`basedir_coldata` - -# """ -# basedir_coldata = self.basedir_coldata -# if basedir_coldata is None: -# basedir_coldata = const.COLOCATEDDATADIR -# if not os.path.exists(basedir_coldata): -# logger.info(f"Creating directory: {basedir_coldata}") -# os.mkdir(basedir_coldata) -# elif isinstance(basedir_coldata, Path): -# basedir_coldata = str(basedir_coldata) -# if isinstance(basedir_coldata, str) and not os.path.exists(basedir_coldata): -# os.mkdir(basedir_coldata) -# if not os.path.exists(basedir_coldata): -# raise FileNotFoundError( -# f"Output directory for colocated data files {basedir_coldata} does not exist" -# ) -# self.basedir_coldata = basedir_coldata -# return basedir_coldata - -# @property -# def basedir_logfiles(self): -# """Base directory for storing logfiles""" -# p = chk_make_subdir(self.basedir_coldata, "logfiles") -# return p - -# @property -# def obs_id(self) -> str: -# return self._obs_id - -# @obs_id.setter -# def obs_id(self, val: Optional[str]) -> None: -# if self.obs_config is not None and val != self.obs_config.name: -# logger.info( -# f"Data ID in Pyaro config {self.obs_config.name} does not match obs_id {val}. Setting Pyaro config to None!" -# ) -# self.obs_config = None - -# self._obs_id = val - -# @property -# def obs_config(self) -> PyaroConfig: -# return self._obs_config - -# @obs_config.setter -# def obs_config(self, val: Optional[PyaroConfig]) -> None: -# if val is not None: -# if isinstance(val, dict): -# logger.info(f"Obs config was given as dict. Will try to convert to PyaroConfig") -# val = PyaroConfig(**val) -# if self.obs_id is not None and val.name != self.obs_id: -# logger.info( -# f"Data ID in Pyaro config {val.name} does not match obs_id {self.obs_id}. Setting Obs ID to match Pyaro Config!" 
-# ) -# self.obs_id = val.name -# if self.obs_id is None: -# self.obs_id = val.name -# self._obs_config = val - -# def add_glob_meta(self, **kwargs): -# """ -# Add global metadata to :attr:`add_meta` - -# Parameters -# ---------- -# kwargs -# metadata to be added - -# Returns -# ------- -# None - -# """ -# self.add_meta.update(**kwargs) - -# def __setitem__(self, key, val): -# if key == "basedir_coldata": -# val = self._check_input_basedir_coldata(val) -# super().__setitem__(key, val) - -# def _period_from_start_stop(self) -> str: -# start, stop = start_stop(self.start, self.stop, stop_sub_sec=False) -# y0, y1 = start.year, stop.year -# assert y0 <= y1 -# if y0 == y1: -# return str(y0) -# else: -# return f"{y0}-{y1}" +class ColocationSetup(BrowseDict): + """ + Setup class for high-level model / obs co-location. + + An instance of this setup class can be used to run a colocation analysis + between a model and an observation network and will create a number of + :class:`pya.ColocatedData` instances, which can be saved automatically + as NetCDF files. + + Apart from co-location, this class also handles reading of the input data + for co-location. Supported co-location options are: + + 1. gridded vs. ungridded data + For instance 3D model data (instance of :class:`GriddedData`) with lat, + lon and time dimension that is co-located with station based observations + which are represented in pyaerocom through :class:`UngriddedData` objects. + The co-location function used is + :func:`pyaerocom.colocation.colocated_gridded_ungridded`. For this type of + co-location, the output co-located data object will be 3-dimensional, + with dimensions `data_source` (index 0: obs, index 1: model), `time` and + `station_name`. + + 2. gridded vs. gridded data + For instance 3D model data that is co-located with 3D satellite data + (both instances of :class:`GriddedData`), both objects with lat, + lon and time dimensions. 
The co-location function used + is :func:`pyaerocom.colocation.colocated_gridded_gridded`. + For this type of co-location, the output co-located data object will be + 4-dimensional, with dimensions `data_source` (index 0: obs, index 1: + model), `time` and `latitude` and `longitude`. + + + Attributes + ---------- + model_id : str + ID of model to be used. + + obs_config: PyaroConfig + In the case Pyaro is used, a config must be provided. In that case obs_id(see below) + is ignored and only the config is used. + obs_id : str + ID of observation network to be used. + obs_vars : list + Variables to be analysed (need to be available in input obs dataset). + Variables that are not available in the model data output will be + skipped. Alternatively, model variables to be used for a given obs + variable can also be specified via attributes :attr:`model_use_vars` + and :attr:`model_add_vars`. + ts_type : str + String specifying colocation output frequency. + start + Start time of colocation. Input can be integer denoting the year or + anything that can be converted into :class:`pandas.Timestamp` using + :func:`pyaerocom.helpers.to_pandas_timestamp`. If None, than the first + available date in the model data is used. + stop + stop time of colocation. int or anything that can be converted into + :class:`pandas.Timestamp` using + :func:`pyaerocom.helpers.to_pandas_timestamp` or None. If None and if + ``start`` is on resolution of year (e.g. ``start=2010``) then ``stop`` + will be automatically set to the end of that year. Else, it will be + set to the last available timestamp in the model data. + filter_name : str + name of filter to be applied. If None, no filter is used + (to be precise, if None, then + :attr:`pyaerocom.const.DEFAULT_REG_FILTER` is used which should + default to `ALL-wMOUNTAINS`, that is, no filtering). + basedir_coldata : str + Base directory for storing of colocated data files. 
+ save_coldata : bool + if True, colocated data objects are saved as NetCDF file. + obs_name : str, optional + if provided, this string will be used in colocated data filename to + specify obsnetwork, else obs_id will be used. + obs_data_dir : str, optional + location of obs data. If None, attempt to infer obs location based on + obs ID. + obs_use_climatology : bool + BETA if True, pyaerocom default climatology is computed from observation + stations (so far only possible for unrgidded / gridded colocation). + obs_vert_type : str + AeroCom vertical code encoded in the model filenames (only AeroCom 3 + and later). Specifies which model file should be read in case there are + multiple options (e.g. surface level data can be read from a + *Surface*.nc file as well as from a *ModelLevel*.nc file). If input is + string (e.g. 'Surface'), then the corresponding vertical type code is + used for reading of all variables that are colocated (i.e. that are + specified in :attr:`obs_vars`). + obs_ts_type_read : str or dict, optional + may be specified to explicitly define the reading frequency of the + observation data (so far, this does only apply to gridded obsdata such + as satellites), either as str (same for all obs variables) or variable + specific as dict. For ungridded reading, the frequency may be specified + via :attr:`obs_id`, where applicable (e.g. AeronetSunV3Lev2.daily). + Not to be confused with :attr:`ts_type`, which specifies the + frequency used for colocation. Can be specified variable specific in + form of dictionary. + obs_filters : dict + filters applied to the observational dataset before co-location. + In case of gridded / gridded, these are filters that can be passed to + :func:`pyaerocom.io.ReadGridded.read_var`, for instance, `flex_ts_type`, + or `constraints`. In case the obsdata is ungridded (gridded / ungridded + co-locations) these are filters that are handled through keyword + `filter_post` in :func:`pyaerocom.io.ReadUngridded.read`. 
These filters + are applied to the :class:`UngriddedData` objects after reading and + caching the data, so changing them, will not invalidate the latest + cache of the :class:`UngriddedData`. + read_opts_ungridded : dict, optional + dictionary that specifies reading constraints for ungridded reading, + and are passed as `**kwargs` to :func:`pyaerocom.io.ReadUngridded.read`. + Note that - other than for `obs_filters` these filters are applied + during the reading of the :class:`UngriddedData` objects and specifying + them will deactivate caching. + model_name : str, optional + if provided, this string will be used in colocated data filename to + specify model, else obs_id will be used. + model_data_dir : str, optional + Location of model data. If None, attempt to infer model location based + on model ID. + model_read_opts : dict, optional + options for model reading (passed as keyword args to + :func:`pyaerocom.io.ReadUngridded.read`). + model_use_vars : dict, optional + dictionary that specifies mapping of model variables. Keys are + observation variables, values are the corresponding model variables + (e.g. model_use_vars=dict(od550aer='od550csaer')). Example: your + observation has var *od550aer* but your model model uses a different + variable name for that variable, say *od550*. Then, you can specify + this via `model_use_vars = {'od550aer' : 'od550'}`. NOTE: in this case, + a model variable *od550aer* will be ignored, even if it exists + (cf :attr:`model_add_vars`). + model_rename_vars : dict, optional + rename certain model variables **after** co-location, before storing + the associated :class:`ColocatedData` object on disk. Keys are model + variables, values are new names + (e.g. `model_rename_vars={'od550aer':'MyAOD'}`). + Note: this does not impact which variables are read from the model. + model_add_vars : dict, optional + additional model variables to be processed for one obs variable. E.g. 
+ `model_add_vars={'od550aer': ['od550so4', 'od550gt1aer']}` would + co-locate both model SO4 AOD (od550so4) and model coarse mode AOD + (od550gt1aer) with total AOD (od550aer) from obs (in addition to + od550aer vs od550aer if applicable). + model_to_stp : bool + ALPHA (please do not use): convert model data values to STP conditions + after co-location. Note: this only works for very particular settings + at the moment and needs revision, as it relies on access to + meteorological data. + model_ts_type_read : str or dict, optional + may be specified to explicitly define the reading frequency of the + model data, either as str (same for all obs variables) or variable + specific as dict. Not to be confused with :attr:`ts_type`, which + specifies the output frequency of the co-located data. + model_read_aux : dict, optional + may be used to specify additional computation methods of variables from + models. Keys are variables to be computed, values are dictionaries with + keys `vars_required` (list of required variables for computation of var + and `fun` (method that takes list of read data objects and computes + and returns var). + model_use_climatology : bool + if True, attempt to use climatological model data field. Note: this + only works if model data is in AeroCom conventions (climatological + fields are indicated with 9999 as year in the filename) and if this is + active, only single year analysis are supported (i.e. provide int to + :attr:`start` to specify the year and leave :attr:`stop` empty). + gridded_reader_id : dict + BETA: dictionary specifying which gridded reader is supposed to be used + for model (and gridded obs) reading. Note: this is a workaround + solution and will likely be removed in the future when the gridded + reading API is more harmonised + (see https://github.com/metno/pyaerocom/issues/174). + flex_ts_type : bool + Bboolean specifying whether reading frequency of gridded data is + allowed to be flexible. 
This includes all gridded data, whether it is + model or gridded observation (e.g. satellites). Defaults to True. + min_num_obs : dict or int, optional + time resampling constraints applied, defaults to None, in which case + no constraints are applied. For instance, say your input is in daily + resolution and you want output in monthly and you want to make sure to + have roughly 50% daily coverage for the monthly averages. Then you may + specify `min_num_obs=15` which will ensure that at least 15 daily + averages are available to compute a monthly average. However, you may + also define a hierarchical scheme that first goes from daily to + weekly and then from weekly to monthly, via a dict. E.g. + `min_num_obs=dict(monthly=dict(weekly=4), weekly=dict(daily=3))` would + ensure that each week has at least 3 daily values, as well as that each + month has at least 4 weekly values. + resample_how : str or dict, optional + string specifying how data should be aggregated when resampling in time. + Default is "mean". Can also be a nested dictionary, e.g. + `resample_how={'conco3': 'daily': {'hourly' : 'max'}}` would use the + maximum value to aggregate from hourly to daily for variable conco3, + rather than the mean. + obs_remove_outliers : bool + if True, outliers are removed from obs data before colocation, + else not. Default is False. + Custom outlier ranges for each variable can be specified via + :attr:`obs_outlier_ranges`, and for all other variables, the pyaerocom + default outlier ranges are used. The latter are specified in + `variables.ini` file via `minimum` and `maximum` attributes and can + also be accessed through :attr:`pyaerocom.variable.Variable.minimum` + and :attr:`pyaerocom.variable.Variable.maximum`, respectively. + model_remove_outliers : bool + if True, outliers are removed from model data (normally this should be + set to False, as the models are supposed to be assessed, including + outlier cases). Default is False. 
+ Custom outlier ranges for each variable can be specified via + :attr:`model_outlier_ranges`, and for all other variables, the pyaerocom + default outlier ranges are used. The latter are specified in + `variables.ini` file via `minimum` and `maximum` attributes and can + also be accessed through :attr:`pyaerocom.variable.Variable.minimum` + and :attr:`pyaerocom.variable.Variable.maximum`, respectively. + obs_outlier_ranges : dict, optional + dictionary specifying outlier ranges for individual obs variables. + (e.g. dict(od550aer = [-0.05, 10], ang4487aer=[0,4])). Only relevant + if :attr:`obs_remove_outliers` is True. + model_outlier_ranges : dict, optional + like :attr:`obs_outlier_ranges` but for model variables. Only relevant + if :attr:`model_remove_outliers` is True. + zeros_to_nan : bool + If True, zero's in output co-located data object will be converted to + NaN. Default is False. + harmonise_units : bool + if True, units are attempted to be harmonised during co-location + (note: raises Exception if True and in case units cannot be harmonised). + regrid_res_deg : int, optional + resolution in degrees for regridding of model grid (done before + co-location). Default is None. + colocate_time : bool + if True and if obs and model sampling frequency (e.g. daily) are higher + than output colocation frequency (e.g. monthly), then the datasets are + first colocated in time (e.g. on a daily basis), before the monthly + averages are calculated. Default is False. + reanalyse_existing : bool + if True, always redo co-location, even if there is already an existing + co-located NetCDF file (under the output location specified by + :attr:`basedir_coldata` ) for the given variable combination to be + co-located. If False and output already exists, then co-location is + skipped for the associated variable. Default is True. 
+ raise_exceptions : bool + if True, Exceptions that may occur for individual variables to be + processed, are raised, else the analysis is skipped for such cases. + keep_data : bool + if True, then all colocated data objects computed when running + :func:`run` will be stored in :attr:`data`. Defaults to True. + add_meta : dict + additional metadata that is supposed to be added to each output + :class:`ColocatedData` object. + """ + + #: Dictionary specifying alternative vertical types that may be used to + #: read model data. E.g. consider the variable is ec550aer, + #: obs_vert_type='Surface' and obs_vert_type_alt=dict(Surface='ModelLevel'). + #: Now, if a model that is used for the analysis does not contain a data + #: file for ec550aer at the surface ('*ec550aer*Surface*.nc'), then, the + #: colocation routine will look for '*ec550aer*ModelLevel*.nc' and if this + #: exists, it will load it and extract the surface level. + OBS_VERT_TYPES_ALT = {"Surface": "ModelLevel", "2D": "2D"} + + #: do not raise Exception if invalid item is attempted to be assigned + #: (Overwritten from base class) + CRASH_ON_INVALID = False + + FORBIDDEN_KEYS = [ + "var_outlier_ranges", # deprecated since v0.12.0 + "var_ref_outlier_ranges", # deprecated since v0.12.0 + "remove_outliers", # deprecated since v0.12.0 + ] + + ts_type = StrWithDefault("monthly") + obs_vars = ListOfStrings() + + def __init__( + self, + model_id=None, + obs_config: Optional[PyaroConfig] = None, + obs_id=None, + obs_vars=None, + ts_type=None, + start=None, + stop=None, + basedir_coldata=None, + save_coldata=False, + **kwargs, + ): + self.model_id = model_id + self._obs_id = None + self._obs_config = None + + self.obs_id = obs_id + self.obs_config = obs_config + + self.obs_vars = obs_vars + + self.ts_type = ts_type + self.start = start + self.stop = stop + + # crashes if input filter name is invalid + self.filter_name = f"{ALL_REGION_NAME}-wMOUNTAINS" + + if basedir_coldata is not None: + basedir_coldata = 
self._check_input_basedir_coldata(basedir_coldata) + else: + basedir_coldata = const.COLOCATEDDATADIR + self.basedir_coldata = basedir_coldata + self.save_coldata = save_coldata + + # END OF ASSIGNMENT OF MOST COMMON PARAMETERS - BELOW ARE FURTHER + # CONFIG ATTRIBUTES, THAT ARE OPTIONAL AND LESS FREQUENTLY USED + + # Options related to obs reading and processing + self.obs_name = None + self.obs_data_dir = None + + self.obs_use_climatology = False + + self._obs_cache_only = False # only relevant if obs is ungridded + self.obs_vert_type = None + self.obs_ts_type_read = None + self.obs_filters = {} + self._obs_is_vertical_profile = False + self.colocation_layer_limits = None + self.profile_layer_limits = None + + self.read_opts_ungridded = {} + + # Attributes related to model data + self.model_name = None + self.model_data_dir = None + + self.model_read_opts = {} + + self.model_use_vars = {} + self.model_rename_vars = {} + self.model_add_vars = {} + self.model_to_stp = False + + self.model_ts_type_read = None + self.model_read_aux = {} + self.model_use_climatology = False + + self.gridded_reader_id = {"model": "ReadGridded", "obs": "ReadGridded"} + + self.flex_ts_type = True + + # Options related to time resampling + self.min_num_obs = None + self.resample_how = "mean" + + # Options related to outlier removal + self.obs_remove_outliers = False + self.model_remove_outliers = False + + # Custom outlier ranges for model and obs + self.obs_outlier_ranges = {} + self.model_outlier_ranges = {} + + self.zeros_to_nan = False + self.harmonise_units = False + self.regrid_res_deg = None + self.colocate_time = False + + self.reanalyse_existing = True + self.raise_exceptions = False + self.keep_data = True + + self.add_meta = {} + self.update(**kwargs) + + def _check_input_basedir_coldata(self, basedir_coldata): + """ + Make sure input basedir_coldata is str and exists + + Parameters + ---------- + basedir_coldata : str or Path + basic output directory for colocated data + + 
Raises + ------ + ValueError + If input is invalid. + + Returns + ------- + str + valid output directory + + """ + if isinstance(basedir_coldata, Path): + basedir_coldata = str(basedir_coldata) + if isinstance(basedir_coldata, str): + if not os.path.exists(basedir_coldata): + os.mkdir(basedir_coldata) + return basedir_coldata + raise ValueError(f"Invalid input for basedir_coldata: {basedir_coldata}") + + def _check_basedir_coldata(self): + """ + Make sure output directory for colocated data files exists + + Raises + ------ + FileNotFoundError + If :attr:`basedir_coldata` does not exist and cannot be created. + + Returns + ------- + str + current value of :attr:`basedir_coldata` + + """ + basedir_coldata = self.basedir_coldata + if basedir_coldata is None: + basedir_coldata = const.COLOCATEDDATADIR + if not os.path.exists(basedir_coldata): + logger.info(f"Creating directory: {basedir_coldata}") + os.mkdir(basedir_coldata) + elif isinstance(basedir_coldata, Path): + basedir_coldata = str(basedir_coldata) + if isinstance(basedir_coldata, str) and not os.path.exists(basedir_coldata): + os.mkdir(basedir_coldata) + if not os.path.exists(basedir_coldata): + raise FileNotFoundError( + f"Output directory for colocated data files {basedir_coldata} does not exist" + ) + self.basedir_coldata = basedir_coldata + return basedir_coldata + + @property + def basedir_logfiles(self): + """Base directory for storing logfiles""" + p = chk_make_subdir(self.basedir_coldata, "logfiles") + return p + + @property + def obs_id(self) -> str: + return self._obs_id + + @obs_id.setter + def obs_id(self, val: Optional[str]) -> None: + if self.obs_config is not None and val != self.obs_config.name: + logger.info( + f"Data ID in Pyaro config {self.obs_config.name} does not match obs_id {val}. Setting Pyaro config to None!" 
+ ) + self.obs_config = None + + self._obs_id = val + + @property + def obs_config(self) -> PyaroConfig: + return self._obs_config + + @obs_config.setter + def obs_config(self, val: Optional[PyaroConfig]) -> None: + if val is not None: + if isinstance(val, dict): + logger.info(f"Obs config was given as dict. Will try to convert to PyaroConfig") + val = PyaroConfig(**val) + if self.obs_id is not None and val.name != self.obs_id: + logger.info( + f"Data ID in Pyaro config {val.name} does not match obs_id {self.obs_id}. Setting Obs ID to match Pyaro Config!" + ) + self.obs_id = val.name + if self.obs_id is None: + self.obs_id = val.name + self._obs_config = val + + def add_glob_meta(self, **kwargs): + """ + Add global metadata to :attr:`add_meta` + + Parameters + ---------- + kwargs + metadata to be added + + Returns + ------- + None + + """ + self.add_meta.update(**kwargs) + + def __setitem__(self, key, val): + if key == "basedir_coldata": + val = self._check_input_basedir_coldata(val) + super().__setitem__(key, val) + + def _period_from_start_stop(self) -> str: + start, stop = start_stop(self.start, self.stop, stop_sub_sec=False) + y0, y1 = start.year, stop.year + assert y0 <= y1 + if y0 == y1: + return str(y0) + else: + return f"{y0}-{y1}" class Colocator(ColocationSetup): diff --git a/tests/test_colocation_setup.py b/tests/test_colocation_setup.py new file mode 100644 index 000000000..4a2494943 --- /dev/null +++ b/tests/test_colocation_setup.py @@ -0,0 +1,60 @@ +from pathlib import Path + +import pytest + +from pyaerocom import const +from pyaerocom.colocation_setup import ColocationSetup +from pyaerocom.config import ALL_REGION_NAME + +COL_OUT_DEFAULT = Path(const.OUTPUTDIR) / "colocated_data" + +default_setup = { + "model_id": None, + "obs_id": None, + "obs_vars": [], + "ts_type": "monthly", + "start": None, + "stop": None, + "filter_name": f"{ALL_REGION_NAME}-wMOUNTAINS", + "basedir_coldata": COL_OUT_DEFAULT, + "save_coldata": False, + "obs_name": None, + 
"obs_data_dir": None, + "obs_use_climatology": False, + "_obs_cache_only": False, + "obs_vert_type": None, + "obs_ts_type_read": None, + "obs_filters": {}, + "model_name": None, + "model_data_dir": None, + "model_read_opts": {}, + "read_opts_ungridded": {}, + "model_use_vars": {}, + "model_rename_vars": {}, + "model_add_vars": {}, + "model_to_stp": False, + "model_ts_type_read": None, + "model_read_aux": {}, + "model_use_climatology": False, + "gridded_reader_id": {"model": "ReadGridded", "obs": "ReadGridded"}, + "flex_ts_type": True, + "min_num_obs": None, + "resample_how": "mean", + "obs_remove_outliers": False, + "model_remove_outliers": False, + "zeros_to_nan": False, + "obs_outlier_ranges": {}, + "model_outlier_ranges": {}, + "harmonise_units": False, + "regrid_res_deg": None, + "colocate_time": False, + "reanalyse_existing": True, + "raise_exceptions": False, + "keep_data": True, + "add_meta": {}, +} + + +@pytest.mark.parametrize("stp,should_be", [(ColocationSetup(), default_setup)]) +def test_ColocationSetup(stp: ColocationSetup, should_be: dict): + pass From 6a2830fac8ccfeeb972380480f17172a5de89981 Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Sun, 14 Apr 2024 21:38:41 +0200 Subject: [PATCH 07/44] clean up syntax errors --- pyaerocom/colocation_setup.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/pyaerocom/colocation_setup.py b/pyaerocom/colocation_setup.py index e4d7aa9f1..17af5bdfb 100644 --- a/pyaerocom/colocation_setup.py +++ b/pyaerocom/colocation_setup.py @@ -269,19 +269,19 @@ class ColocationSetup(BaseModel): ########################## # Pydantic ConfigDict ########################## - model_config = ConfigDict(arbitrary_types_allowed=True, allow="extra") + model_config = ConfigDict(arbitrary_types_allowed=True, allow="extra", protected_namespaces=()) ######################### # Required Input ######################### # LB: remains to be seen if this can actually be required without chaning the 
code elsewhere - model_id: str - obs_id: str - obs_vars: list[str] - ts_type: str - start: pd.Timestamp | int - stop: pd.Timestamp | int + model_id: str | None = None + obs_id: str | None = None + obs_vars: list[str] | None = None + ts_type: str | None = None + start: pd.Timestamp | int | None = None + stop: pd.Timestamp | int | None = None ############################### # Attributes with defaults @@ -325,8 +325,8 @@ def validate_basedirs(cls, v): save_coldata: bool = False - # END OF ASSIGNMENT OF MOST COMMON PARAMETERS - BELOW ARE FURTHER - # CONFIG ATTRIBUTES, THAT ARE OPTIONAL AND LESS FREQUENTLY USED + # # END OF ASSIGNMENT OF MOST COMMON PARAMETERS - BELOW ARE FURTHER + # # CONFIG ATTRIBUTES, THAT ARE OPTIONAL AND LESS FREQUENTLY USED # Options related to obs reading and processing obs_name: str | None = None @@ -339,7 +339,7 @@ def validate_basedirs(cls, v): obs_ts_type_read: str | dict | None = None obs_filters: dict = {} _obs_is_vertical_profile: bool = False - colocation_layer_limits: dict[str:float] | None = None + colocation_layer_limits: dict[str, float] | None = None profile_layer_limits: dict | None = None read_opts_ungridded: dict | None = {} @@ -351,7 +351,7 @@ def validate_basedirs(cls, v): model_use_vars: dict[str, str] | None = {} model_rename_vars: dict[str, str] | None = {} - model_add_vars: dict[str, list[str]] | None = {} + model_add_vars: dict[str, list] | None = {} model_to_stp: bool = False model_ts_type_read: str | dict | None = None @@ -373,8 +373,8 @@ def validate_basedirs(cls, v): model_remove_outliers: bool = False # Custom outlier ranges for model and obs - obs_outlier_ranges: dict[str : tuple[float, float]] | None = {} - model_outlier_ranges: dict[str : tuple[float, float]] | None = {} + obs_outlier_ranges: dict[str, tuple[float, float]] | None = {} + model_outlier_ranges: dict[str, tuple[float, float]] | None = {} zeros_to_nan: bool = False harmonise_units: bool = False regrid_res_deg: float | None = None From 
691a97820e0044579025860b35575037d36fdef3 Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Sun, 14 Apr 2024 21:39:25 +0200 Subject: [PATCH 08/44] don't import new ColocationSetup into old tests --- pyaerocom/__init__.py | 2 -- pyaerocom/colocation_auto.py | 3 ++- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pyaerocom/__init__.py b/pyaerocom/__init__.py index 252ec7911..4397e400b 100644 --- a/pyaerocom/__init__.py +++ b/pyaerocom/__init__.py @@ -48,8 +48,6 @@ from .ungriddeddata import UngriddedData from .filter import Filter from .colocateddata import ColocatedData - -# from .colocation_setup import ColocationSetup from .colocation_auto import ColocationSetup, Colocator from .tstype import TsType from .time_resampler import TimeResampler diff --git a/pyaerocom/colocation_auto.py b/pyaerocom/colocation_auto.py index 2217b3b0b..3c4782fb3 100644 --- a/pyaerocom/colocation_auto.py +++ b/pyaerocom/colocation_auto.py @@ -22,7 +22,8 @@ correct_model_stp_coldata, ) from pyaerocom.colocation_3d import ColocatedDataLists, colocate_vertical_profile_gridded -from pyaerocom.colocation_setup import ColocationSetup + +# from pyaerocom.colocation_setup import ColocationSetup from pyaerocom.config import ALL_REGION_NAME from pyaerocom.exceptions import ColocationError, ColocationSetupError, DataCoverageError from pyaerocom.helpers import ( From 4a8e42b8e6be52e02edaca114086f09ea9e225b7 Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Sun, 14 Apr 2024 21:39:38 +0200 Subject: [PATCH 09/44] start test_colocation_setup --- tests/test_colocation_setup.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/test_colocation_setup.py b/tests/test_colocation_setup.py index 4a2494943..6a6178663 100644 --- a/tests/test_colocation_setup.py +++ b/tests/test_colocation_setup.py @@ -57,4 +57,9 @@ @pytest.mark.parametrize("stp,should_be", [(ColocationSetup(), default_setup)]) def test_ColocationSetup(stp: ColocationSetup, should_be: dict): - pass + for key, 
val in should_be.items(): + assert key in stp + if key == "basedir_coldata": + assert Path(val) == Path(stp["basedir_coldata"]) + else: + assert val == stp[key], key From c3788c55b732c4564cdf3b00c53ffd299164f889 Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Sun, 14 Apr 2024 22:01:18 +0200 Subject: [PATCH 10/44] initial test passes --- pyaerocom/colocation_setup.py | 17 +++++++++++++---- tests/test_colocation_setup.py | 11 ++++++----- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/pyaerocom/colocation_setup.py b/pyaerocom/colocation_setup.py index 17af5bdfb..1763c235b 100644 --- a/pyaerocom/colocation_setup.py +++ b/pyaerocom/colocation_setup.py @@ -2,10 +2,10 @@ import os from functools import cached_property from pathlib import Path -from typing import Literal +from typing import Iterable, Literal import pandas as pd -from pydantic import BaseModel, ConfigDict, Field, field_validator +from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator from pyaerocom import const from pyaerocom.config import ALL_REGION_NAME @@ -271,6 +271,15 @@ class ColocationSetup(BaseModel): ########################## model_config = ConfigDict(arbitrary_types_allowed=True, allow="extra", protected_namespaces=()) + # @model_validator('*', mode="before") + # def convert_to_none(cls, v): + # if isinstance(v, str) and v.strip() == "": + # return None + # if isinstance(v, Iterable) and len(v) == 0: + # return None + # else: + # return v + ######################### # Required Input ######################### @@ -334,11 +343,11 @@ def validate_basedirs(cls, v): obs_use_climatology: bool = False - _obs_cache_only: bool = False # only relevant if obs is ungridded + obs_cache_only: bool = False # only relevant if obs is ungridded obs_vert_type: str | None = None obs_ts_type_read: str | dict | None = None obs_filters: dict = {} - _obs_is_vertical_profile: bool = False + obs_is_vertical_profile: bool = False colocation_layer_limits: dict[str, float] | None 
= None profile_layer_limits: dict | None = None read_opts_ungridded: dict | None = {} diff --git a/tests/test_colocation_setup.py b/tests/test_colocation_setup.py index 6a6178663..509adc611 100644 --- a/tests/test_colocation_setup.py +++ b/tests/test_colocation_setup.py @@ -11,7 +11,7 @@ default_setup = { "model_id": None, "obs_id": None, - "obs_vars": [], + "obs_vars": None, "ts_type": "monthly", "start": None, "stop": None, @@ -21,7 +21,7 @@ "obs_name": None, "obs_data_dir": None, "obs_use_climatology": False, - "_obs_cache_only": False, + "obs_cache_only": False, "obs_vert_type": None, "obs_ts_type_read": None, "obs_filters": {}, @@ -57,9 +57,10 @@ @pytest.mark.parametrize("stp,should_be", [(ColocationSetup(), default_setup)]) def test_ColocationSetup(stp: ColocationSetup, should_be: dict): + stp_dict = stp.model_dump() for key, val in should_be.items(): - assert key in stp + assert key in stp_dict if key == "basedir_coldata": - assert Path(val) == Path(stp["basedir_coldata"]) + assert Path(val) == Path(stp_dict["basedir_coldata"]) else: - assert val == stp[key], key + assert val == stp_dict[key], key From 1e1118e0ee453472d10ca8640280f66b65fa0835 Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Mon, 13 May 2024 10:38:54 +0200 Subject: [PATCH 11/44] ColocationSetup init and model_validator --- pyaerocom/colocation_setup.py | 43 ++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/pyaerocom/colocation_setup.py b/pyaerocom/colocation_setup.py index 1763c235b..95802abf8 100644 --- a/pyaerocom/colocation_setup.py +++ b/pyaerocom/colocation_setup.py @@ -5,7 +5,14 @@ from typing import Iterable, Literal import pandas as pd -from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator +from pydantic import ( + BaseModel, + ConfigDict, + Field, + ValidationError, + field_validator, + model_validator, +) from pyaerocom import const from pyaerocom.config import ALL_REGION_NAME @@ -396,6 +403,40 @@ def 
validate_basedirs(cls, v): # TODO: implelent field validators # self.update(**kwargs) + # Override __init__ to allow for positional arguments + def __init__( + self, + model_id=None, + obs_config=None, + obs_id=None, + obs_vars=None, + ts_type=None, + start=None, + stop=None, + basedir_coldata=None, + save_coldata=False, + **kwargs, + ) -> None: + super(ColocationSetup, self).__init__( + model_id=model_id, + obs_config=obs_config, + obs_id=obs_id, + obs_vars=obs_vars, + ts_type=ts_type, + start=start, + stop=stop, + basedir_coldata=basedir_coldata, + save_coldata=save_coldata, + **kwargs, + ) + + # Model validator for forbidden keys + @model_validator(mode="after") + def validate_no_forbidden_keys(self): + for key in self.FORBIDDEN_KEYS: + if key in self.model_fields: # LB: Check this is where they will be found + raise ValidationError + # TODO: validator for extra arguments. what are they? @cached_property From 6a910dddd18bef3ab20b5bd19e9aea665ba0e7b1 Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Mon, 13 May 2024 10:48:00 +0200 Subject: [PATCH 12/44] new ColocatedSetup in test_colocation_auto WIP --- pyaerocom/colocation_setup.py | 7 ++++++- tests/test_colocation_auto.py | 23 +++++++++++++---------- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/pyaerocom/colocation_setup.py b/pyaerocom/colocation_setup.py index 95802abf8..018670e00 100644 --- a/pyaerocom/colocation_setup.py +++ b/pyaerocom/colocation_setup.py @@ -449,7 +449,12 @@ def basedir_logfiles(self): @field_validator("obs_id") def validate_obs_id(cls, v: str): if cls.obs_config is not None and v != cls.obs.config.name: - logger + logger.info( + f"Data ID in Pyaro config {cls.obs_config.name} does not match obs_id {v}. Setting Pyaro config to None!" + ) + cls.obs_config = None + + cls.obs_id = v # LB: Think we need a validator on the PyaroConfig, not the obs_id. # Combining the validation logic from those two things here. needs testing. 
diff --git a/tests/test_colocation_auto.py b/tests/test_colocation_auto.py index ac023f13a..5e362584b 100644 --- a/tests/test_colocation_auto.py +++ b/tests/test_colocation_auto.py @@ -2,9 +2,11 @@ import numpy as np import pytest +from pydantic import ValidationError from pyaerocom import ColocatedData, GriddedData, UngriddedData, const -from pyaerocom.colocation_auto import ColocationSetup, Colocator +from pyaerocom.colocation_auto import Colocator +from pyaerocom.colocation_setup import ColocationSetup from pyaerocom.config import ALL_REGION_NAME from pyaerocom.exceptions import ColocationError, ColocationSetupError from pyaerocom.io.aux_read_cubes import add_cubes @@ -16,7 +18,7 @@ default_setup = { "model_id": None, "obs_id": None, - "obs_vars": [], + "obs_vars": None, "ts_type": "monthly", "start": None, "stop": None, @@ -26,7 +28,7 @@ "obs_name": None, "obs_data_dir": None, "obs_use_climatology": False, - "_obs_cache_only": False, + "obs_cache_only": False, "obs_vert_type": None, "obs_ts_type_read": None, "obs_filters": {}, @@ -79,21 +81,22 @@ def col(): @pytest.mark.parametrize("stp,should_be", [(ColocationSetup(), default_setup)]) def test_colocation_setup(stp: ColocationSetup, should_be: dict): + stp_dict = stp.model_dump() for key, val in should_be.items(): - assert key in stp + assert key in stp_dict if key == "basedir_coldata": - assert Path(val) == Path(stp["basedir_coldata"]) + assert Path(val) == Path(stp_dict["basedir_coldata"]) else: - assert val == stp[key], key + assert val == stp_dict[key], key @pytest.mark.parametrize( "key,val,raises", [ - ("obs_vars", 42, pytest.raises(ValueError)), - ("var_ref_outlier_ranges", [41, 42], pytest.raises(KeyError)), - ("var_outlier_ranges", [41, 42], pytest.raises(KeyError)), - ("remove_outliers", True, pytest.raises(KeyError)), + ("obs_vars", 42, ValidationError), + ("var_ref_outlier_ranges", [41, 42], ValidationError), + ("var_outlier_ranges", [41, 42], ValidationError), + ("remove_outliers", True, 
ValidationError), ], ) def test_ColocationSetup_invalid_input(key, val, raises): From cf3e4eb69b826f0a2a4248b6b46b0d9c3494b553 Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Wed, 15 May 2024 20:02:22 +0200 Subject: [PATCH 13/44] maybe use obs_config in default_setup --- tests/test_colocation_setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_colocation_setup.py b/tests/test_colocation_setup.py index 509adc611..a4e136781 100644 --- a/tests/test_colocation_setup.py +++ b/tests/test_colocation_setup.py @@ -12,6 +12,7 @@ "model_id": None, "obs_id": None, "obs_vars": None, + "obs_config": None, "ts_type": "monthly", "start": None, "stop": None, From 52fcd990a2a56e5e526f6528dc66506035218f33 Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Wed, 15 May 2024 21:09:08 +0200 Subject: [PATCH 14/44] obs_config problematic. positional args working --- pyaerocom/colocation_setup.py | 57 +++++++++++++++++----------------- tests/test_colocation_setup.py | 2 +- 2 files changed, 29 insertions(+), 30 deletions(-) diff --git a/pyaerocom/colocation_setup.py b/pyaerocom/colocation_setup.py index 018670e00..a178a64eb 100644 --- a/pyaerocom/colocation_setup.py +++ b/pyaerocom/colocation_setup.py @@ -288,23 +288,22 @@ class ColocationSetup(BaseModel): # return v ######################### - # Required Input + # Init Input ######################### # LB: remains to be seen if this can actually be required without chaning the code elsewhere - model_id: str | None = None - obs_id: str | None = None - obs_vars: list[str] | None = None - ts_type: str | None = None - start: pd.Timestamp | int | None = None - stop: pd.Timestamp | int | None = None + model_id: str | None # = None + obs_id: str | None # = None + obs_vars: list[str] | str | None # = None + ts_type: str # = None + start: pd.Timestamp | int | None # = None + stop: pd.Timestamp | int | None # = None + obs_config: PyaroConfig | None # = None ############################### # Attributes with defaults 
############################### - obs_config: PyaroConfig | None = None - #: Dictionary specifying alternative vertical types that may be used to #: read model data. E.g. consider the variable is ec550aer, #: obs_vert_type='Surface' and obs_vert_type_alt=dict(Surface='ModelLevel'). @@ -324,12 +323,10 @@ class ColocationSetup(BaseModel): "remove_outliers", # deprecated since v0.12.0 ] - ts_type: str = "monthly" - # crashes if input filter name is invalid filter_name: str = f"{ALL_REGION_NAME}-wMOUNTAINS" - basedir_coldata: str = Field(default=const.COLOCATEDDATADIR, validate_default=True) + basedir_coldata: str # = Field(default=const.COLOCATEDDATADIR, validate_default=True) @field_validator("basedir_coldata") @classmethod @@ -406,15 +403,15 @@ def validate_basedirs(cls, v): # Override __init__ to allow for positional arguments def __init__( self, - model_id=None, - obs_config=None, - obs_id=None, - obs_vars=None, - ts_type=None, - start=None, - stop=None, - basedir_coldata=None, - save_coldata=False, + model_id: str | None = None, + obs_config: PyaroConfig | None = None, + obs_id: str | None = None, + obs_vars: list[str] | None = None, + ts_type: str = "monthly", + start: pd.Timestamp | int | None = None, + stop: pd.Timestamp | int | None = None, + basedir_coldata: str = const.COLOCATEDDATADIR, + save_coldata: bool = False, **kwargs, ) -> None: super(ColocationSetup, self).__init__( @@ -446,19 +443,21 @@ def basedir_logfiles(self): p.mkdir(parents=True, exist_ok=True) return str(p) # LB: not sure why pyaerocom insists these be strings as this point - @field_validator("obs_id") - def validate_obs_id(cls, v: str): - if cls.obs_config is not None and v != cls.obs.config.name: - logger.info( - f"Data ID in Pyaro config {cls.obs_config.name} does not match obs_id {v}. Setting Pyaro config to None!" 
- ) - cls.obs_config = None + # @field_validator("obs_id") + # def validate_obs_id(cls, v: str): + # if cls.obs_config is not None and v != cls.obs.config.name: + # logger.info( + # f"Data ID in Pyaro config {cls.obs_config.name} does not match obs_id {v}. Setting Pyaro config to None!" + # ) + # cls.obs_config = None - cls.obs_id = v + # cls.obs_id = v # LB: Think we need a validator on the PyaroConfig, not the obs_id. # Combining the validation logic from those two things here. needs testing. + # LB: this needs serious work @field_validator("obs_config") + @classmethod def validate_obs_config(cls, v: PyaroConfig): if cls.obs_config is not None and cls.obs.config.name != cls.obs_id: logger.info( diff --git a/tests/test_colocation_setup.py b/tests/test_colocation_setup.py index a4e136781..41cdfa93a 100644 --- a/tests/test_colocation_setup.py +++ b/tests/test_colocation_setup.py @@ -12,7 +12,7 @@ "model_id": None, "obs_id": None, "obs_vars": None, - "obs_config": None, + # "obs_config": None, "ts_type": "monthly", "start": None, "stop": None, From f15c3079c48108c9abe066eaa03f55e53498fa5f Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Wed, 22 May 2024 16:05:25 +0200 Subject: [PATCH 15/44] colocation_setup testing --- pyaerocom/colocation_auto.py | 1005 +++++++++++++++++---------------- pyaerocom/colocation_setup.py | 30 +- tests/test_colocation_auto.py | 4 +- 3 files changed, 529 insertions(+), 510 deletions(-) diff --git a/pyaerocom/colocation_auto.py b/pyaerocom/colocation_auto.py index 63ff8560e..06ee3fea6 100644 --- a/pyaerocom/colocation_auto.py +++ b/pyaerocom/colocation_auto.py @@ -22,6 +22,7 @@ correct_model_stp_coldata, ) from pyaerocom.colocation_3d import ColocatedDataLists, colocate_vertical_profile_gridded +from pyaerocom.colocation_setup import ColocationSetup # from pyaerocom.colocation_setup import ColocationSetup from pyaerocom.config import ALL_REGION_NAME @@ -41,506 +42,506 @@ logger = logging.getLogger(__name__) -class 
ColocationSetup(BrowseDict): - """ - Setup class for high-level model / obs co-location. - - An instance of this setup class can be used to run a colocation analysis - between a model and an observation network and will create a number of - :class:`pya.ColocatedData` instances, which can be saved automatically - as NetCDF files. - - Apart from co-location, this class also handles reading of the input data - for co-location. Supported co-location options are: - - 1. gridded vs. ungridded data - For instance 3D model data (instance of :class:`GriddedData`) with lat, - lon and time dimension that is co-located with station based observations - which are represented in pyaerocom through :class:`UngriddedData` objects. - The co-location function used is - :func:`pyaerocom.colocation.colocated_gridded_ungridded`. For this type of - co-location, the output co-located data object will be 3-dimensional, - with dimensions `data_source` (index 0: obs, index 1: model), `time` and - `station_name`. - - 2. gridded vs. gridded data - For instance 3D model data that is co-located with 3D satellite data - (both instances of :class:`GriddedData`), both objects with lat, - lon and time dimensions. The co-location function used - is :func:`pyaerocom.colocation.colocated_gridded_gridded`. - For this type of co-location, the output co-located data object will be - 4-dimensional, with dimensions `data_source` (index 0: obs, index 1: - model), `time` and `latitude` and `longitude`. - - - Attributes - ---------- - model_id : str - ID of model to be used. - - obs_config: PyaroConfig - In the case Pyaro is used, a config must be provided. In that case obs_id(see below) - is ignored and only the config is used. - obs_id : str - ID of observation network to be used. - obs_vars : list - Variables to be analysed (need to be available in input obs dataset). - Variables that are not available in the model data output will be - skipped. 
Alternatively, model variables to be used for a given obs - variable can also be specified via attributes :attr:`model_use_vars` - and :attr:`model_add_vars`. - ts_type : str - String specifying colocation output frequency. - start - Start time of colocation. Input can be integer denoting the year or - anything that can be converted into :class:`pandas.Timestamp` using - :func:`pyaerocom.helpers.to_pandas_timestamp`. If None, than the first - available date in the model data is used. - stop - stop time of colocation. int or anything that can be converted into - :class:`pandas.Timestamp` using - :func:`pyaerocom.helpers.to_pandas_timestamp` or None. If None and if - ``start`` is on resolution of year (e.g. ``start=2010``) then ``stop`` - will be automatically set to the end of that year. Else, it will be - set to the last available timestamp in the model data. - filter_name : str - name of filter to be applied. If None, no filter is used - (to be precise, if None, then - :attr:`pyaerocom.const.DEFAULT_REG_FILTER` is used which should - default to `ALL-wMOUNTAINS`, that is, no filtering). - basedir_coldata : str - Base directory for storing of colocated data files. - save_coldata : bool - if True, colocated data objects are saved as NetCDF file. - obs_name : str, optional - if provided, this string will be used in colocated data filename to - specify obsnetwork, else obs_id will be used. - obs_data_dir : str, optional - location of obs data. If None, attempt to infer obs location based on - obs ID. - obs_use_climatology : bool - BETA if True, pyaerocom default climatology is computed from observation - stations (so far only possible for unrgidded / gridded colocation). - obs_vert_type : str - AeroCom vertical code encoded in the model filenames (only AeroCom 3 - and later). Specifies which model file should be read in case there are - multiple options (e.g. surface level data can be read from a - *Surface*.nc file as well as from a *ModelLevel*.nc file). 
If input is - string (e.g. 'Surface'), then the corresponding vertical type code is - used for reading of all variables that are colocated (i.e. that are - specified in :attr:`obs_vars`). - obs_ts_type_read : str or dict, optional - may be specified to explicitly define the reading frequency of the - observation data (so far, this does only apply to gridded obsdata such - as satellites), either as str (same for all obs variables) or variable - specific as dict. For ungridded reading, the frequency may be specified - via :attr:`obs_id`, where applicable (e.g. AeronetSunV3Lev2.daily). - Not to be confused with :attr:`ts_type`, which specifies the - frequency used for colocation. Can be specified variable specific in - form of dictionary. - obs_filters : dict - filters applied to the observational dataset before co-location. - In case of gridded / gridded, these are filters that can be passed to - :func:`pyaerocom.io.ReadGridded.read_var`, for instance, `flex_ts_type`, - or `constraints`. In case the obsdata is ungridded (gridded / ungridded - co-locations) these are filters that are handled through keyword - `filter_post` in :func:`pyaerocom.io.ReadUngridded.read`. These filters - are applied to the :class:`UngriddedData` objects after reading and - caching the data, so changing them, will not invalidate the latest - cache of the :class:`UngriddedData`. - read_opts_ungridded : dict, optional - dictionary that specifies reading constraints for ungridded reading, - and are passed as `**kwargs` to :func:`pyaerocom.io.ReadUngridded.read`. - Note that - other than for `obs_filters` these filters are applied - during the reading of the :class:`UngriddedData` objects and specifying - them will deactivate caching. - model_name : str, optional - if provided, this string will be used in colocated data filename to - specify model, else obs_id will be used. - model_data_dir : str, optional - Location of model data. If None, attempt to infer model location based - on model ID. 
- model_read_opts : dict, optional - options for model reading (passed as keyword args to - :func:`pyaerocom.io.ReadUngridded.read`). - model_use_vars : dict, optional - dictionary that specifies mapping of model variables. Keys are - observation variables, values are the corresponding model variables - (e.g. model_use_vars=dict(od550aer='od550csaer')). Example: your - observation has var *od550aer* but your model model uses a different - variable name for that variable, say *od550*. Then, you can specify - this via `model_use_vars = {'od550aer' : 'od550'}`. NOTE: in this case, - a model variable *od550aer* will be ignored, even if it exists - (cf :attr:`model_add_vars`). - model_rename_vars : dict, optional - rename certain model variables **after** co-location, before storing - the associated :class:`ColocatedData` object on disk. Keys are model - variables, values are new names - (e.g. `model_rename_vars={'od550aer':'MyAOD'}`). - Note: this does not impact which variables are read from the model. - model_add_vars : dict, optional - additional model variables to be processed for one obs variable. E.g. - `model_add_vars={'od550aer': ['od550so4', 'od550gt1aer']}` would - co-locate both model SO4 AOD (od550so4) and model coarse mode AOD - (od550gt1aer) with total AOD (od550aer) from obs (in addition to - od550aer vs od550aer if applicable). - model_to_stp : bool - ALPHA (please do not use): convert model data values to STP conditions - after co-location. Note: this only works for very particular settings - at the moment and needs revision, as it relies on access to - meteorological data. - model_ts_type_read : str or dict, optional - may be specified to explicitly define the reading frequency of the - model data, either as str (same for all obs variables) or variable - specific as dict. Not to be confused with :attr:`ts_type`, which - specifies the output frequency of the co-located data. 
- model_read_aux : dict, optional - may be used to specify additional computation methods of variables from - models. Keys are variables to be computed, values are dictionaries with - keys `vars_required` (list of required variables for computation of var - and `fun` (method that takes list of read data objects and computes - and returns var). - model_use_climatology : bool - if True, attempt to use climatological model data field. Note: this - only works if model data is in AeroCom conventions (climatological - fields are indicated with 9999 as year in the filename) and if this is - active, only single year analysis are supported (i.e. provide int to - :attr:`start` to specify the year and leave :attr:`stop` empty). - gridded_reader_id : dict - BETA: dictionary specifying which gridded reader is supposed to be used - for model (and gridded obs) reading. Note: this is a workaround - solution and will likely be removed in the future when the gridded - reading API is more harmonised - (see https://github.com/metno/pyaerocom/issues/174). - flex_ts_type : bool - Bboolean specifying whether reading frequency of gridded data is - allowed to be flexible. This includes all gridded data, whether it is - model or gridded observation (e.g. satellites). Defaults to True. - min_num_obs : dict or int, optional - time resampling constraints applied, defaults to None, in which case - no constraints are applied. For instance, say your input is in daily - resolution and you want output in monthly and you want to make sure to - have roughly 50% daily coverage for the monthly averages. Then you may - specify `min_num_obs=15` which will ensure that at least 15 daily - averages are available to compute a monthly average. However, you may - also define a hierarchical scheme that first goes from daily to - weekly and then from weekly to monthly, via a dict. E.g. 
- `min_num_obs=dict(monthly=dict(weekly=4), weekly=dict(daily=3))` would - ensure that each week has at least 3 daily values, as well as that each - month has at least 4 weekly values. - resample_how : str or dict, optional - string specifying how data should be aggregated when resampling in time. - Default is "mean". Can also be a nested dictionary, e.g. - `resample_how={'conco3': 'daily': {'hourly' : 'max'}}` would use the - maximum value to aggregate from hourly to daily for variable conco3, - rather than the mean. - obs_remove_outliers : bool - if True, outliers are removed from obs data before colocation, - else not. Default is False. - Custom outlier ranges for each variable can be specified via - :attr:`obs_outlier_ranges`, and for all other variables, the pyaerocom - default outlier ranges are used. The latter are specified in - `variables.ini` file via `minimum` and `maximum` attributes and can - also be accessed through :attr:`pyaerocom.variable.Variable.minimum` - and :attr:`pyaerocom.variable.Variable.maximum`, respectively. - model_remove_outliers : bool - if True, outliers are removed from model data (normally this should be - set to False, as the models are supposed to be assessed, including - outlier cases). Default is False. - Custom outlier ranges for each variable can be specified via - :attr:`model_outlier_ranges`, and for all other variables, the pyaerocom - default outlier ranges are used. The latter are specified in - `variables.ini` file via `minimum` and `maximum` attributes and can - also be accessed through :attr:`pyaerocom.variable.Variable.minimum` - and :attr:`pyaerocom.variable.Variable.maximum`, respectively. - obs_outlier_ranges : dict, optional - dictionary specifying outlier ranges for individual obs variables. - (e.g. dict(od550aer = [-0.05, 10], ang4487aer=[0,4])). Only relevant - if :attr:`obs_remove_outliers` is True. - model_outlier_ranges : dict, optional - like :attr:`obs_outlier_ranges` but for model variables. 
Only relevant - if :attr:`model_remove_outliers` is True. - zeros_to_nan : bool - If True, zero's in output co-located data object will be converted to - NaN. Default is False. - harmonise_units : bool - if True, units are attempted to be harmonised during co-location - (note: raises Exception if True and in case units cannot be harmonised). - regrid_res_deg : int, optional - resolution in degrees for regridding of model grid (done before - co-location). Default is None. - colocate_time : bool - if True and if obs and model sampling frequency (e.g. daily) are higher - than output colocation frequency (e.g. monthly), then the datasets are - first colocated in time (e.g. on a daily basis), before the monthly - averages are calculated. Default is False. - reanalyse_existing : bool - if True, always redo co-location, even if there is already an existing - co-located NetCDF file (under the output location specified by - :attr:`basedir_coldata` ) for the given variable combination to be - co-located. If False and output already exists, then co-location is - skipped for the associated variable. Default is True. - raise_exceptions : bool - if True, Exceptions that may occur for individual variables to be - processed, are raised, else the analysis is skipped for such cases. - keep_data : bool - if True, then all colocated data objects computed when running - :func:`run` will be stored in :attr:`data`. Defaults to True. - add_meta : dict - additional metadata that is supposed to be added to each output - :class:`ColocatedData` object. - """ - - #: Dictionary specifying alternative vertical types that may be used to - #: read model data. E.g. consider the variable is ec550aer, - #: obs_vert_type='Surface' and obs_vert_type_alt=dict(Surface='ModelLevel'). 
- #: Now, if a model that is used for the analysis does not contain a data - #: file for ec550aer at the surface ('*ec550aer*Surface*.nc'), then, the - #: colocation routine will look for '*ec550aer*ModelLevel*.nc' and if this - #: exists, it will load it and extract the surface level. - OBS_VERT_TYPES_ALT = {"Surface": "ModelLevel", "2D": "2D"} - - #: do not raise Exception if invalid item is attempted to be assigned - #: (Overwritten from base class) - CRASH_ON_INVALID = False - - FORBIDDEN_KEYS = [ - "var_outlier_ranges", # deprecated since v0.12.0 - "var_ref_outlier_ranges", # deprecated since v0.12.0 - "remove_outliers", # deprecated since v0.12.0 - ] - - ts_type = StrWithDefault("monthly") - obs_vars = ListOfStrings() - - def __init__( - self, - model_id=None, - obs_config: Optional[PyaroConfig] = None, - obs_id=None, - obs_vars=None, - ts_type=None, - start=None, - stop=None, - basedir_coldata=None, - save_coldata=False, - **kwargs, - ): - self.model_id = model_id - self._obs_id = None - self._obs_config = None - - self.obs_id = obs_id - self.obs_config = obs_config - - self.obs_vars = obs_vars - - self.ts_type = ts_type - self.start = start - self.stop = stop - - # crashes if input filter name is invalid - self.filter_name = f"{ALL_REGION_NAME}-wMOUNTAINS" - - if basedir_coldata is not None: - basedir_coldata = self._check_input_basedir_coldata(basedir_coldata) - else: - basedir_coldata = const.COLOCATEDDATADIR - self.basedir_coldata = basedir_coldata - self.save_coldata = save_coldata - - # END OF ASSIGNMENT OF MOST COMMON PARAMETERS - BELOW ARE FURTHER - # CONFIG ATTRIBUTES, THAT ARE OPTIONAL AND LESS FREQUENTLY USED - - # Options related to obs reading and processing - self.obs_name = None - self.obs_data_dir = None - - self.obs_use_climatology = False - - self._obs_cache_only = False # only relevant if obs is ungridded - self.obs_vert_type = None - self.obs_ts_type_read = None - self.obs_filters = {} - self._obs_is_vertical_profile = False - 
self.colocation_layer_limits = None - self.profile_layer_limits = None - - self.read_opts_ungridded = {} - - # Attributes related to model data - self.model_name = None - self.model_data_dir = None - - self.model_read_opts = {} - - self.model_use_vars = {} - self.model_rename_vars = {} - self.model_add_vars = {} - self.model_to_stp = False - - self.model_ts_type_read = None - self.model_read_aux = {} - self.model_use_climatology = False - - self.model_kwargs = {} - - self.gridded_reader_id = {"model": "ReadGridded", "obs": "ReadGridded"} - - self.flex_ts_type = True - - # Options related to time resampling - self.min_num_obs = None - self.resample_how = "mean" - - # Options related to outlier removal - self.obs_remove_outliers = False - self.model_remove_outliers = False - - # Custom outlier ranges for model and obs - self.obs_outlier_ranges = {} - self.model_outlier_ranges = {} - - self.zeros_to_nan = False - self.harmonise_units = False - self.regrid_res_deg = None - self.colocate_time = False - - self.reanalyse_existing = True - self.raise_exceptions = False - self.keep_data = True - - self.add_meta = {} - self.update(**kwargs) - - def _check_input_basedir_coldata(self, basedir_coldata): - """ - Make sure input basedir_coldata is str and exists - - Parameters - ---------- - basedir_coldata : str or Path - basic output directory for colocated data - - Raises - ------ - ValueError - If input is invalid. - - Returns - ------- - str - valid output directory - - """ - if isinstance(basedir_coldata, Path): - basedir_coldata = str(basedir_coldata) - if isinstance(basedir_coldata, str): - if not os.path.exists(basedir_coldata): - os.mkdir(basedir_coldata) - return basedir_coldata - raise ValueError(f"Invalid input for basedir_coldata: {basedir_coldata}") - - def _check_basedir_coldata(self): - """ - Make sure output directory for colocated data files exists - - Raises - ------ - FileNotFoundError - If :attr:`basedir_coldata` does not exist and cannot be created. 
- - Returns - ------- - str - current value of :attr:`basedir_coldata` - - """ - basedir_coldata = self.basedir_coldata - if basedir_coldata is None: - basedir_coldata = const.COLOCATEDDATADIR - if not os.path.exists(basedir_coldata): - logger.info(f"Creating directory: {basedir_coldata}") - os.mkdir(basedir_coldata) - elif isinstance(basedir_coldata, Path): - basedir_coldata = str(basedir_coldata) - if isinstance(basedir_coldata, str) and not os.path.exists(basedir_coldata): - os.mkdir(basedir_coldata) - if not os.path.exists(basedir_coldata): - raise FileNotFoundError( - f"Output directory for colocated data files {basedir_coldata} does not exist" - ) - self.basedir_coldata = basedir_coldata - return basedir_coldata - - @property - def basedir_logfiles(self): - """Base directory for storing logfiles""" - p = chk_make_subdir(self.basedir_coldata, "logfiles") - return p - - @property - def obs_id(self) -> str: - return self._obs_id - - @obs_id.setter - def obs_id(self, val: Optional[str]) -> None: - if self.obs_config is not None and val != self.obs_config.name: - logger.info( - f"Data ID in Pyaro config {self.obs_config.name} does not match obs_id {val}. Setting Pyaro config to None!" - ) - self.obs_config = None - - self._obs_id = val - - @property - def obs_config(self) -> PyaroConfig: - return self._obs_config - - @obs_config.setter - def obs_config(self, val: Optional[PyaroConfig]) -> None: - if val is not None: - if isinstance(val, dict): - logger.info(f"Obs config was given as dict. Will try to convert to PyaroConfig") - val = PyaroConfig(**val) - if self.obs_id is not None and val.name != self.obs_id: - logger.info( - f"Data ID in Pyaro config {val.name} does not match obs_id {self.obs_id}. Setting Obs ID to match Pyaro Config!" 
- ) - self.obs_id = val.name - if self.obs_id is None: - self.obs_id = val.name - self._obs_config = val - - def add_glob_meta(self, **kwargs): - """ - Add global metadata to :attr:`add_meta` - - Parameters - ---------- - kwargs - metadata to be added - - Returns - ------- - None - - """ - self.add_meta.update(**kwargs) - - def __setitem__(self, key, val): - if key == "basedir_coldata": - val = self._check_input_basedir_coldata(val) - super().__setitem__(key, val) - - def _period_from_start_stop(self) -> str: - start, stop = start_stop(self.start, self.stop, stop_sub_sec=False) - y0, y1 = start.year, stop.year - assert y0 <= y1 - if y0 == y1: - return str(y0) - else: - return f"{y0}-{y1}" +# class ColocationSetup(BrowseDict): +# """ +# Setup class for high-level model / obs co-location. + +# An instance of this setup class can be used to run a colocation analysis +# between a model and an observation network and will create a number of +# :class:`pya.ColocatedData` instances, which can be saved automatically +# as NetCDF files. + +# Apart from co-location, this class also handles reading of the input data +# for co-location. Supported co-location options are: + +# 1. gridded vs. ungridded data +# For instance 3D model data (instance of :class:`GriddedData`) with lat, +# lon and time dimension that is co-located with station based observations +# which are represented in pyaerocom through :class:`UngriddedData` objects. +# The co-location function used is +# :func:`pyaerocom.colocation.colocated_gridded_ungridded`. For this type of +# co-location, the output co-located data object will be 3-dimensional, +# with dimensions `data_source` (index 0: obs, index 1: model), `time` and +# `station_name`. + +# 2. gridded vs. gridded data +# For instance 3D model data that is co-located with 3D satellite data +# (both instances of :class:`GriddedData`), both objects with lat, +# lon and time dimensions. 
The co-location function used +# is :func:`pyaerocom.colocation.colocated_gridded_gridded`. +# For this type of co-location, the output co-located data object will be +# 4-dimensional, with dimensions `data_source` (index 0: obs, index 1: +# model), `time` and `latitude` and `longitude`. + + +# Attributes +# ---------- +# model_id : str +# ID of model to be used. + +# obs_config: PyaroConfig +# In the case Pyaro is used, a config must be provided. In that case obs_id(see below) +# is ignored and only the config is used. +# obs_id : str +# ID of observation network to be used. +# obs_vars : list +# Variables to be analysed (need to be available in input obs dataset). +# Variables that are not available in the model data output will be +# skipped. Alternatively, model variables to be used for a given obs +# variable can also be specified via attributes :attr:`model_use_vars` +# and :attr:`model_add_vars`. +# ts_type : str +# String specifying colocation output frequency. +# start +# Start time of colocation. Input can be integer denoting the year or +# anything that can be converted into :class:`pandas.Timestamp` using +# :func:`pyaerocom.helpers.to_pandas_timestamp`. If None, than the first +# available date in the model data is used. +# stop +# stop time of colocation. int or anything that can be converted into +# :class:`pandas.Timestamp` using +# :func:`pyaerocom.helpers.to_pandas_timestamp` or None. If None and if +# ``start`` is on resolution of year (e.g. ``start=2010``) then ``stop`` +# will be automatically set to the end of that year. Else, it will be +# set to the last available timestamp in the model data. +# filter_name : str +# name of filter to be applied. If None, no filter is used +# (to be precise, if None, then +# :attr:`pyaerocom.const.DEFAULT_REG_FILTER` is used which should +# default to `ALL-wMOUNTAINS`, that is, no filtering). +# basedir_coldata : str +# Base directory for storing of colocated data files. 
+# save_coldata : bool +# if True, colocated data objects are saved as NetCDF file. +# obs_name : str, optional +# if provided, this string will be used in colocated data filename to +# specify obsnetwork, else obs_id will be used. +# obs_data_dir : str, optional +# location of obs data. If None, attempt to infer obs location based on +# obs ID. +# obs_use_climatology : bool +# BETA if True, pyaerocom default climatology is computed from observation +# stations (so far only possible for unrgidded / gridded colocation). +# obs_vert_type : str +# AeroCom vertical code encoded in the model filenames (only AeroCom 3 +# and later). Specifies which model file should be read in case there are +# multiple options (e.g. surface level data can be read from a +# *Surface*.nc file as well as from a *ModelLevel*.nc file). If input is +# string (e.g. 'Surface'), then the corresponding vertical type code is +# used for reading of all variables that are colocated (i.e. that are +# specified in :attr:`obs_vars`). +# obs_ts_type_read : str or dict, optional +# may be specified to explicitly define the reading frequency of the +# observation data (so far, this does only apply to gridded obsdata such +# as satellites), either as str (same for all obs variables) or variable +# specific as dict. For ungridded reading, the frequency may be specified +# via :attr:`obs_id`, where applicable (e.g. AeronetSunV3Lev2.daily). +# Not to be confused with :attr:`ts_type`, which specifies the +# frequency used for colocation. Can be specified variable specific in +# form of dictionary. +# obs_filters : dict +# filters applied to the observational dataset before co-location. +# In case of gridded / gridded, these are filters that can be passed to +# :func:`pyaerocom.io.ReadGridded.read_var`, for instance, `flex_ts_type`, +# or `constraints`. 
In case the obsdata is ungridded (gridded / ungridded +# co-locations) these are filters that are handled through keyword +# `filter_post` in :func:`pyaerocom.io.ReadUngridded.read`. These filters +# are applied to the :class:`UngriddedData` objects after reading and +# caching the data, so changing them, will not invalidate the latest +# cache of the :class:`UngriddedData`. +# read_opts_ungridded : dict, optional +# dictionary that specifies reading constraints for ungridded reading, +# and are passed as `**kwargs` to :func:`pyaerocom.io.ReadUngridded.read`. +# Note that - other than for `obs_filters` these filters are applied +# during the reading of the :class:`UngriddedData` objects and specifying +# them will deactivate caching. +# model_name : str, optional +# if provided, this string will be used in colocated data filename to +# specify model, else obs_id will be used. +# model_data_dir : str, optional +# Location of model data. If None, attempt to infer model location based +# on model ID. +# model_read_opts : dict, optional +# options for model reading (passed as keyword args to +# :func:`pyaerocom.io.ReadUngridded.read`). +# model_use_vars : dict, optional +# dictionary that specifies mapping of model variables. Keys are +# observation variables, values are the corresponding model variables +# (e.g. model_use_vars=dict(od550aer='od550csaer')). Example: your +# observation has var *od550aer* but your model model uses a different +# variable name for that variable, say *od550*. Then, you can specify +# this via `model_use_vars = {'od550aer' : 'od550'}`. NOTE: in this case, +# a model variable *od550aer* will be ignored, even if it exists +# (cf :attr:`model_add_vars`). +# model_rename_vars : dict, optional +# rename certain model variables **after** co-location, before storing +# the associated :class:`ColocatedData` object on disk. Keys are model +# variables, values are new names +# (e.g. `model_rename_vars={'od550aer':'MyAOD'}`). 
+# Note: this does not impact which variables are read from the model. +# model_add_vars : dict, optional +# additional model variables to be processed for one obs variable. E.g. +# `model_add_vars={'od550aer': ['od550so4', 'od550gt1aer']}` would +# co-locate both model SO4 AOD (od550so4) and model coarse mode AOD +# (od550gt1aer) with total AOD (od550aer) from obs (in addition to +# od550aer vs od550aer if applicable). +# model_to_stp : bool +# ALPHA (please do not use): convert model data values to STP conditions +# after co-location. Note: this only works for very particular settings +# at the moment and needs revision, as it relies on access to +# meteorological data. +# model_ts_type_read : str or dict, optional +# may be specified to explicitly define the reading frequency of the +# model data, either as str (same for all obs variables) or variable +# specific as dict. Not to be confused with :attr:`ts_type`, which +# specifies the output frequency of the co-located data. +# model_read_aux : dict, optional +# may be used to specify additional computation methods of variables from +# models. Keys are variables to be computed, values are dictionaries with +# keys `vars_required` (list of required variables for computation of var +# and `fun` (method that takes list of read data objects and computes +# and returns var). +# model_use_climatology : bool +# if True, attempt to use climatological model data field. Note: this +# only works if model data is in AeroCom conventions (climatological +# fields are indicated with 9999 as year in the filename) and if this is +# active, only single year analysis are supported (i.e. provide int to +# :attr:`start` to specify the year and leave :attr:`stop` empty). +# gridded_reader_id : dict +# BETA: dictionary specifying which gridded reader is supposed to be used +# for model (and gridded obs) reading. 
Note: this is a workaround +# solution and will likely be removed in the future when the gridded +# reading API is more harmonised +# (see https://github.com/metno/pyaerocom/issues/174). +# flex_ts_type : bool +# Bboolean specifying whether reading frequency of gridded data is +# allowed to be flexible. This includes all gridded data, whether it is +# model or gridded observation (e.g. satellites). Defaults to True. +# min_num_obs : dict or int, optional +# time resampling constraints applied, defaults to None, in which case +# no constraints are applied. For instance, say your input is in daily +# resolution and you want output in monthly and you want to make sure to +# have roughly 50% daily coverage for the monthly averages. Then you may +# specify `min_num_obs=15` which will ensure that at least 15 daily +# averages are available to compute a monthly average. However, you may +# also define a hierarchical scheme that first goes from daily to +# weekly and then from weekly to monthly, via a dict. E.g. +# `min_num_obs=dict(monthly=dict(weekly=4), weekly=dict(daily=3))` would +# ensure that each week has at least 3 daily values, as well as that each +# month has at least 4 weekly values. +# resample_how : str or dict, optional +# string specifying how data should be aggregated when resampling in time. +# Default is "mean". Can also be a nested dictionary, e.g. +# `resample_how={'conco3': 'daily': {'hourly' : 'max'}}` would use the +# maximum value to aggregate from hourly to daily for variable conco3, +# rather than the mean. +# obs_remove_outliers : bool +# if True, outliers are removed from obs data before colocation, +# else not. Default is False. +# Custom outlier ranges for each variable can be specified via +# :attr:`obs_outlier_ranges`, and for all other variables, the pyaerocom +# default outlier ranges are used. 
The latter are specified in +# `variables.ini` file via `minimum` and `maximum` attributes and can +# also be accessed through :attr:`pyaerocom.variable.Variable.minimum` +# and :attr:`pyaerocom.variable.Variable.maximum`, respectively. +# model_remove_outliers : bool +# if True, outliers are removed from model data (normally this should be +# set to False, as the models are supposed to be assessed, including +# outlier cases). Default is False. +# Custom outlier ranges for each variable can be specified via +# :attr:`model_outlier_ranges`, and for all other variables, the pyaerocom +# default outlier ranges are used. The latter are specified in +# `variables.ini` file via `minimum` and `maximum` attributes and can +# also be accessed through :attr:`pyaerocom.variable.Variable.minimum` +# and :attr:`pyaerocom.variable.Variable.maximum`, respectively. +# obs_outlier_ranges : dict, optional +# dictionary specifying outlier ranges for individual obs variables. +# (e.g. dict(od550aer = [-0.05, 10], ang4487aer=[0,4])). Only relevant +# if :attr:`obs_remove_outliers` is True. +# model_outlier_ranges : dict, optional +# like :attr:`obs_outlier_ranges` but for model variables. Only relevant +# if :attr:`model_remove_outliers` is True. +# zeros_to_nan : bool +# If True, zero's in output co-located data object will be converted to +# NaN. Default is False. +# harmonise_units : bool +# if True, units are attempted to be harmonised during co-location +# (note: raises Exception if True and in case units cannot be harmonised). +# regrid_res_deg : int, optional +# resolution in degrees for regridding of model grid (done before +# co-location). Default is None. +# colocate_time : bool +# if True and if obs and model sampling frequency (e.g. daily) are higher +# than output colocation frequency (e.g. monthly), then the datasets are +# first colocated in time (e.g. on a daily basis), before the monthly +# averages are calculated. Default is False. 
+# reanalyse_existing : bool +# if True, always redo co-location, even if there is already an existing +# co-located NetCDF file (under the output location specified by +# :attr:`basedir_coldata` ) for the given variable combination to be +# co-located. If False and output already exists, then co-location is +# skipped for the associated variable. Default is True. +# raise_exceptions : bool +# if True, Exceptions that may occur for individual variables to be +# processed, are raised, else the analysis is skipped for such cases. +# keep_data : bool +# if True, then all colocated data objects computed when running +# :func:`run` will be stored in :attr:`data`. Defaults to True. +# add_meta : dict +# additional metadata that is supposed to be added to each output +# :class:`ColocatedData` object. +# """ + +# #: Dictionary specifying alternative vertical types that may be used to +# #: read model data. E.g. consider the variable is ec550aer, +# #: obs_vert_type='Surface' and obs_vert_type_alt=dict(Surface='ModelLevel'). +# #: Now, if a model that is used for the analysis does not contain a data +# #: file for ec550aer at the surface ('*ec550aer*Surface*.nc'), then, the +# #: colocation routine will look for '*ec550aer*ModelLevel*.nc' and if this +# #: exists, it will load it and extract the surface level. 
+# OBS_VERT_TYPES_ALT = {"Surface": "ModelLevel", "2D": "2D"} + +# #: do not raise Exception if invalid item is attempted to be assigned +# #: (Overwritten from base class) +# CRASH_ON_INVALID = False + +# FORBIDDEN_KEYS = [ +# "var_outlier_ranges", # deprecated since v0.12.0 +# "var_ref_outlier_ranges", # deprecated since v0.12.0 +# "remove_outliers", # deprecated since v0.12.0 +# ] + +# ts_type = StrWithDefault("monthly") +# obs_vars = ListOfStrings() + +# def __init__( +# self, +# model_id=None, +# obs_config: Optional[PyaroConfig] = None, +# obs_id=None, +# obs_vars=None, +# ts_type=None, +# start=None, +# stop=None, +# basedir_coldata=None, +# save_coldata=False, +# **kwargs, +# ): +# self.model_id = model_id +# self._obs_id = None +# self._obs_config = None + +# self.obs_id = obs_id +# self.obs_config = obs_config + +# self.obs_vars = obs_vars + +# self.ts_type = ts_type +# self.start = start +# self.stop = stop + +# # crashes if input filter name is invalid +# self.filter_name = f"{ALL_REGION_NAME}-wMOUNTAINS" + +# if basedir_coldata is not None: +# basedir_coldata = self._check_input_basedir_coldata(basedir_coldata) +# else: +# basedir_coldata = const.COLOCATEDDATADIR +# self.basedir_coldata = basedir_coldata +# self.save_coldata = save_coldata + +# # END OF ASSIGNMENT OF MOST COMMON PARAMETERS - BELOW ARE FURTHER +# # CONFIG ATTRIBUTES, THAT ARE OPTIONAL AND LESS FREQUENTLY USED + +# # Options related to obs reading and processing +# self.obs_name = None +# self.obs_data_dir = None + +# self.obs_use_climatology = False + +# self._obs_cache_only = False # only relevant if obs is ungridded +# self.obs_vert_type = None +# self.obs_ts_type_read = None +# self.obs_filters = {} +# self._obs_is_vertical_profile = False +# self.colocation_layer_limits = None +# self.profile_layer_limits = None + +# self.read_opts_ungridded = {} + +# # Attributes related to model data +# self.model_name = None +# self.model_data_dir = None + +# self.model_read_opts = {} + +# 
self.model_use_vars = {} +# self.model_rename_vars = {} +# self.model_add_vars = {} +# self.model_to_stp = False + +# self.model_ts_type_read = None +# self.model_read_aux = {} +# self.model_use_climatology = False + +# self.model_kwargs = {} + +# self.gridded_reader_id = {"model": "ReadGridded", "obs": "ReadGridded"} + +# self.flex_ts_type = True + +# # Options related to time resampling +# self.min_num_obs = None +# self.resample_how = "mean" + +# # Options related to outlier removal +# self.obs_remove_outliers = False +# self.model_remove_outliers = False + +# # Custom outlier ranges for model and obs +# self.obs_outlier_ranges = {} +# self.model_outlier_ranges = {} + +# self.zeros_to_nan = False +# self.harmonise_units = False +# self.regrid_res_deg = None +# self.colocate_time = False + +# self.reanalyse_existing = True +# self.raise_exceptions = False +# self.keep_data = True + +# self.add_meta = {} +# self.update(**kwargs) + +# def _check_input_basedir_coldata(self, basedir_coldata): +# """ +# Make sure input basedir_coldata is str and exists + +# Parameters +# ---------- +# basedir_coldata : str or Path +# basic output directory for colocated data + +# Raises +# ------ +# ValueError +# If input is invalid. + +# Returns +# ------- +# str +# valid output directory + +# """ +# if isinstance(basedir_coldata, Path): +# basedir_coldata = str(basedir_coldata) +# if isinstance(basedir_coldata, str): +# if not os.path.exists(basedir_coldata): +# os.mkdir(basedir_coldata) +# return basedir_coldata +# raise ValueError(f"Invalid input for basedir_coldata: {basedir_coldata}") + +# def _check_basedir_coldata(self): +# """ +# Make sure output directory for colocated data files exists + +# Raises +# ------ +# FileNotFoundError +# If :attr:`basedir_coldata` does not exist and cannot be created. 
+ +# Returns +# ------- +# str +# current value of :attr:`basedir_coldata` + +# """ +# basedir_coldata = self.basedir_coldata +# if basedir_coldata is None: +# basedir_coldata = const.COLOCATEDDATADIR +# if not os.path.exists(basedir_coldata): +# logger.info(f"Creating directory: {basedir_coldata}") +# os.mkdir(basedir_coldata) +# elif isinstance(basedir_coldata, Path): +# basedir_coldata = str(basedir_coldata) +# if isinstance(basedir_coldata, str) and not os.path.exists(basedir_coldata): +# os.mkdir(basedir_coldata) +# if not os.path.exists(basedir_coldata): +# raise FileNotFoundError( +# f"Output directory for colocated data files {basedir_coldata} does not exist" +# ) +# self.basedir_coldata = basedir_coldata +# return basedir_coldata + +# @property +# def basedir_logfiles(self): +# """Base directory for storing logfiles""" +# p = chk_make_subdir(self.basedir_coldata, "logfiles") +# return p + +# @property +# def obs_id(self) -> str: +# return self._obs_id + +# @obs_id.setter +# def obs_id(self, val: Optional[str]) -> None: +# if self.obs_config is not None and val != self.obs_config.name: +# logger.info( +# f"Data ID in Pyaro config {self.obs_config.name} does not match obs_id {val}. Setting Pyaro config to None!" +# ) +# self.obs_config = None + +# self._obs_id = val + +# @property +# def obs_config(self) -> PyaroConfig: +# return self._obs_config + +# @obs_config.setter +# def obs_config(self, val: Optional[PyaroConfig]) -> None: +# if val is not None: +# if isinstance(val, dict): +# logger.info(f"Obs config was given as dict. Will try to convert to PyaroConfig") +# val = PyaroConfig(**val) +# if self.obs_id is not None and val.name != self.obs_id: +# logger.info( +# f"Data ID in Pyaro config {val.name} does not match obs_id {self.obs_id}. Setting Obs ID to match Pyaro Config!" 
+# ) +# self.obs_id = val.name +# if self.obs_id is None: +# self.obs_id = val.name +# self._obs_config = val + +# def add_glob_meta(self, **kwargs): +# """ +# Add global metadata to :attr:`add_meta` + +# Parameters +# ---------- +# kwargs +# metadata to be added + +# Returns +# ------- +# None + +# """ +# self.add_meta.update(**kwargs) + +# def __setitem__(self, key, val): +# if key == "basedir_coldata": +# val = self._check_input_basedir_coldata(val) +# super().__setitem__(key, val) + +# def _period_from_start_stop(self) -> str: +# start, stop = start_stop(self.start, self.stop, stop_sub_sec=False) +# y0, y1 = start.year, stop.year +# assert y0 <= y1 +# if y0 == y1: +# return str(y0) +# else: +# return f"{y0}-{y1}" class Colocator(ColocationSetup): @@ -552,13 +553,13 @@ class Colocator(ColocationSetup): as such. For setup attributes, please see base class. """ - SUPPORTED_GRIDDED_READERS = { + SUPPORTED_GRIDDED_READERS: dict = { "ReadGridded": ReadGridded, "ReadMscwCtm": ReadMscwCtm, "ReadCAMS2_83": ReadCAMS2_83, } - STATUS_CODES = { + STATUS_CODES: dict[int, str] = { 1: "SUCCESS", 2: "NOT OK: Missing/invalid model variable", 3: "NOT OK: Missing/invalid obs variable", diff --git a/pyaerocom/colocation_setup.py b/pyaerocom/colocation_setup.py index a178a64eb..6cdad5837 100644 --- a/pyaerocom/colocation_setup.py +++ b/pyaerocom/colocation_setup.py @@ -1,5 +1,6 @@ import logging import os +import sys from functools import cached_property from pathlib import Path from typing import Iterable, Literal @@ -22,6 +23,12 @@ logger = logging.getLogger(__name__) +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self + + class ColocationSetup(BaseModel): """ Setup class for high-level model / obs co-location. 
@@ -298,7 +305,7 @@ class ColocationSetup(BaseModel): ts_type: str # = None start: pd.Timestamp | int | None # = None stop: pd.Timestamp | int | None # = None - obs_config: PyaroConfig | None # = None + obs_config: PyaroConfig | None = None ############################### # Attributes with defaults @@ -358,7 +365,7 @@ def validate_basedirs(cls, v): # Attributes related to model data model_name: str | None = None - model_data_dir: Path | str = None + model_data_dir: Path | str | None = None model_read_opts: dict | None = {} @@ -456,14 +463,15 @@ def basedir_logfiles(self): # LB: Think we need a validator on the PyaroConfig, not the obs_id. # Combining the validation logic from those two things here. needs testing. # LB: this needs serious work - @field_validator("obs_config") + # @field_validator("obs_config") + @model_validator(mode="after") @classmethod def validate_obs_config(cls, v: PyaroConfig): - if cls.obs_config is not None and cls.obs.config.name != cls.obs_id: + if v is not None and cls.obs.config.name != cls.obs_id: logger.info( - f"Data ID in Pyaro config {cls.obs_config.name} does not match obs_id {cls.obs_id}. Setting Pyaro config to None!" + f"Data ID in Pyaro config {v.name} does not match obs_id {cls.obs_id}. Setting Pyaro config to None!" ) - cls.obs_config = None + v = None if v is not None: if isinstance(v, dict): logger.info("Obs config was given as dict. 
Will try to convert to PyaroConfig") @@ -501,3 +509,13 @@ def _period_from_start_stop(self) -> str: return str(y0) else: return f"{y0}-{y1}" + + def update(self, data: dict) -> Self: + # provide an update() method analogous to MutableMapping's one + update = self.model_dump() + update.update(data) + self.model_validate(update) + for k, v in self.model_dump(exclude_defaults=True).items(): + logger.debug(f"updating value of '{k}' from '{getattr(self, k, None)}' to '{v}'") + setattr(self, k, v) + return self diff --git a/tests/test_colocation_auto.py b/tests/test_colocation_auto.py index 5e362584b..771643eb2 100644 --- a/tests/test_colocation_auto.py +++ b/tests/test_colocation_auto.py @@ -274,7 +274,7 @@ def test_Colocator_run_gridded_ungridded( tm5_aero_stp, update, chk_mvar, chk_ovar, sh, mean_obs, mean_mod ): stp = ColocationSetup(**tm5_aero_stp) - stp.update(**update) + stp.update(update) result = Colocator(**stp).run() assert isinstance(result, dict) @@ -308,7 +308,7 @@ def test_Colocator_run_gridded_ungridded( ) def test_Colocator_run_gridded_ungridded_error(tm5_aero_stp, update, error): stp = ColocationSetup(**tm5_aero_stp) - stp.update(**update) + stp.update(update) with pytest.raises(ColocationSetupError) as e: Colocator(**stp).run() assert str(e.value).startswith(error) From de9dea9ffa788e80e6c00aedab66e7774008c7e0 Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Thu, 23 May 2024 10:55:57 +0200 Subject: [PATCH 16/44] Pydantic ColocationSetup in EvalSetup --- pyaerocom/aeroval/setupclasses.py | 45 ++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/pyaerocom/aeroval/setupclasses.py b/pyaerocom/aeroval/setupclasses.py index 1828ac8e0..2b1dbc891 100644 --- a/pyaerocom/aeroval/setupclasses.py +++ b/pyaerocom/aeroval/setupclasses.py @@ -33,7 +33,9 @@ check_if_year, ) from pyaerocom.aeroval.json_utils import read_json, set_float_serialization_precision, write_json -from pyaerocom.colocation_auto import ColocationSetup + 
+# from pyaerocom.colocation_auto import ColocationSetup # Old +from pyaerocom.colocation_setup import ColocationSetup # New logger = logging.getLogger(__name__) @@ -422,13 +424,6 @@ def statistics_opts(self) -> StatisticsSetup: } return StatisticsSetup(**model_args) - ################################## - ## Non-BaseModel-based attributes - ################################## - - # These attributes require special attention b/c they're not based on Pydantic's BaseModel class. - - # TODO: Use Pydantic for ColocationSetup @computed_field @cached_property def colocation_opts(self) -> ColocationSetup: @@ -438,7 +433,7 @@ def colocation_opts(self) -> ColocationSetup: model_args = { key: val for key, val in self.model_extra.items() - if key in ColocationSetup().__dict__.keys() + if key in ColocationSetup.model_fields } # need to pass some default values to the ColocationSetup if not provided in config default_dict = {"save_coldata": True, "keep_data": False, "resample_how": "mean"} @@ -448,9 +443,35 @@ def colocation_opts(self) -> ColocationSetup: return ColocationSetup(**model_args) - @field_serializer("colocation_opts") - def serialize_colocation_opts(self, colocation_opts: ColocationSetup): - return colocation_opts.json_repr() + ################################## + ## Non-BaseModel-based attributes + ################################## + + # These attributes require special attention b/c they're not based on Pydantic's BaseModel class. 
+ + # TODO: Use Pydantic for ColocationSetup + # @computed_field + # @cached_property + # def colocation_opts(self) -> ColocationSetup: + # if not hasattr(self, "model_extra") or self.model_extra is None: + # return ColocationSetup(save_coldata=True, keep_data=False, resample_how="mean") + + # model_args = { + # key: val + # for key, val in self.model_extra.items() + # if key in ColocationSetup().__dict__.keys() + # } + # # need to pass some default values to the ColocationSetup if not provided in config + # default_dict = {"save_coldata": True, "keep_data": False, "resample_how": "mean"} + # for key in default_dict: + # if key not in model_args: + # model_args[key] = default_dict[key] + + # return ColocationSetup(**model_args) + + # @field_serializer("colocation_opts") + # def serialize_colocation_opts(self, colocation_opts: ColocationSetup): + # return colocation_opts.json_repr() # ObsCollection and ModelCollection # TODO Use Pydantic for ObsCollection and ModelCollection From 058255a857235850c4d86d0935155d6fa5071ccb Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Thu, 23 May 2024 16:43:34 +0200 Subject: [PATCH 17/44] WIP --- pyaerocom/colocation_auto.py | 74 +++++++++++++++++++++++++---------- pyaerocom/colocation_setup.py | 17 ++++++-- tests/test_colocation_auto.py | 10 +++-- 3 files changed, 73 insertions(+), 28 deletions(-) diff --git a/pyaerocom/colocation_auto.py b/pyaerocom/colocation_auto.py index 06ee3fea6..7badd5df9 100644 --- a/pyaerocom/colocation_auto.py +++ b/pyaerocom/colocation_auto.py @@ -8,7 +8,7 @@ import traceback from datetime import datetime from pathlib import Path -from typing import Optional +from typing import Any, Callable, Optional import pandas as pd from cf_units import Unit @@ -22,9 +22,7 @@ correct_model_stp_coldata, ) from pyaerocom.colocation_3d import ColocatedDataLists, colocate_vertical_profile_gridded -from pyaerocom.colocation_setup import ColocationSetup - -# from pyaerocom.colocation_setup import ColocationSetup +from 
pyaerocom.colocation_setup import ColocationSetup # New from pyaerocom.config import ALL_REGION_NAME from pyaerocom.exceptions import ColocationError, ColocationSetupError, DataCoverageError from pyaerocom.helpers import ( @@ -567,19 +565,53 @@ class Colocator(ColocationSetup): 5: "NOT OK: Colocation failed", } - def __init__(self, **kwargs): - super().__init__(**kwargs) - - self._log = None - self.logging = True - self._loaded_model_data = {} - self.data = {} - - self._processing_status = [] - self.files_written = [] - - self._model_reader = None - self._obs_reader = None + # def __init__(self, **kwargs): + # super().__init__(**kwargs) + + # self._log = None + # self.logging = True + # self._loaded_model_data = {} + # self.data = {} + + # self._processing_status = [] + # self.files_written = [] + + # self._model_reader = None + # self._obs_reader = None + + # Logging in this class needs serious work + _log: Callable | None = None + logging: bool = True + _loaded_model_data: dict | None = {} + data: dict = {} # think about this typing + _processing_status: list[str] = [] + files_written: list[str] = [] + _model_reader: ReadGridded | ReadMscwCtm | ReadCAMS2_83 | None = None + _obs_reader: Any | None = None # LB: Should be improved + + def __init__( + self, + _log: Callable | None = None, + logging: bool = True, + _loaded_model_data: dict | None = {}, + data: dict = {}, + _processing_status: list[str] = [], + files_written: list[str] = [], + _model_reader: ReadGridded | ReadMscwCtm | ReadCAMS2_83 | None = None, + _obs_reader: Any | None = None, + **kwargs, + ) -> None: + super(Colocator, self).__init__( + _log=_log, + logging=logging, + _loaded_model_data=_loaded_model_data, + data=data, + _processing_status=_processing_status, + files_written=files_written, + _model_reader=_model_reader, + _obs_reader=_obs_reader, + **kwargs, + ) @property def model_vars(self): @@ -855,7 +887,7 @@ def run(self, var_list: list = None, **opts): dictionaries comprising key / value 
pairs of obs variables and associated instances of :class:`ColocatedData`. """ - self.update(**opts) + self.update(opts) data_out = {} # ToDo: see if the following could be solved via custom context manager try: @@ -982,7 +1014,7 @@ def _read_ungridded(self, var_name): obs_data = obs_reader.read( data_ids=[self.obs_id], vars_to_retrieve=var_name, - only_cached=self._obs_cache_only, + only_cached=self.obs_cache_only, filter_post=obs_filters_post, **self.read_opts_ungridded, ) @@ -1317,7 +1349,7 @@ def _check_remove_outliers_gridded(self, data, var_name, is_model): return data def _eval_obs_filters(self, var_name): - obs_filters = self["obs_filters"] + obs_filters = self.obs_filters if var_name in obs_filters: obs_filters = obs_filters[var_name] remaining = {} @@ -1593,7 +1625,7 @@ def _init_log(self): log.write(f"Timestamp: {datetimestr}\n\n") log.write("Analysis configuration\n") ignore = ["_log", "logging", "data", "_model_reader", "_obs_reader"] - for key, val in self.items(): + for key, val in self.model_dump().items(): if key in ignore: continue log.write(f"{key}: {val}\n") diff --git a/pyaerocom/colocation_setup.py b/pyaerocom/colocation_setup.py index 6cdad5837..6551d3340 100644 --- a/pyaerocom/colocation_setup.py +++ b/pyaerocom/colocation_setup.py @@ -283,7 +283,12 @@ class ColocationSetup(BaseModel): ########################## # Pydantic ConfigDict ########################## - model_config = ConfigDict(arbitrary_types_allowed=True, allow="extra", protected_namespaces=()) + model_config = ConfigDict( + arbitrary_types_allowed=True, + allow="extra", + protected_namespaces=(), + validate_assignment=True, + ) # @model_validator('*', mode="before") # def convert_to_none(cls, v): @@ -379,6 +384,8 @@ def validate_basedirs(cls, v): model_read_aux: dict[str, dict[Literal["vars_required", "fun"], list[str]]] | None = {} model_use_climatology: bool = False + model_kwargs: dict = {} + # LB: check this as well gridded_reader_id: dict[str, str] = {"model": 
"ReadGridded", "obs": "ReadGridded"} @@ -435,7 +442,7 @@ def __init__( ) # Model validator for forbidden keys - @model_validator(mode="after") + @model_validator(mode="wrap") def validate_no_forbidden_keys(self): for key in self.FORBIDDEN_KEYS: if key in self.model_fields: # LB: Check this is where they will be found @@ -512,10 +519,14 @@ def _period_from_start_stop(self) -> str: def update(self, data: dict) -> Self: # provide an update() method analogous to MutableMapping's one + + # validate values in data update = self.model_dump() update.update(data) self.model_validate(update) - for k, v in self.model_dump(exclude_defaults=True).items(): + # for k, v in self.model_dump(exclude_defaults=False).items(): + # assign values from + for k, v in data.items(): logger.debug(f"updating value of '{k}' from '{getattr(self, k, None)}' to '{v}'") setattr(self, k, v) return self diff --git a/tests/test_colocation_auto.py b/tests/test_colocation_auto.py index 771643eb2..4d433e635 100644 --- a/tests/test_colocation_auto.py +++ b/tests/test_colocation_auto.py @@ -100,9 +100,11 @@ def test_colocation_setup(stp: ColocationSetup, should_be: dict): ], ) def test_ColocationSetup_invalid_input(key, val, raises): - with raises: - stp = ColocationSetup(**{key: val}) - assert stp[key] == val + # with raises: + # stp = ColocationSetup(**{key: val}) + # assert stp[key] == val + with pytest.raises(ValidationError): + ColocationSetup(**{key: val}) def test_Colocator__obs_vars__setter(col): @@ -276,7 +278,7 @@ def test_Colocator_run_gridded_ungridded( stp = ColocationSetup(**tm5_aero_stp) stp.update(update) - result = Colocator(**stp).run() + result = Colocator(**stp.model_dump()).run() assert isinstance(result, dict) coldata = result[chk_mvar][chk_ovar] From 1872cb812f2d6dd7badf669d7052379c666b2235 Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Thu, 23 May 2024 17:06:44 +0200 Subject: [PATCH 18/44] obs_var validator --- pyaerocom/colocation_setup.py | 9 ++++++++- 
tests/test_colocation_auto.py | 16 +++++++--------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/pyaerocom/colocation_setup.py b/pyaerocom/colocation_setup.py index 6551d3340..9e465acef 100644 --- a/pyaerocom/colocation_setup.py +++ b/pyaerocom/colocation_setup.py @@ -307,6 +307,13 @@ class ColocationSetup(BaseModel): model_id: str | None # = None obs_id: str | None # = None obs_vars: list[str] | str | None # = None + + @field_validator("obs_vars") + @classmethod + def validate_obs_vars(cls, v): + if isinstance(v, str): + return [v] + ts_type: str # = None start: pd.Timestamp | int | None # = None stop: pd.Timestamp | int | None # = None @@ -442,7 +449,7 @@ def __init__( ) # Model validator for forbidden keys - @model_validator(mode="wrap") + @model_validator(mode="after") def validate_no_forbidden_keys(self): for key in self.FORBIDDEN_KEYS: if key in self.model_fields: # LB: Check this is where they will be found diff --git a/tests/test_colocation_auto.py b/tests/test_colocation_auto.py index 4d433e635..32bb6f517 100644 --- a/tests/test_colocation_auto.py +++ b/tests/test_colocation_auto.py @@ -93,18 +93,16 @@ def test_colocation_setup(stp: ColocationSetup, should_be: dict): @pytest.mark.parametrize( "key,val,raises", [ - ("obs_vars", 42, ValidationError), - ("var_ref_outlier_ranges", [41, 42], ValidationError), - ("var_outlier_ranges", [41, 42], ValidationError), - ("remove_outliers", True, ValidationError), + ("obs_vars", 42, pytest.raises(ValidationError)), + ("var_ref_outlier_ranges", [41, 42], pytest.raises(KeyError)), + ("var_outlier_ranges", [41, 42], pytest.raises(KeyError)), + ("remove_outliers", True, pytest.raises(KeyError)), ], ) def test_ColocationSetup_invalid_input(key, val, raises): - # with raises: - # stp = ColocationSetup(**{key: val}) - # assert stp[key] == val - with pytest.raises(ValidationError): - ColocationSetup(**{key: val}) + with raises: + stp = ColocationSetup(**{key: val}) + assert stp.model_dump()[key] == val def 
test_Colocator__obs_vars__setter(col): From 1c84e32511806d91d012238b1bab7f35975b3796 Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Fri, 24 May 2024 16:23:58 +0200 Subject: [PATCH 19/44] _obs_is_vertical_profile --- pyaerocom/colocation_setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyaerocom/colocation_setup.py b/pyaerocom/colocation_setup.py index 9e465acef..87b5af0f3 100644 --- a/pyaerocom/colocation_setup.py +++ b/pyaerocom/colocation_setup.py @@ -370,7 +370,7 @@ def validate_basedirs(cls, v): obs_vert_type: str | None = None obs_ts_type_read: str | dict | None = None obs_filters: dict = {} - obs_is_vertical_profile: bool = False + _obs_is_vertical_profile: bool = False colocation_layer_limits: dict[str, float] | None = None profile_layer_limits: dict | None = None read_opts_ungridded: dict | None = {} From 50119c995f06faf3ad6c28b06d375fa01481ed81 Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Fri, 24 May 2024 16:58:34 +0200 Subject: [PATCH 20/44] WIP --- pyaerocom/colocation_auto.py | 11 +++++++++++ pyaerocom/colocation_setup.py | 3 ++- tests/test_colocation_auto.py | 5 +++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/pyaerocom/colocation_auto.py b/pyaerocom/colocation_auto.py index 7badd5df9..6bfc1794f 100644 --- a/pyaerocom/colocation_auto.py +++ b/pyaerocom/colocation_auto.py @@ -12,6 +12,7 @@ import pandas as pd from cf_units import Unit +from pydantic import ConfigDict from pyaerocom import const from pyaerocom._lowlevel_helpers import BrowseDict, ListOfStrings, StrWithDefault, chk_make_subdir @@ -551,6 +552,16 @@ class Colocator(ColocationSetup): as such. For setup attributes, please see base class. 
""" + # ########################## + # # Pydantic ConfigDict + # ########################## + # model_config = ConfigDict( + # arbitrary_types_allowed=True, + # allow="extra", + # protected_namespaces=(), + # validate_assignment=True, + # ) + SUPPORTED_GRIDDED_READERS: dict = { "ReadGridded": ReadGridded, "ReadMscwCtm": ReadMscwCtm, diff --git a/pyaerocom/colocation_setup.py b/pyaerocom/colocation_setup.py index 87b5af0f3..760645857 100644 --- a/pyaerocom/colocation_setup.py +++ b/pyaerocom/colocation_setup.py @@ -313,6 +313,7 @@ class ColocationSetup(BaseModel): def validate_obs_vars(cls, v): if isinstance(v, str): return [v] + return v ts_type: str # = None start: pd.Timestamp | int | None # = None @@ -345,7 +346,7 @@ def validate_obs_vars(cls, v): # crashes if input filter name is invalid filter_name: str = f"{ALL_REGION_NAME}-wMOUNTAINS" - basedir_coldata: str # = Field(default=const.COLOCATEDDATADIR, validate_default=True) + basedir_coldata: str | Path = Field(default=const.COLOCATEDDATADIR, validate_default=True) @field_validator("basedir_coldata") @classmethod diff --git a/tests/test_colocation_auto.py b/tests/test_colocation_auto.py index 32bb6f517..0065ed233 100644 --- a/tests/test_colocation_auto.py +++ b/tests/test_colocation_auto.py @@ -110,6 +110,7 @@ def test_Colocator__obs_vars__setter(col): assert col.obs_vars == ["var"] +# LB: Not sure if Colocator should be allowed to accept objects def test_Colocator__add_attr(col): col.bla = "blub" col["blub"] = 42 @@ -191,6 +192,9 @@ def test_Colocator__coldata_savename(): assert savename == n +# LB: Not clear if this is intended functionality or what or we can remove. 
+# Currently set up to revalidate the basedir_coldata everytime +# validation creates this directory def test_Colocator_basedir_coldata(tmp_path: Path): base_path = tmp_path / "test" col = Colocator(raise_exceptions=True) @@ -372,6 +376,7 @@ def test_colocator__find_var_matches_model_add_vars(): assert var_matches == {"abs550aer": ovar, ovar: ovar} +# LB: This test breaks the way I want this class to work because it implies allowing adding of attributes. def test_colocator_instantiate_gridded_reader(path_emep): col = Colocator(gridded_reader_id={"model": "ReadMscwCtm", "obs": "ReadGridded"}) col.filepath = path_emep["daily"] From fc1ba8593fb5165f0f313a97389c4ebd20e89fd8 Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Mon, 27 May 2024 18:15:08 +0200 Subject: [PATCH 21/44] spearting functionality from data containers WIP --- pyaerocom/colocation_auto.py | 337 ++++++++++++++++++---------------- pyaerocom/colocation_setup.py | 27 ++- tests/test_colocation_auto.py | 164 ++++++++++------- 3 files changed, 291 insertions(+), 237 deletions(-) diff --git a/pyaerocom/colocation_auto.py b/pyaerocom/colocation_auto.py index 6bfc1794f..ed510ec0f 100644 --- a/pyaerocom/colocation_auto.py +++ b/pyaerocom/colocation_auto.py @@ -543,7 +543,7 @@ # return f"{y0}-{y1}" -class Colocator(ColocationSetup): +class Colocator: """High level class for running co-location Note @@ -576,53 +576,56 @@ class Colocator(ColocationSetup): 5: "NOT OK: Colocation failed", } - # def __init__(self, **kwargs): - # super().__init__(**kwargs) - - # self._log = None - # self.logging = True - # self._loaded_model_data = {} - # self.data = {} - - # self._processing_status = [] - # self.files_written = [] - - # self._model_reader = None - # self._obs_reader = None - - # Logging in this class needs serious work - _log: Callable | None = None - logging: bool = True - _loaded_model_data: dict | None = {} - data: dict = {} # think about this typing - _processing_status: list[str] = [] - files_written: list[str] = 
[] - _model_reader: ReadGridded | ReadMscwCtm | ReadCAMS2_83 | None = None - _obs_reader: Any | None = None # LB: Should be improved - - def __init__( - self, - _log: Callable | None = None, - logging: bool = True, - _loaded_model_data: dict | None = {}, - data: dict = {}, - _processing_status: list[str] = [], - files_written: list[str] = [], - _model_reader: ReadGridded | ReadMscwCtm | ReadCAMS2_83 | None = None, - _obs_reader: Any | None = None, - **kwargs, - ) -> None: - super(Colocator, self).__init__( - _log=_log, - logging=logging, - _loaded_model_data=_loaded_model_data, - data=data, - _processing_status=_processing_status, - files_written=files_written, - _model_reader=_model_reader, - _obs_reader=_obs_reader, - **kwargs, - ) + def __init__(self, colocation_setup: ColocationSetup, **kwargs): + if not colocation_setup: + raise ValueError("An instance ColocationSetup must be provided to Colocator.") + + self.colocation_setup = colocation_setup + self._log: Callable | None = None + self.logging: bool = True + self._loaded_model_data: dict | None = {} + self.data: dict = {} + + self._processing_status: list[str] = [] + self.files_written: list[str] = [] + + self._model_reader: ReadGridded | ReadMscwCtm | ReadCAMS2_83 | None = None + self._obs_reader: Any | None = None + self.obs_filters: dict = colocation_setup.obs_filters.copy() + + # # Logging in this class needs serious work + # _log: Callable | None = None + # logging: bool = True + # _loaded_model_data: dict | None = {} + # data: dict = {} # think about this typing + # _processing_status: list[str] = [] + # files_written: list[str] = [] + # _model_reader: ReadGridded | ReadMscwCtm | ReadCAMS2_83 | None = None + # _obs_reader: Any | None = None # LB: Should be improved + + # def __init__( + # self, + # _log: Callable | None = None, + # logging: bool = True, + # _loaded_model_data: dict | None = {}, + # data: dict = {}, + # _processing_status: list[str] = [], + # files_written: list[str] = [], + # 
_model_reader: ReadGridded | ReadMscwCtm | ReadCAMS2_83 | None = None, + # _obs_reader: Any | None = None, + # **kwargs, + # ) -> None: + # super(Colocator, self).__init__( + # _log=_log, + # logging=logging, + # _loaded_model_data=_loaded_model_data, + # data=data, + # _processing_status=_processing_status, + # files_written=files_written, + # _model_reader=_model_reader, + # _obs_reader=_obs_reader, + # **kwargs, + # ) @property def model_vars(self): @@ -647,8 +650,8 @@ def model_vars(self): else: model_vars.append(ovar) - for ovar, mvars in self.model_add_vars.items(): - if not ovar in ovars: + for ovar, mvars in self.colocation_setup.model_add_vars.items(): + if ovar not in ovars: logger.warning( f"Found entry in model_add_vars for obsvar {ovar} which " f"is not specified in attr obs_vars, and will thus be " @@ -662,17 +665,17 @@ def obs_is_ungridded(self): """ bool: True if obs_id refers to an ungridded observation, else False """ - if self.obs_config is not None: + if self.colocation_setup.obs_config is not None: return True - return True if self.obs_id in get_all_supported_ids_ungridded() else False + return True if self.colocation_setup.obs_id in get_all_supported_ids_ungridded() else False @property def obs_is_vertical_profile(self): """ bool: True if obs_id refers to a VerticalProfile, else False """ - return self._obs_is_vertical_profile + return self.colocation_setup._obs_is_vertical_profile @obs_is_vertical_profile.setter def obs_is_vertical_profile(self, value): @@ -684,11 +687,11 @@ def model_reader(self): Model data reader """ if self._model_reader is not None: - if self._model_reader.data_id == self.model_id: + if self._model_reader.data_id == self.colocation_setup.model_id: return self._model_reader logger.info( f"Reloading outdated model reader. ID of current reader: " - f"{self._model_reader.data_id}. New ID: {self.model_id}" + f"{self._model_reader.data_id}. 
New ID: {self.colocation_setup.model_id}" ) self._model_reader = self._instantiate_gridded_reader(what="model") self._loaded_model_data = {} @@ -706,9 +709,9 @@ def _check_data_id_obs_reader(self): reader = self._obs_reader if reader is None: return False - elif self.obs_is_ungridded and self.obs_id in reader.data_ids: + elif self.obs_is_ungridded and self.colocation_setup.obs_id in reader.data_ids: return True - elif self.obs_id == reader.data_id: + elif self.colocation_setup.obs_id == reader.data_id: return True @property @@ -720,10 +723,10 @@ def obs_reader(self): if not self._check_data_id_obs_reader(): if self.obs_is_ungridded: self._obs_reader = ReadUngridded( - data_ids=[self.obs_id], - data_dirs=self.obs_data_dir, + data_ids=[self.colocation_setup.obs_id], + data_dirs=self.colocation_setup.obs_data_dir, configs=[ - self.obs_config, + self.colocation_setup.obs_config, ], ) else: @@ -771,11 +774,13 @@ def get_model_name(self): preferably :attr:`model_name`, else :attr:`model_id` """ - if self.model_name is None: - if self.model_id is None: - raise AttributeError("Neither model_name nor model_id are set") - return self.model_id - return self.model_name + if self.colocation_setup.model_name is None: + if self.colocation_setup.model_id is None: + raise AttributeError( + "Neither model_name nor model_id are set. These must be set in ColocationSetup." 
+ ) + return self.colocation_setup.model_id + return self.colocation_setup.model_name def get_obs_name(self): """ @@ -798,16 +803,16 @@ def get_obs_name(self): preferably :attr:`obs_name`, else :attr:`obs_id` """ - if self.obs_name is None: - if self.obs_id is None: + if self.colocation_setup.obs_name is None: + if self.colocation_setup.obs_id is None: raise AttributeError("Neither obs_name nor obs_id are set") - return self.obs_id - return self.obs_name + return self.colocation_setup.obs_id + return self.colocation_setup.obs_name def get_model_data(self, model_var): if model_var in self._loaded_model_data: mdata = self._loaded_model_data[model_var] - if mdata.data_id == self.model_id: + if mdata.data_id == self.colocation_setup.model_id: return mdata self._check_add_model_read_aux(model_var) mdata = self._read_gridded(var_name=model_var, is_model=True) @@ -858,9 +863,10 @@ def prepare_run(self, var_list: list = None) -> dict: logger.warning("Deactivating logging in Colocator") self.logging = False - if isinstance(self.obs_vars, str): - self.obs_vars = [self.obs_vars] - elif not isinstance(self.obs_vars, list): + # LB: SHould be covered by ColocationSetup validator + # if isinstance(self.colocation_setup.obs_vars, str): + # self.colocation_setup.obs_vars = [self.colocation_setup.obs_vars] + if not isinstance(self.colocation_setup.obs_vars, list): raise AttributeError("obs_vars not defined or invalid, need list with strings...") self._check_obs_vars_available() self._check_obs_filters() @@ -873,11 +879,11 @@ def prepare_run(self, var_list: list = None) -> dict: (vars_to_process, ts_types) = self._check_load_model_data(vars_to_process) - if self.save_coldata and not self.reanalyse_existing: + if self.colocation_setup.save_coldata and not self.colocation_setup.reanalyse_existing: vars_to_process = self._filter_var_matches_files_not_exist(vars_to_process, ts_types) return vars_to_process - def run(self, var_list: list = None, **opts): + def run(self, var_list: list = 
None): """Perform colocation for current setup See also :func:`prepare_run`. @@ -898,14 +904,15 @@ def run(self, var_list: list = None, **opts): dictionaries comprising key / value pairs of obs variables and associated instances of :class:`ColocatedData`. """ - self.update(opts) + # LB: Do not allow changing the ColocationSetup after declaration. + # self.update(opts) data_out = {} # ToDo: see if the following could be solved via custom context manager try: vars_to_process = self.prepare_run(var_list) except Exception as ex: logger.exception(ex) - if self.raise_exceptions: + if self.colocation_setup.raise_exceptions: self._print_processing_status() self._write_log(f"ABORTED: raise_exceptions is True: {traceback.format_exc()}\n") self._close_log() @@ -917,7 +924,7 @@ def run(self, var_list: list = None, **opts): coldata = self._run_helper( mod_var, obs_var ) # note this can be ColocatedData or ColocatedDataLists - if not mod_var in data_out: + if mod_var not in data_out: data_out[mod_var] = {} data_out[mod_var][obs_var] = coldata self._processing_status.append([mod_var, obs_var, 1]) @@ -926,7 +933,7 @@ def run(self, var_list: list = None, **opts): logger.warning(msg) self._processing_status.append([mod_var, obs_var, 5]) self._write_log(msg) - if self.raise_exceptions: + if self.colocation_setup.raise_exceptions: self._print_processing_status() self._write_log("ABORTED: raise_exceptions is True\n") self._close_log() @@ -934,7 +941,7 @@ def run(self, var_list: list = None, **opts): self._write_log("Colocation finished") self._close_log() self._print_processing_status() - if self.keep_data: + if self.colocation_setup.keep_data: self.data = data_out return data_out @@ -1023,15 +1030,15 @@ def _read_ungridded(self, var_name): obs_filters_post = self._eval_obs_filters(var_name) obs_data = obs_reader.read( - data_ids=[self.obs_id], + data_ids=[self.colocation_setup.obs_id], vars_to_retrieve=var_name, - only_cached=self.obs_cache_only, + 
only_cached=self.colocation_setup.obs_cache_only, filter_post=obs_filters_post, - **self.read_opts_ungridded, + **self.colocation_setup.read_opts_ungridded, ) - if self.obs_remove_outliers: - oor = self.obs_outlier_ranges + if self.colocation_setup.obs_remove_outliers: + oor = self.colocation_setup.obs_outlier_ranges if var_name in oor: low, high = oor[var_name] else: @@ -1043,11 +1050,11 @@ def _read_ungridded(self, var_name): return obs_data def _check_obs_filters(self): - obs_vars = self.obs_vars - if any([x in self.obs_filters for x in obs_vars]): + obs_vars = self.colocation_setup.obs_vars + if any([x in self.colocation_setup.obs_filters for x in obs_vars]): # variable specific obs_filters for ovar in obs_vars: - if not ovar in self.obs_filters: + if ovar not in self.colocation_setup.obs_filters: self.obs_filters[ovar] = {} def _check_load_model_data(self, var_matches): @@ -1087,11 +1094,11 @@ def _check_load_model_data(self, var_matches): filtered[mvar] = ovar ts_types[mvar] = mdata.ts_type except Exception as e: - msg = f"Failed to load model data: {self.model_id} ({mvar}). Reason {e}" + msg = f"Failed to load model data: {self.colocation_setup.model_id} ({mvar}). 
Reason {e}" logger.warning(msg) self._write_log(msg + "\n") self._processing_status.append([mvar, ovar, 4]) - if self.raise_exceptions: + if self.colocation_setup.raise_exceptions: raise ColocationError(msg) return filtered, ts_types @@ -1106,7 +1113,7 @@ def _filter_var_matches_files_not_exist(self, var_matches, ts_types): return filtered def _check_model_add_vars(self): - for ovar, mvars in self.model_add_vars.items(): + for ovar, mvars in self.colocation_setup.model_add_vars.items(): if not isinstance(mvars, list): raise ValueError("Values of model_add_vars need to be list") elif not all([isinstance(x, str) for x in mvars]): @@ -1126,11 +1133,11 @@ def _instantiate_gridded_reader(self, what): Instance of reader class defined in self.SUPPORTED_GRIDDED_READERS """ if what == "model": - data_id = self.model_id - data_dir = self.model_data_dir + data_id = self.colocation_setup.model_id + data_dir = self.colocation_setup.model_data_dir else: - data_id = self.obs_id - data_dir = self.obs_data_dir + data_id = self.colocation_setup.obs_id + data_dir = self.colocation_setup.obs_data_dir reader_class = self._get_gridded_reader_class(what=what) reader = reader_class(data_id=data_id, data_dir=data_dir) return reader @@ -1138,17 +1145,17 @@ def _instantiate_gridded_reader(self, what): def _get_gridded_reader_class(self, what): """Returns the class of the reader for gridded data.""" try: - reader = self.SUPPORTED_GRIDDED_READERS[self.gridded_reader_id[what]] + reader = self.SUPPORTED_GRIDDED_READERS[self.colocation_setup.gridded_reader_id[what]] except KeyError as e: raise NotImplementedError( - f"Reader {self.gridded_reader_id[what]} is not supported: {e}" + f"Reader {self.colocation_setup.gridded_reader_id[what]} is not supported: {e}" ) return reader def _check_add_model_read_aux(self, model_var): - if not model_var in self.model_read_aux: + if model_var not in self.colocation_setup.model_read_aux: return False - info = self.model_read_aux[model_var] + info = 
self.colocation_setup.model_read_aux[model_var] if not isinstance(info, dict): raise ValueError( f"Invalid value for model_read_aux of variable {model_var}. " @@ -1166,30 +1173,34 @@ def _check_add_model_read_aux(self, model_var): return True def _check_obs_vars_available(self): - if self.obs_vars == []: + if self.colocation_setup.obs_vars == []: raise ColocationSetupError("no observation variables specified...") oreader = self.obs_reader if self.obs_is_ungridded: - avail = oreader.get_vars_supported(self.obs_id, self.obs_vars) + avail = oreader.get_vars_supported( + self.colocation_setup.obs_id, self.colocation_setup.obs_vars + ) else: avail = [] - for ovar in self.obs_vars: + for ovar in self.colocation_setup.obs_vars: if oreader.has_var(ovar): avail.append(ovar) - if len(self.obs_vars) > len(avail): - for ovar in self.obs_vars: + if len(self.colocation_setup.obs_vars) > len(avail): + for ovar in self.colocation_setup.obs_vars: if not ovar in avail: logger.warning( - f"Obs variable {ovar} is not available in {self.obs_id} " + f"Obs variable {ovar} is not available in {self.colocation_setup.obs_id} " f"and will be ignored" ) self._processing_status.append([None, ovar, 3]) - if self.raise_exceptions: - invalid = [var for var in self.obs_vars if not var in avail] + if self.colocation_setup.raise_exceptions: + invalid = [var for var in self.colocation_setup.obs_vars if not var in avail] invalid = "; ".join(invalid) - raise DataCoverageError(f"Invalid obs var(s) for {self.obs_id}: {invalid}") + raise DataCoverageError( + f"Invalid obs var(s) for {self.colocation_setup.obs_id}: {invalid}" + ) self.obs_vars = avail @@ -1216,9 +1227,9 @@ def _find_var_matches(self): var_matches = {} all_ok = True - muv = self.model_use_vars + muv = self.colocation_setup.model_use_vars modreader = self.model_reader - for ovar in self.obs_vars: + for ovar in self.colocation_setup.obs_vars: if ovar in muv: mvar = muv[ovar] else: @@ -1230,8 +1241,8 @@ def _find_var_matches(self): 
self._processing_status.append([mvar, ovar, 2]) all_ok = False - if ovar in self.model_add_vars: # observation variable - addvars = self.model_add_vars[ovar] + if ovar in self.colocation_setup.model_add_vars: # observation variable + addvars = self.colocation_setup.model_add_vars[ovar] for addvar in addvars: self._check_add_model_read_aux(addvar) if modreader.has_var(addvar): @@ -1240,7 +1251,7 @@ def _find_var_matches(self): self._processing_status.append([addvar, ovar, 2]) all_ok = False - if not all_ok and self.raise_exceptions: + if not all_ok and self.colocation_setup.raise_exceptions: raise DataCoverageError("Some model variables are not available") return var_matches @@ -1268,18 +1279,18 @@ def _get_ts_type_read(self, var_name, is_model): frequency to be read. """ - tst = self.ts_type # default - if is_model and self.model_ts_type_read is not None: - tst = self.model_ts_type_read + tst = self.colocation_setup.ts_type # default + if is_model and self.colocation_setup.model_ts_type_read is not None: + tst = self.colocation_setup.model_ts_type_read if tst == "": - tst = self.ts_type + tst = self.colocation_setup.ts_type elif not is_model and self.obs_ts_type_read is not None: tst = self.obs_ts_type_read if isinstance(tst, dict): if var_name in tst: tst = tst[var_name] else: - tst = self.ts_type + tst = self.colocation_setup.ts_type return tst def _read_gridded(self, var_name, is_model): @@ -1288,18 +1299,18 @@ def _read_gridded(self, var_name, is_model): kwargs = {} if is_model: reader = self.model_reader - vert_which = self.obs_vert_type + vert_which = self.colocation_setup.obs_vert_type - kwargs.update(**self.model_kwargs) - if self.model_use_climatology: + kwargs.update(**self.colocation_setup.model_kwargs) + if self.colocation_setup.model_use_climatology: # overwrite start and stop to read climatology file for model start, stop = 9999, None - if var_name in self.model_read_opts: - kwargs.update(self.model_read_opts[var_name]) + if var_name in 
self.colocation_setup.model_read_opts: + kwargs.update(self.colocation_setup.model_read_opts[var_name]) else: reader = self.obs_reader vert_which = None - ts_type_read = self.obs_ts_type_read + ts_type_read = self.colocation_setup.obs_ts_type_read kwargs.update(self._eval_obs_filters(var_name)) try: @@ -1309,7 +1320,7 @@ def _read_gridded(self, var_name, is_model): stop=stop, ts_type=ts_type_read, vert_which=vert_which, - flex_ts_type=self.flex_ts_type, + flex_ts_type=self.colocation_setup.flex_ts_type, **kwargs, ) except DataCoverageError: @@ -1319,7 +1330,7 @@ def _read_gridded(self, var_name, is_model): start=start, stop=stop, ts_type=ts_type_read, - flex_ts_type=self.flex_ts_type, + flex_ts_type=self.colocation_setup.flex_ts_type, vert_which=vert_which_alt, ) @@ -1328,17 +1339,17 @@ def _read_gridded(self, var_name, is_model): def _try_get_vert_which_alt(self, is_model, var_name): if is_model: - if self.obs_vert_type in self.OBS_VERT_TYPES_ALT: - return self.OBS_VERT_TYPES_ALT[self.obs_vert_type] + if self.colocation_setup.obs_vert_type in self.colocation_setup.OBS_VERT_TYPES_ALT: + return self.OBS_VERT_TYPES_ALT[self.colocation_setup.obs_vert_type] raise DataCoverageError(f"No alternative vert type found for {var_name}") def _check_remove_outliers_gridded(self, data, var_name, is_model): if is_model: - rm_outliers = self.model_remove_outliers - outlier_ranges = self.model_outlier_ranges + rm_outliers = self.colocation_setup.model_remove_outliers + outlier_ranges = self.colocation_setup.model_outlier_ranges else: - rm_outliers = self.obs_remove_outliers - outlier_ranges = self.obs_outlier_ranges + rm_outliers = self.colocation_setup.obs_remove_outliers + outlier_ranges = self.colocation_setup.obs_outlier_ranges if len(outlier_ranges) > 0 and not rm_outliers: logger.warning( @@ -1419,7 +1430,7 @@ def _save_coldata(self, coldata): logger.info(msg) def _eval_resample_how(self, model_var, obs_var): - rshow = self.resample_how + rshow = 
self.colocation_setup.resample_how if not isinstance(rshow, dict): return rshow @@ -1440,7 +1451,7 @@ def _infer_start_stop_yr_from_model_reader(self): # get sorted list of years available in model data (files with year # 9999 denote climatological data) yrs_avail = self.model_reader.years_avail - if self.model_use_climatology: + if self.colocation_setup.model_use_climatology: if not 9999 in yrs_avail: raise DataCoverageError("No climatology files available") first, last = 9999, None @@ -1454,16 +1465,18 @@ def _infer_start_stop_yr_from_model_reader(self): self.stop = last def _check_set_start_stop(self): - if self.start is None: + if self.colocation_setup.start is None: self._infer_start_stop_yr_from_model_reader() - if self.model_use_climatology: - if self.stop is not None or not isinstance(self.start, int): + if self.colocation_setup.model_use_climatology: + if self.colocation_setup.stop is not None or not isinstance( + self.colocation_setup.start, int + ): raise ColocationSetupError( "Conflict: only single year analyses are support for model " 'climatology fields, please specify "start" as integer ' 'denoting the year, and set "stop"=None' ) - self.start, self.stop = start_stop(self.start, self.stop) + self.start, self.stop = start_stop(self.colocation_setup.start, self.colocation_setup.stop) def _coldata_savename(self, obs_var, mod_var, ts_type, **kwargs): """Get filename of colocated data file for saving""" @@ -1485,7 +1498,7 @@ def _coldata_savename(self, obs_var, mod_var, ts_type, **kwargs): return f"{name}.nc" def _get_colocation_ts_type(self, model_ts_type, obs_ts_type=None): - chk = [self.ts_type, model_ts_type] + chk = [self.colocation_setup.ts_type, model_ts_type] if obs_ts_type is not None: chk.append(obs_ts_type) return get_lowest_resolution(*chk) @@ -1518,7 +1531,7 @@ def _prepare_colocation_args(self, model_var: str, obs_var: str): rshow = self._eval_resample_how(model_var, obs_var) - if self.model_use_climatology: + if 
self.colocation_setup.model_use_climatology: baseyr = self.start.year else: baseyr = None @@ -1528,12 +1541,12 @@ def _prepare_colocation_args(self, model_var: str, obs_var: str): data_ref=obs_data, start=self.start, stop=self.stop, - filter_name=self.filter_name, - regrid_res_deg=self.regrid_res_deg, - harmonise_units=self.harmonise_units, + filter_name=self.colocation_setup.filter_name, + regrid_res_deg=self.colocation_setup.regrid_res_deg, + harmonise_units=self.colocation_setup.harmonise_units, update_baseyear_gridded=baseyr, - min_num_obs=self.min_num_obs, - colocate_time=self.colocate_time, + min_num_obs=self.colocation_setup.min_num_obs, + colocate_time=self.colocation_setup.colocate_time, resample_how=rshow, ) if self.obs_is_ungridded: @@ -1541,15 +1554,15 @@ def _prepare_colocation_args(self, model_var: str, obs_var: str): args.update( ts_type=ts_type, var_ref=obs_var, - use_climatology_ref=self.obs_use_climatology, + use_climatology_ref=self.colocation_setup.obs_use_climatology, ) else: ts_type = self._get_colocation_ts_type(model_data.ts_type, obs_data.ts_type) args.update(ts_type=ts_type) if self.obs_is_vertical_profile: args.update( - colocation_layer_limits=self.colocation_layer_limits, - profile_layer_limits=self.profile_layer_limits, + colocation_layer_limits=self.colocation_setup.colocation_layer_limits, + profile_layer_limits=self.colocation_setup.profile_layer_limits, ) return args @@ -1559,12 +1572,12 @@ def _check_dimensionality(self, args): from pyaerocom.exceptions import DataDimensionError from pyaerocom.griddeddata import GriddedData - if mdata.ndim == 4 and self.obs_vert_type == "Surface": + if mdata.ndim == 4 and self.colocation_setup.obs_vert_type == "Surface": mdata = mdata.extract_surface_level() args["data"] = mdata if isinstance(odata, GriddedData): - if odata.ndim == 4 and self.obs_vert_type == "Surface": + if odata.ndim == 4 and self.colocation_setup.obs_vert_type == "Surface": odata = odata.extract_surface_level() args["data_ref"] 
= odata elif odata.ndim > 3: @@ -1574,7 +1587,9 @@ def _check_dimensionality(self, args): return args def _run_helper(self, model_var: str, obs_var: str): - logger.info(f"Running {self.model_id} ({model_var}) vs. {self.obs_id} ({obs_var})") + logger.info( + f"Running {self.colocation_setup.model_id} ({model_var}) vs. {self.colocation_setup.obs_id} ({obs_var})" + ) args = self._prepare_colocation_args(model_var, obs_var) args = self._check_dimensionality(args) coldata = self._colocation_func(**args) @@ -1582,15 +1597,15 @@ def _run_helper(self, model_var: str, obs_var: str): if isinstance(coldata, ColocatedData): coldata.data.attrs["model_name"] = self.get_model_name() coldata.data.attrs["obs_name"] = self.get_obs_name() - coldata.data.attrs["vert_code"] = self.obs_vert_type + coldata.data.attrs["vert_code"] = self.colocation_setup.obs_vert_type - coldata.data.attrs.update(**self.add_meta) + coldata.data.attrs.update(**self.colocation_setup.add_meta) - if self.zeros_to_nan: + if self.colocation_setup.zeros_to_nan: coldata = coldata.set_zeros_nan() - if self.model_to_stp: + if self.colocation_setup.model_to_stp: coldata = correct_model_stp_coldata(coldata) - if self.save_coldata: + if self.colocation_setup.save_coldata: self._save_coldata(coldata) elif isinstance(coldata, ColocatedDataLists): # look into intertools chain.from_iterable @@ -1598,13 +1613,13 @@ def _run_helper(self, model_var: str, obs_var: str): for coldata_obj in i_list: coldata_obj.data.attrs["model_name"] = self.get_model_name() coldata_obj.data.attrs["obs_name"] = self.get_obs_name() - coldata_obj.data.attrs["vert_code"] = self.obs_vert_type - coldata_obj.data.attrs.update(**self.add_meta) - if self.zeros_to_nan: + coldata_obj.data.attrs["vert_code"] = self.colocation_setup.obs_vert_type + coldata_obj.data.attrs.update(**self.colocation_setup.add_meta) + if self.colocation_setup.zeros_to_nan: coldata_obj = coldata_obj.set_zeros_nan() - if self.model_to_stp: # TODO: check is this needs modifying + 
if self.colocation_setup.model_to_stp: # TODO: check is this needs modifying coldata = correct_model_stp_coldata(coldata_obj) - if self.save_coldata: + if self.colocation_setup.save_coldata: self._save_coldata(coldata_obj) else: diff --git a/pyaerocom/colocation_setup.py b/pyaerocom/colocation_setup.py index 760645857..d19ff2c9a 100644 --- a/pyaerocom/colocation_setup.py +++ b/pyaerocom/colocation_setup.py @@ -3,7 +3,7 @@ import sys from functools import cached_property from pathlib import Path -from typing import Iterable, Literal +from typing import Callable, Iterable, Literal import pandas as pd from pydantic import ( @@ -287,7 +287,9 @@ class ColocationSetup(BaseModel): arbitrary_types_allowed=True, allow="extra", protected_namespaces=(), - validate_assignment=True, + frozen=True, # make immutable + # validate_assignment=True, + # property_set_methods = {"obs_config": "set_obs_config"} ) # @model_validator('*', mode="before") @@ -318,8 +320,19 @@ def validate_obs_vars(cls, v): ts_type: str # = None start: pd.Timestamp | int | None # = None stop: pd.Timestamp | int | None # = None + obs_config: PyaroConfig | None = None + # def set_obs_config(self, obs_config): + # self.obs_config = obs_config + + # def __setattr__(self, key, val): + # method = self.__config__.property_set_methods.get(key) + # if method is None: + # super().__setattr__(key, val) + # else: + # getattr(self, method)(val) + ############################### # Attributes with defaults ############################### @@ -363,7 +376,7 @@ def validate_basedirs(cls, v): # Options related to obs reading and processing obs_name: str | None = None - obs_data_dir: str | None = None + obs_data_dir: Path | str | None = None obs_use_climatology: bool = False @@ -389,7 +402,9 @@ def validate_basedirs(cls, v): model_ts_type_read: str | dict | None = None # LB: need to check this declaration - model_read_aux: dict[str, dict[Literal["vars_required", "fun"], list[str]]] | None = {} + model_read_aux: dict[ + str, 
dict[Literal["vars_required", "fun"], list[str] | Callable] + ] | None = {} model_use_climatology: bool = False model_kwargs: dict = {} @@ -465,7 +480,9 @@ def basedir_logfiles(self): p.mkdir(parents=True, exist_ok=True) return str(p) # LB: not sure why pyaerocom insists these be strings as this point - # @field_validator("obs_id") + # #@field_validator("obs_id") + # @model_validator(mode="after") + # @classmethod # def validate_obs_id(cls, v: str): # if cls.obs_config is not None and v != cls.obs.config.name: # logger.info( diff --git a/tests/test_colocation_auto.py b/tests/test_colocation_auto.py index 0065ed233..128cc3c16 100644 --- a/tests/test_colocation_auto.py +++ b/tests/test_colocation_auto.py @@ -62,8 +62,20 @@ } +# @pytest.fixture(scope="function") +# def tm5_aero_stp(): +# return dict( +# model_id="TM5-met2010_CTRL-TEST", +# obs_id="AeronetSunV3L2Subset.daily", +# obs_vars="od550aer", +# start=2010, +# raise_exceptions=True, +# reanalyse_existing=True, +# ) + + @pytest.fixture(scope="function") -def tm5_aero_stp(): +def setup(): return dict( model_id="TM5-met2010_CTRL-TEST", obs_id="AeronetSunV3L2Subset.daily", @@ -75,13 +87,19 @@ def tm5_aero_stp(): @pytest.fixture(scope="function") -def col(): - return Colocator(raise_exceptions=True, reanalyse_existing=True) +def tm5_aero_col_stp(setup): + return ColocationSetup(**setup) + + +@pytest.fixture(scope="function") +def col(tm5_aero_col_stp): + col = Colocator(tm5_aero_col_stp, raise_exceptions=True, reanalyse_existing=True) + return col -@pytest.mark.parametrize("stp,should_be", [(ColocationSetup(), default_setup)]) -def test_colocation_setup(stp: ColocationSetup, should_be: dict): - stp_dict = stp.model_dump() +@pytest.mark.parametrize("col_stp,should_be", [(ColocationSetup(), default_setup)]) +def test_colocation_setup(col_stp: ColocationSetup, should_be: dict): + stp_dict = col_stp.model_dump() for key, val in should_be.items(): assert key in stp_dict if key == "basedir_coldata": @@ -105,33 +123,34 @@ 
def test_ColocationSetup_invalid_input(key, val, raises): assert stp.model_dump()[key] == val -def test_Colocator__obs_vars__setter(col): - col.obs_vars = "var" - assert col.obs_vars == ["var"] +# def test_Colocator__obs_vars__setter(col): +# col.colocation_setup.obs_vars = "var" +# assert col.obs_vars == ["var"] -# LB: Not sure if Colocator should be allowed to accept objects -def test_Colocator__add_attr(col): - col.bla = "blub" - col["blub"] = 42 +# # LB: Not sure if Colocator should be allowed to accept objects +# def test_Colocator__add_attr(col): +# col.bla = "blub" +# col.blub = 42 - assert col.bla == "blub" - assert "blub" in col +# assert col.bla == "blub" +# assert "blub" in col @pytest.mark.parametrize("ts_type_desired", ["daily", "monthly"]) @pytest.mark.parametrize("ts_type", ["monthly"]) @pytest.mark.parametrize("flex", [False, True]) -def test_Colocator_model_ts_type_read(tm5_aero_stp, ts_type_desired, ts_type, flex): - col = Colocator(**tm5_aero_stp) +def test_Colocator_model_ts_type_read(setup, ts_type_desired, ts_type, flex): + setup["model_ts_type_read"] = {"obs_var": ts_type_desired} + + setup.update(dict(ts_type=ts_type, flex=flex)) + col_stp = ColocationSetup(**setup) + + col = Colocator(col_stp) obs_var = "od550aer" - assert tm5_aero_stp["obs_vars"] == obs_var - col.save_coldata = False - col.flex_ts_type = flex - col.ts_type = ts_type + assert setup["obs_vars"] == obs_var # Problem with saving since obs_id is different # from obs_data.contains_dataset[0]... 
- col.model_ts_type_read = {obs_var: ts_type_desired} data = col.run() assert isinstance(data, dict) assert obs_var in data @@ -142,16 +161,16 @@ def test_Colocator_model_ts_type_read(tm5_aero_stp, ts_type_desired, ts_type, fl assert coldata.metadata["ts_type_src"][1] == ts_type_desired -def test_Colocator_model_ts_type_read_error(tm5_aero_stp): - col = Colocator(**tm5_aero_stp) +def test_Colocator_model_ts_type_read_error(tm5_aero_col_stp): + col = Colocator(tm5_aero_col_stp) col.model_ts_type_read = {"od550aer": "minutely"} with pytest.raises(ColocationError) as e: col.run() assert str(e.value).startswith("Failed to load model data: TM5-met2010_CTRL-TEST (od550aer)") -def test_Colocator_model_add_vars(tm5_aero_stp): - col = Colocator(**tm5_aero_stp) +def test_Colocator_model_add_vars(tm5_aero_col_stp): + col = Colocator(tm5_aero_col_stp) model_var = "abs550aer" obs_var = "od550aer" col.save_coldata = False @@ -165,22 +184,25 @@ def test_Colocator_model_add_vars(tm5_aero_stp): assert coldata.var_name == ["od550aer", "abs550aer"] -def test_Colocator_init_basedir_coldata(tmp_path: Path): +def test_Colocator_init_basedir_coldata(setup, tmp_path: Path): base_path = tmp_path / "basedir" - Colocator(raise_exceptions=True, basedir_coldata=base_path) + setup["basedir_coldata"] = base_path + setup["raise_exceptions"] = True + Colocator(setup) + assert base_path.is_dir() -def test_Colocator__infer_start_stop_yr_from_model_reader(): - col = Colocator() +def test_Colocator__infer_start_stop_yr_from_model_reader(tm5_aero_col_stp): + col = Colocator(tm5_aero_col_stp) col.model_id = "TM5-met2010_CTRL-TEST" col._infer_start_stop_yr_from_model_reader() assert col.start == 2010 assert col.stop == None -def test_Colocator__coldata_savename(): - col = Colocator(raise_exceptions=True) +def test_Colocator__coldata_savename(tm5_aero_col_stp): + col = Colocator(tm5_aero_col_stp, raise_exceptions=True) col.obs_name = "obs" col.model_name = "model" col.filter_name = ALL_REGION_NAME @@ 
-195,11 +217,11 @@ def test_Colocator__coldata_savename(): # LB: Not clear if this is intended functionality or what or we can remove. # Currently set up to revalidate the basedir_coldata everytime # validation creates this directory -def test_Colocator_basedir_coldata(tmp_path: Path): - base_path = tmp_path / "test" - col = Colocator(raise_exceptions=True) - col.basedir_coldata = base_path - assert not base_path.is_dir() +# def test_Colocator_basedir_coldata(tmp_path: Path): +# base_path = tmp_path / "test" +# col = Colocator(raise_exceptions=True) +# col.basedir_coldata = base_path +# assert not base_path.is_dir() def test_Colocator_update_basedir_coldata(tmp_path: Path): @@ -228,8 +250,8 @@ def test_Colocator_update(what): assert col[key] == val -def test_Colocator_run_gridded_gridded(tm5_aero_stp): - col = Colocator(**tm5_aero_stp) +def test_Colocator_run_gridded_gridded(tm5_aero_col_stp): + col = Colocator(**tm5_aero_col_stp) col.obs_id = col.model_id col.run() var = col.obs_vars[0] @@ -275,19 +297,19 @@ def test_Colocator_run_gridded_gridded(tm5_aero_stp): ], ) def test_Colocator_run_gridded_ungridded( - tm5_aero_stp, update, chk_mvar, chk_ovar, sh, mean_obs, mean_mod + setup, update, chk_mvar, chk_ovar, sh, mean_obs, mean_mod ): - stp = ColocationSetup(**tm5_aero_stp) - stp.update(update) + setup.update(update) + col_stp = ColocationSetup(**setup) - result = Colocator(**stp.model_dump()).run() + result = Colocator(col_stp).run() assert isinstance(result, dict) coldata = result[chk_mvar][chk_ovar] assert coldata.shape == sh mod_clim_used = any("9999" in x for x in coldata.metadata["from_files"]) - assert stp.model_use_climatology == mod_clim_used + assert col_stp.model_use_climatology == mod_clim_used assert np.nanmean(coldata.data[0].values) == pytest.approx(mean_obs, abs=0.01) assert np.nanmean(coldata.data[1].values) == pytest.approx(mean_mod, abs=0.01) @@ -310,21 +332,21 @@ def test_Colocator_run_gridded_ungridded( ), ], ) -def 
test_Colocator_run_gridded_ungridded_error(tm5_aero_stp, update, error): - stp = ColocationSetup(**tm5_aero_stp) - stp.update(update) +def test_Colocator_run_gridded_ungridded_error(setup, update, error): + setup.update(update) + col_stp = ColocationSetup(setup) with pytest.raises(ColocationSetupError) as e: - Colocator(**stp).run() + Colocator(col_stp).run() assert str(e.value).startswith(error) -def test_colocator_filter_name(): - col = Colocator(filter_name=ALL_REGION_NAME) +def test_colocator_filter_name(setup): + col = Colocator(setup, filter_name=ALL_REGION_NAME) assert col.filter_name == ALL_REGION_NAME -def test_colocator_read_ungridded(): - col = Colocator(raise_exceptions=True) +def test_colocator_read_ungridded(setup): + col = Colocator(setup, raise_exceptions=True) obs_id = "AeronetSunV3L2Subset.daily" obs_var = "od550aer" col.obs_filters = {"longitude": [-30, 30]} @@ -340,16 +362,16 @@ def test_colocator_read_ungridded(): data = col._read_ungridded("invalid") -def test_colocator_get_model_data(): - col = Colocator(raise_exceptions=True) +def test_colocator_get_model_data(setup): + col = Colocator(setup, raise_exceptions=True) model_id = "TM5-met2010_CTRL-TEST" col.model_id = model_id data = col.get_model_data("od550aer") assert isinstance(data, GriddedData) -def test_colocator__find_var_matches(): - col = Colocator() +def test_colocator__find_var_matches(setup): + col = Colocator(setup) col.model_id = "TM5-met2010_CTRL-TEST" col.obs_id = "AeronetSunV3L2Subset.daily" col.obs_vars = "od550aer" @@ -364,8 +386,8 @@ def test_colocator__find_var_matches(): assert var_matches == {"od550aer": "conco3"} -def test_colocator__find_var_matches_model_add_vars(): - col = Colocator() +def test_colocator__find_var_matches_model_add_vars(setup): + col = Colocator(setup) col.model_id = "TM5-met2010_CTRL-TEST" col.obs_id = "AeronetSunV3L2Subset.daily" ovar = "od550aer" @@ -377,9 +399,9 @@ def test_colocator__find_var_matches_model_add_vars(): # LB: This test breaks the 
way I want this class to work because it implies allowing adding of attributes. -def test_colocator_instantiate_gridded_reader(path_emep): - col = Colocator(gridded_reader_id={"model": "ReadMscwCtm", "obs": "ReadGridded"}) - col.filepath = path_emep["daily"] +def test_colocator_instantiate_gridded_reader(setup, path_emep): + col = Colocator(setup, gridded_reader_id={"model": "ReadMscwCtm", "obs": "ReadGridded"}) + # col.filepath = path_emep["daily"] model_id = "model" col.model_id = model_id r = col._instantiate_gridded_reader(what="model") @@ -387,8 +409,8 @@ def test_colocator_instantiate_gridded_reader(path_emep): assert r.data_id == model_id -def test_colocator_instantiate_gridded_reader_model_data_dir(path_emep): - col = Colocator(gridded_reader_id={"model": "ReadMscwCtm", "obs": "ReadGridded"}) +def test_colocator_instantiate_gridded_reader_model_data_dir(setup, path_emep): + col = Colocator(setup, gridded_reader_id={"model": "ReadMscwCtm", "obs": "ReadGridded"}) model_data_dir = path_emep["data_dir"] col.model_data_dir = path_emep["data_dir"] model_id = "model" @@ -399,23 +421,23 @@ def test_colocator_instantiate_gridded_reader_model_data_dir(path_emep): assert r.data_id == model_id -def test_colocator__get_gridded_reader_class(): +def test_colocator__get_gridded_reader_class(setup): gridded_reader_id = {"model": "ReadMscwCtm", "obs": "ReadMscwCtm"} - col = Colocator(gridded_reader_id=gridded_reader_id) + col = Colocator(setup, gridded_reader_id=gridded_reader_id) for what in ["model", "obs"]: assert col._get_gridded_reader_class(what=what) == ReadMscwCtm -def test_colocator__check_add_model_read_aux(): - col = Colocator(raise_exceptions=True) +def test_colocator__check_add_model_read_aux(setup): + col = Colocator(setup, raise_exceptions=True) col.model_id = "TM5-met2010_CTRL-TEST" assert not col._check_add_model_read_aux("od550aer") col.model_read_aux = {"od550aer": dict(vars_required=["od550aer", "od550aer"], fun=add_cubes)} assert 
col._check_add_model_read_aux("od550aer") -def test_colocator_with_obs_data_dir_ungridded(): - col = Colocator(save_coldata=False) +def test_colocator_with_obs_data_dir_ungridded(setup): + col = Colocator(setup, save_coldata=False) col.model_id = "TM5-met2010_CTRL-TEST" col.obs_id = "AeronetSunV3L2Subset.daily" col.obs_vars = "od550aer" @@ -432,8 +454,8 @@ def test_colocator_with_obs_data_dir_ungridded(): assert str(cd.stop) == "2010-12-15T00:00:00.000000000" -def test_colocator_with_model_data_dir_ungridded(): - col = Colocator(save_coldata=False) +def test_colocator_with_model_data_dir_ungridded(setup): + col = Colocator(setup, save_coldata=False) col.model_id = "TM5-met2010_CTRL-TEST" col.obs_id = "AeronetSunV3L2Subset.daily" col.obs_vars = "od550aer" From 6e8d5b899276f1279dcda4dc1cda5ed6aca181b2 Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Tue, 28 May 2024 12:01:12 +0200 Subject: [PATCH 22/44] update test_colocation_auto WIP --- pyaerocom/colocation_auto.py | 42 +++--- tests/test_colocation_auto.py | 263 ++++++++++++++++++---------------- 2 files changed, 165 insertions(+), 140 deletions(-) diff --git a/pyaerocom/colocation_auto.py b/pyaerocom/colocation_auto.py index ed510ec0f..7410e3112 100644 --- a/pyaerocom/colocation_auto.py +++ b/pyaerocom/colocation_auto.py @@ -642,11 +642,11 @@ def model_vars(self): list of all model variables specified in this setup. 
""" - ovars = self.obs_vars + ovars = self.self.colocation_setup.obs_vars model_vars = [] for ovar in ovars: - if ovar in self.model_use_vars: - model_vars.append(self.model_use_vars[ovar]) + if ovar in self.colocation_setup.model_add_vars: + model_vars.append(self.colocation_setup.model_add_vars[ovar]) else: model_vars.append(ovar) @@ -1173,7 +1173,7 @@ def _check_add_model_read_aux(self, model_var): return True def _check_obs_vars_available(self): - if self.colocation_setup.obs_vars == []: + if not len(self.colocation_setup.obs_vars) > 0: raise ColocationSetupError("no observation variables specified...") oreader = self.obs_reader if self.obs_is_ungridded: @@ -1188,7 +1188,7 @@ def _check_obs_vars_available(self): if len(self.colocation_setup.obs_vars) > len(avail): for ovar in self.colocation_setup.obs_vars: - if not ovar in avail: + if ovar not in avail: logger.warning( f"Obs variable {ovar} is not available in {self.colocation_setup.obs_id} " f"and will be ignored" @@ -1196,7 +1196,7 @@ def _check_obs_vars_available(self): self._processing_status.append([None, ovar, 3]) if self.colocation_setup.raise_exceptions: - invalid = [var for var in self.colocation_setup.obs_vars if not var in avail] + invalid = [var for var in self.colocation_setup.obs_vars if var not in avail] invalid = "; ".join(invalid) raise DataCoverageError( f"Invalid obs var(s) for {self.colocation_setup.obs_id}: {invalid}" @@ -1284,7 +1284,7 @@ def _get_ts_type_read(self, var_name, is_model): tst = self.colocation_setup.model_ts_type_read if tst == "": tst = self.colocation_setup.ts_type - elif not is_model and self.obs_ts_type_read is not None: + elif not is_model and self.colocation_setup.obs_ts_type_read is not None: tst = self.obs_ts_type_read if isinstance(tst, dict): if var_name in tst: @@ -1294,7 +1294,7 @@ def _get_ts_type_read(self, var_name, is_model): return tst def _read_gridded(self, var_name, is_model): - start, stop = self.start, self.stop + start, stop = 
self.colocation_setup.start, self.colocation_setup.stop ts_type_read = self._get_ts_type_read(var_name, is_model) kwargs = {} if is_model: @@ -1373,23 +1373,25 @@ def _check_remove_outliers_gridded(self, data, var_name, is_model): def _eval_obs_filters(self, var_name): obs_filters = self.obs_filters if var_name in obs_filters: + # return obs_filters[var_name] obs_filters = obs_filters[var_name] - remaining = {} if not isinstance(obs_filters, dict): raise AttributeError( f"Detected obs_filters attribute in Colocator class, " f"which is not a dictionary: {obs_filters}" ) - for key, val in obs_filters.items(): - # keep ts_type filter in remaining (added on 17.2.21, 0.10.0 -> 0.10.1) - if key in self and not key == "ts_type": # can be handled - if isinstance(self[key], dict) and isinstance(val, dict): - self[key].update(val) - else: - self[key] = val - else: - remaining[key] = val - return remaining + # remaining = {} + # for key, val in obs_filters.items(): + # # keep ts_type filter in remaining (added on 17.2.21, 0.10.0 -> 0.10.1) + # if key in self.colocation_setup.obs_filters and not key == "ts_type": # can be handled + # #if isinstance(self[key], dict) and isinstance(val, dict): + # #self[key].update(val) + # #else: + # #self[key] = val + # pass + # else: + # remaining[key] = val + return obs_filters if len(obs_filters) > 0 else {} def _save_coldata(self, coldata): """Helper for saving colocateddata""" @@ -1492,7 +1494,7 @@ def _coldata_savename(self, obs_var, mod_var, ts_type, **kwargs): start_str=self.get_start_str(), stop_str=self.get_stop_str(), ts_type=ts_type, - filter_name=self.filter_name, + filter_name=self.colocation_setup.filter_name, vertical_layer=vertical_layer, ) return f"{name}.nc" diff --git a/tests/test_colocation_auto.py b/tests/test_colocation_auto.py index 128cc3c16..2d0e059af 100644 --- a/tests/test_colocation_auto.py +++ b/tests/test_colocation_auto.py @@ -141,14 +141,12 @@ def test_ColocationSetup_invalid_input(key, val, raises): 
@pytest.mark.parametrize("ts_type", ["monthly"]) @pytest.mark.parametrize("flex", [False, True]) def test_Colocator_model_ts_type_read(setup, ts_type_desired, ts_type, flex): - setup["model_ts_type_read"] = {"obs_var": ts_type_desired} - + obs_var = "od550aer" + setup["model_ts_type_read"] = {obs_var: ts_type_desired} setup.update(dict(ts_type=ts_type, flex=flex)) + assert setup["obs_vars"] == obs_var col_stp = ColocationSetup(**setup) - col = Colocator(col_stp) - obs_var = "od550aer" - assert setup["obs_vars"] == obs_var # Problem with saving since obs_id is different # from obs_data.contains_dataset[0]... data = col.run() @@ -161,23 +159,23 @@ def test_Colocator_model_ts_type_read(setup, ts_type_desired, ts_type, flex): assert coldata.metadata["ts_type_src"][1] == ts_type_desired -def test_Colocator_model_ts_type_read_error(tm5_aero_col_stp): - col = Colocator(tm5_aero_col_stp) - col.model_ts_type_read = {"od550aer": "minutely"} +def test_Colocator_model_ts_type_read_error(setup): + setup["model_ts_type_read"] = {"od550aer": "minutely"} + col_stp = ColocationSetup(**setup) + col = Colocator(col_stp) with pytest.raises(ColocationError) as e: col.run() assert str(e.value).startswith("Failed to load model data: TM5-met2010_CTRL-TEST (od550aer)") -def test_Colocator_model_add_vars(tm5_aero_col_stp): - col = Colocator(tm5_aero_col_stp) +def test_Colocator_model_add_vars(setup): model_var = "abs550aer" obs_var = "od550aer" - col.save_coldata = False + setup["model_add_vars"] = {obs_var: [model_var]} + col_stp = ColocationSetup(**setup) + col = Colocator(col_stp) # Problem with saving since obs_id is different - - col.model_add_vars = {obs_var: [model_var]} - data = col.run(var_name=model_var) + data = col.run(var_list=[model_var]) assert isinstance(data, dict) assert model_var in data coldata = data[model_var][obs_var] @@ -188,7 +186,8 @@ def test_Colocator_init_basedir_coldata(setup, tmp_path: Path): base_path = tmp_path / "basedir" setup["basedir_coldata"] = 
base_path setup["raise_exceptions"] = True - Colocator(setup) + col_stp = ColocationSetup(**setup) + Colocator(col_stp) assert base_path.is_dir() @@ -201,12 +200,14 @@ def test_Colocator__infer_start_stop_yr_from_model_reader(tm5_aero_col_stp): assert col.stop == None -def test_Colocator__coldata_savename(tm5_aero_col_stp): - col = Colocator(tm5_aero_col_stp, raise_exceptions=True) - col.obs_name = "obs" - col.model_name = "model" - col.filter_name = ALL_REGION_NAME - col.start = 2015 +def test_Colocator__coldata_savename(setup): + setup["raise_exceptions"] = True + setup["obs_name"] = "obs" + setup["model_name"] = "model" + setup["filter_name"] = ALL_REGION_NAME + setup["start"] = 2015 + col_stp = ColocationSetup(**setup) + col = Colocator(col_stp) col._check_set_start_stop() savename = col._coldata_savename("od550aer", "od550ss", "daily") assert isinstance(savename, str) @@ -224,37 +225,43 @@ def test_Colocator__coldata_savename(tm5_aero_col_stp): # assert not base_path.is_dir() -def test_Colocator_update_basedir_coldata(tmp_path: Path): - col = Colocator(raise_exceptions=True) - - base_path = tmp_path / "basedir" - assert not base_path.is_dir() - col.update(basedir_coldata=base_path) - assert base_path.is_dir() - - -@pytest.mark.parametrize( - "what", - [ - dict(blaa=42), - dict(obs_id="test", model_id="test"), - dict(gridded_reader_id="test"), - dict(gridded_reader_id={"test": 42}), - dict(resample_how={"daily": {"hourly": "max"}}), - ], -) -def test_Colocator_update(what): - col = Colocator(raise_exceptions=True) - col.update(**what) - for key, val in what.items(): - assert col[key] == val - - -def test_Colocator_run_gridded_gridded(tm5_aero_col_stp): - col = Colocator(**tm5_aero_col_stp) - col.obs_id = col.model_id +# def test_Colocator_update_basedir_coldata(setup, tmp_path: Path): +# base_path = tmp_path / "basedir" +# setup.update(basedir_coldata = base_path) +# col_stp = ColocationSetup(**setup) + +# col = Colocator(col_stp) +# # LB: This is not how the 
new one works +# # assert not base_path.is_dir() +# # col.update(basedir_coldata=base_path) +# assert base_path.is_dir() + +# Lb: We should not test this because we don't support this functionality +# @pytest.mark.parametrize( +# "what", +# [ +# dict(blaa=42), +# dict(obs_id="test", model_id="test"), +# dict(gridded_reader_id="test"), +# dict(gridded_reader_id={"test": 42}), +# dict(resample_how={"daily": {"hourly": "max"}}), +# ], +# ) +# def test_Colocator_update(what, setup): +# setup["raise_exceptions"] = True +# setup.update(**what) +# col_stp = ColocationSetup(**setup) +# col = Colocator(col_stp) +# for key, val in what.items(): +# assert col[key] == val + + +def test_Colocator_run_gridded_gridded(setup): + setup["obs_id"] = setup["model_id"] + col_stp = ColocationSetup(**setup) + col = Colocator(col_stp) col.run() - var = col.obs_vars[0] + var = col.colocation_setup.obs_vars[0] coldata = col.data[var][var] assert isinstance(coldata, ColocatedData) assert coldata.ndim == 4 @@ -341,17 +348,21 @@ def test_Colocator_run_gridded_ungridded_error(setup, update, error): def test_colocator_filter_name(setup): - col = Colocator(setup, filter_name=ALL_REGION_NAME) - assert col.filter_name == ALL_REGION_NAME + setup["filter_name"] = ALL_REGION_NAME + col_stp = ColocationSetup(**setup) + col = Colocator(col_stp) + assert col.colocation_setup.filter_name == ALL_REGION_NAME def test_colocator_read_ungridded(setup): - col = Colocator(setup, raise_exceptions=True) - obs_id = "AeronetSunV3L2Subset.daily" + # obs_id = "AeronetSunV3L2Subset.daily" obs_var = "od550aer" - col.obs_filters = {"longitude": [-30, 30]} - col.obs_id = obs_id - col.read_opts_ungridded = {"last_file": 1} + # setup["obs_id"] = + setup["raise_exceptions"] = True + setup["obs_filters"] = {"longitude": [-30, 30]} + setup["read_opts_ungridded"] = {"last_file": 1} + col_stp = ColocationSetup(**setup) + col = Colocator(col_stp) data = col._read_ungridded(obs_var) assert isinstance(data, UngriddedData) @@ 
-363,58 +374,69 @@ def test_colocator_read_ungridded(setup): def test_colocator_get_model_data(setup): - col = Colocator(setup, raise_exceptions=True) - model_id = "TM5-met2010_CTRL-TEST" - col.model_id = model_id + setup["raise_exceptions"] = True + setup["model_id"] = "TM5-met2010_CTRL-TEST" + col_stp = ColocationSetup(**setup) + + col = Colocator(col_stp) data = col.get_model_data("od550aer") assert isinstance(data, GriddedData) def test_colocator__find_var_matches(setup): - col = Colocator(setup) - col.model_id = "TM5-met2010_CTRL-TEST" - col.obs_id = "AeronetSunV3L2Subset.daily" - col.obs_vars = "od550aer" + setup["model_id"] = "TM5-met2010_CTRL-TEST" + setup["obs_id"] = "AeronetSunV3L2Subset.daily" + setup2 = setup.copy() + setup["obs_vars"] = "od550aer" + col_stp = ColocationSetup(**setup) + col = Colocator(col_stp) var_matches = col._find_var_matches() assert var_matches == {"od550aer": "od550aer"} obs_var = "conco3" - col.obs_vars = [obs_var] - col.model_use_vars = {obs_var: "od550aer"} - var_matches = col._find_var_matches() + setup2["obs_vars"] = obs_var + setup2["model_use_vars"] = {obs_var: "od550aer"} + col_stp2 = ColocationSetup(**setup2) + col2 = Colocator(col_stp2) + + var_matches = col2._find_var_matches() assert var_matches == {"od550aer": "conco3"} def test_colocator__find_var_matches_model_add_vars(setup): - col = Colocator(setup) - col.model_id = "TM5-met2010_CTRL-TEST" - col.obs_id = "AeronetSunV3L2Subset.daily" ovar = "od550aer" - col.obs_vars = [ovar] - - col.model_add_vars = {ovar: ["abs550aer"]} + setup["model_id"] = "TM5-met2010_CTRL-TEST" + setup["obs_id"] = "AeronetSunV3L2Subset.daily" + setup["obs_vars"] = [ovar] + setup["model_add_vars"] = {ovar: ["abs550aer"]} + col_stp = ColocationSetup(**setup) + col = Colocator(col_stp) var_matches = col._find_var_matches() assert var_matches == {"abs550aer": ovar, ovar: ovar} # LB: This test breaks the way I want this class to work because it implies allowing adding of attributes. 
def test_colocator_instantiate_gridded_reader(setup, path_emep): - col = Colocator(setup, gridded_reader_id={"model": "ReadMscwCtm", "obs": "ReadGridded"}) - # col.filepath = path_emep["daily"] model_id = "model" - col.model_id = model_id + setup["gridded_reader_id"] = {"model": "ReadMscwCtm", "obs": "ReadGridded"} + setup["model_id"] = model_id + setup["filepath"] = path_emep + col_stp = ColocationSetup(**setup) + col = Colocator(col_stp) r = col._instantiate_gridded_reader(what="model") assert isinstance(r, ReadMscwCtm) assert r.data_id == model_id def test_colocator_instantiate_gridded_reader_model_data_dir(setup, path_emep): - col = Colocator(setup, gridded_reader_id={"model": "ReadMscwCtm", "obs": "ReadGridded"}) model_data_dir = path_emep["data_dir"] - col.model_data_dir = path_emep["data_dir"] model_id = "model" - col.model_id = model_id + setup["gridded_reader_id"] = {"model": "ReadMscwCtm", "obs": "ReadGridded"} + setup["model_data_dir"] = model_data_dir + setup["model_id"] = model_id + col_stp = ColocationSetup(**setup) + col = Colocator(col_stp) r = col._instantiate_gridded_reader(what="model") assert isinstance(r, ReadMscwCtm) assert r.data_dir == model_data_dir @@ -422,29 +444,34 @@ def test_colocator_instantiate_gridded_reader_model_data_dir(setup, path_emep): def test_colocator__get_gridded_reader_class(setup): - gridded_reader_id = {"model": "ReadMscwCtm", "obs": "ReadMscwCtm"} - col = Colocator(setup, gridded_reader_id=gridded_reader_id) + setup["gridded_reader_id"] = {"model": "ReadMscwCtm", "obs": "ReadMscwCtm"} + col_stp = ColocationSetup(**setup) + col = Colocator(col_stp) for what in ["model", "obs"]: assert col._get_gridded_reader_class(what=what) == ReadMscwCtm def test_colocator__check_add_model_read_aux(setup): - col = Colocator(setup, raise_exceptions=True) - col.model_id = "TM5-met2010_CTRL-TEST" + setup["raise_exceptions"] = True + setup["model_id"] = "TM5-met2010_CTRL-TEST" + col_stp = ColocationSetup(**setup) + col = 
Colocator(col_stp) assert not col._check_add_model_read_aux("od550aer") - col.model_read_aux = {"od550aer": dict(vars_required=["od550aer", "od550aer"], fun=add_cubes)} - assert col._check_add_model_read_aux("od550aer") + setup["model_read_aux"] = { + "od550aer": dict(vars_required=["od550aer", "od550aer"], fun=add_cubes) + } + col_stp2 = ColocationSetup(**setup) + col2 = Colocator(col_stp2) + assert col2._check_add_model_read_aux("od550aer") def test_colocator_with_obs_data_dir_ungridded(setup): - col = Colocator(setup, save_coldata=False) - col.model_id = "TM5-met2010_CTRL-TEST" - col.obs_id = "AeronetSunV3L2Subset.daily" - col.obs_vars = "od550aer" - col.ts_type = "monthly" - - col.obs_data_dir = TEST_DATA["AeronetSunV3L2Subset.daily"].path - + setup["obs_id"] = "AeronetSunV3L2Subset.daily" + setup["model_id"] = "TM5-met2010_CTRL-TEST" + setup["obs_vars"] = "od550aer" + setup["obs_data_dir"] = TEST_DATA["AeronetSunV3L2Subset.daily"].path + col_stp = ColocationSetup(**setup) + col = Colocator(col_stp) data = col.run() assert len(data) == 1 cd = data["od550aer"]["od550aer"] @@ -455,14 +482,12 @@ def test_colocator_with_obs_data_dir_ungridded(setup): def test_colocator_with_model_data_dir_ungridded(setup): - col = Colocator(setup, save_coldata=False) - col.model_id = "TM5-met2010_CTRL-TEST" - col.obs_id = "AeronetSunV3L2Subset.daily" - col.obs_vars = "od550aer" - col.ts_type = "monthly" - - col.model_data_dir = TEST_DATA["MODELS"].path / "TM5-met2010_CTRL-TEST/renamed" - + setup["model_id"] = "TM5-met2010_CTRL-TEST" + setup["obs_id"] = "AeronetSunV3L2Subset.daily" + setup["obs_vars"] = "od550aer" + setup["model_data_dir"] = TEST_DATA["MODELS"].path / "TM5-met2010_CTRL-TEST/renamed" + col_stp = ColocationSetup(**setup) + col = Colocator(col_stp) data = col.run() assert len(data) == 1 cd = data["od550aer"]["od550aer"] @@ -472,16 +497,13 @@ def test_colocator_with_model_data_dir_ungridded(setup): assert str(cd.stop) == "2010-12-15T00:00:00.000000000" -def 
test_colocator_with_obs_data_dir_gridded(): - col = Colocator(save_coldata=False) - col.model_id = "TM5-met2010_CTRL-TEST" - col.obs_id = "TM5-met2010_CTRL-TEST" - col.obs_vars = "od550aer" - col.ts_type = "monthly" - - obs_dir = TEST_DATA["MODELS"].path / "TM5-met2010_CTRL-TEST/renamed" - col.obs_data_dir = str(obs_dir) - +def test_colocator_with_obs_data_dir_gridded(setup): + setup["model_id"] = "TM5-met2010_CTRL-TEST" + setup["obs_id"] = "TM5-met2010_CTRL-TEST" + setup["obs_vars"] = "od550aer" + setup["obs_dir"] = TEST_DATA["MODELS"].path / "TM5-met2010_CTRL-TEST/renamed" + col_stp = ColocationSetup(**setup) + col = Colocator(col_stp) data = col.run() assert len(data) == 1 cd = data["od550aer"]["od550aer"] @@ -496,15 +518,16 @@ def test_colocator_with_obs_data_dir_gridded(): ################################### -def test_colocation_pyaro(pyaro_testconfig, fake_MSCWCtm_data_monthly_2015) -> None: - col = Colocator(save_coldata=False) +def test_colocation_pyaro(pyaro_testconfig, fake_MSCWCtm_data_monthly_2015, setup) -> None: config = pyaro_testconfig[0] - col.obs_config = config - col.model_id = "EMEP" - col.gridded_reader_id = {"model": "ReadMscwCtm"} - col.model_data_dir = fake_MSCWCtm_data_monthly_2015 - col.obs_vars = "concso4" - col.ts_type = "monthly" + setup["obs_config"] = config + setup["model_id"] = "EMEP" + setup["gridded_reader_id"] = {"model": "ReadMscwCtm"} + setup["model_data_dir"] = fake_MSCWCtm_data_monthly_2015 + setup["obs_vars"] = "concso4" + + col_stp = ColocationSetup(**setup) + col = Colocator(col_stp) data = col.run() From dfd644a6173e5bb5178c53f6c58f87d68652ef02 Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Tue, 28 May 2024 15:48:07 +0200 Subject: [PATCH 23/44] fixing up testing with daniel --- pyaerocom/aeroval/_processing_base.py | 8 +- tests/fixtures/mscw_ctm.py | 30 ++++++-- tests/fixtures/pyaro.py | 10 +-- tests/io/pyaro/test_read_pyaro.py | 6 +- tests/test_colocation_auto.py | 102 ++------------------------ 5 files changed, 45 
insertions(+), 111 deletions(-) diff --git a/pyaerocom/aeroval/_processing_base.py b/pyaerocom/aeroval/_processing_base.py index a516a0a2a..722b21919 100644 --- a/pyaerocom/aeroval/_processing_base.py +++ b/pyaerocom/aeroval/_processing_base.py @@ -4,6 +4,7 @@ from pyaerocom.aeroval import EvalSetup from pyaerocom.aeroval.experiment_output import ExperimentOutput from pyaerocom.colocation_auto import Colocator +from pyaerocom.colocation_setup import ColocationSetup class HasConfig: @@ -116,13 +117,14 @@ def get_colocator(self, model_name: str = None, obs_name: str = None) -> Colocat if obs_name: obs_cfg = self.cfg.get_obs_entry(obs_name) pyaro_config = obs_cfg["obs_config"] if "obs_config" in obs_cfg else None - col_cfg = {**self.cfg.colocation_opts} + col_cfg = {**self.cfg.colocation_opts.model_dump()} col_cfg["obs_config"] = pyaro_config - col = Colocator(**col_cfg) + col_stp = ColocationSetup(**col_cfg) + col = Colocator(col_stp) col.import_from(obs_cfg) col.add_glob_meta(diurnal_only=self._get_diurnal_only(obs_name)) else: - col = Colocator(**self.cfg.colocation_opts) + col = Colocator(self.cfg.colocation_opts) if model_name: mod_cfg = self.cfg.get_model_entry(model_name) col.import_from(mod_cfg) diff --git a/tests/fixtures/mscw_ctm.py b/tests/fixtures/mscw_ctm.py index f500d0d43..3ea06796e 100644 --- a/tests/fixtures/mscw_ctm.py +++ b/tests/fixtures/mscw_ctm.py @@ -56,16 +56,36 @@ def create_fake_MSCWCtm_data(year="2019", numval=1, tst=None): return arr +# @pytest.fixture +# def fake_so4_MSCWCtm_data_monthly_2015(tmp_path) -> str: +# path = tmp_path / "EMEP_fake" / "2015" + +# if not path.exists(): +# path.mkdir(parents=True) +# data = create_fake_MSCWCtm_data(year=2015, numval=1) + +# var_name = "SURF_ug_SO4" +# units = "ug m-3" +# ds = xr.Dataset() + +# ds[var_name] = data +# ds[var_name].attrs.update(units=units, var_name=var_name) + +# ds.to_netcdf(path=path / "Base_month.nc") + +# return str(path) + + @pytest.fixture -def 
fake_MSCWCtm_data_monthly_2015(tmp_path) -> str: - path = tmp_path / "EMEP_fake" / "2015" +def fake_aod_MSCWCtm_data_monthly_2010(tmp_path) -> str: + path = tmp_path / "EMEP_fake" / "2010" if not path.exists(): path.mkdir(parents=True) - data = create_fake_MSCWCtm_data(year=2015, numval=1) + data = create_fake_MSCWCtm_data(year=2010, numval=1) - var_name = "SURF_ug_SO4" - units = "ug m-3" + var_name = "AOD_550nm" + units = "1" ds = xr.Dataset() ds[var_name] = data diff --git a/tests/fixtures/pyaro.py b/tests/fixtures/pyaro.py index 18a7d0137..312d672ca 100644 --- a/tests/fixtures/pyaro.py +++ b/tests/fixtures/pyaro.py @@ -18,13 +18,13 @@ def make_csv_test_file(tmp_path: Path) -> Path: if file.exists(): return file - start = pd.to_datetime("01.01.2015", dayfirst=True) - end = pd.to_datetime("31.12.2015", dayfirst=True) + start = pd.to_datetime("01.01.2010", dayfirst=True) + end = pd.to_datetime("31.12.2010", dayfirst=True) dates = pd.date_range(start, end, freq="D") stations = ["NO0002", "GB0881"] countries = ["NO", "GB"] coords = [(58, 8), (60, -1)] - species = ["NOx", "SOx"] + species = ["NOx", "SOx", "AOD"] with open(file, "w") as f: for s in species: @@ -48,7 +48,7 @@ def testconfig(tmp_path: Path) -> PyaroConfig: data_id=data_id, filename_or_obj_or_url=str(make_csv_test_file(tmp_path)), filters={}, - name_map={"SOx": "concso4"}, + name_map={"SOx": "concso4", "AOD": "od550aer"}, ) config2 = PyaroConfig( @@ -56,7 +56,7 @@ def testconfig(tmp_path: Path) -> PyaroConfig: data_id=data_id, filename_or_obj_or_url=str(make_csv_test_file(tmp_path)), filters={}, - name_map={"SOx": "concso4"}, + name_map={"SOx": "concso4", "AOD": "od550aer"}, ) return [config1, config2] diff --git a/tests/io/pyaro/test_read_pyaro.py b/tests/io/pyaro/test_read_pyaro.py index c6842d4e5..a786ebd4d 100644 --- a/tests/io/pyaro/test_read_pyaro.py +++ b/tests/io/pyaro/test_read_pyaro.py @@ -18,7 +18,7 @@ def test_readpyaro(pyaro_testdata): def test_variables(pyaro_testdata): rp = pyaro_testdata - 
variables = ["NOx", "concso4"] + variables = ["NOx", "concso4", "od550aer"] assert rp.PROVIDES_VARIABLES == variables assert rp.DEFAULT_VARS == variables @@ -47,8 +47,8 @@ def test_pyarotoungriddeddata_reading(pyaro_testdata): assert all_stations["stats"][0]["country"] == "NO" # Tests the dates - start = pd.to_datetime("01.01.2015", dayfirst=True) - end = pd.to_datetime("31.12.2015", dayfirst=True) + start = pd.to_datetime("01.01.2010", dayfirst=True) + end = pd.to_datetime("31.12.2010", dayfirst=True) dates = pd.date_range(start, end, freq="D") assert len(all_stations["stats"][0].dtime) == ceil(len(dates) / 2) diff --git a/tests/test_colocation_auto.py b/tests/test_colocation_auto.py index 2d0e059af..fc8ca110c 100644 --- a/tests/test_colocation_auto.py +++ b/tests/test_colocation_auto.py @@ -62,18 +62,6 @@ } -# @pytest.fixture(scope="function") -# def tm5_aero_stp(): -# return dict( -# model_id="TM5-met2010_CTRL-TEST", -# obs_id="AeronetSunV3L2Subset.daily", -# obs_vars="od550aer", -# start=2010, -# raise_exceptions=True, -# reanalyse_existing=True, -# ) - - @pytest.fixture(scope="function") def setup(): return dict( @@ -123,20 +111,6 @@ def test_ColocationSetup_invalid_input(key, val, raises): assert stp.model_dump()[key] == val -# def test_Colocator__obs_vars__setter(col): -# col.colocation_setup.obs_vars = "var" -# assert col.obs_vars == ["var"] - - -# # LB: Not sure if Colocator should be allowed to accept objects -# def test_Colocator__add_attr(col): -# col.bla = "blub" -# col.blub = 42 - -# assert col.bla == "blub" -# assert "blub" in col - - @pytest.mark.parametrize("ts_type_desired", ["daily", "monthly"]) @pytest.mark.parametrize("ts_type", ["monthly"]) @pytest.mark.parametrize("flex", [False, True]) @@ -215,47 +189,6 @@ def test_Colocator__coldata_savename(setup): assert savename == n -# LB: Not clear if this is intended functionality or what or we can remove. 
-# Currently set up to revalidate the basedir_coldata everytime -# validation creates this directory -# def test_Colocator_basedir_coldata(tmp_path: Path): -# base_path = tmp_path / "test" -# col = Colocator(raise_exceptions=True) -# col.basedir_coldata = base_path -# assert not base_path.is_dir() - - -# def test_Colocator_update_basedir_coldata(setup, tmp_path: Path): -# base_path = tmp_path / "basedir" -# setup.update(basedir_coldata = base_path) -# col_stp = ColocationSetup(**setup) - -# col = Colocator(col_stp) -# # LB: This is not how the new one works -# # assert not base_path.is_dir() -# # col.update(basedir_coldata=base_path) -# assert base_path.is_dir() - -# Lb: We should not test this because we don't support this functionality -# @pytest.mark.parametrize( -# "what", -# [ -# dict(blaa=42), -# dict(obs_id="test", model_id="test"), -# dict(gridded_reader_id="test"), -# dict(gridded_reader_id={"test": 42}), -# dict(resample_how={"daily": {"hourly": "max"}}), -# ], -# ) -# def test_Colocator_update(what, setup): -# setup["raise_exceptions"] = True -# setup.update(**what) -# col_stp = ColocationSetup(**setup) -# col = Colocator(col_stp) -# for key, val in what.items(): -# assert col[key] == val - - def test_Colocator_run_gridded_gridded(setup): setup["obs_id"] = setup["model_id"] col_stp = ColocationSetup(**setup) @@ -341,7 +274,7 @@ def test_Colocator_run_gridded_ungridded( ) def test_Colocator_run_gridded_ungridded_error(setup, update, error): setup.update(update) - col_stp = ColocationSetup(setup) + col_stp = ColocationSetup(**setup) with pytest.raises(ColocationSetupError) as e: Colocator(col_stp).run() assert str(e.value).startswith(error) @@ -518,46 +451,25 @@ def test_colocator_with_obs_data_dir_gridded(setup): ################################### -def test_colocation_pyaro(pyaro_testconfig, fake_MSCWCtm_data_monthly_2015, setup) -> None: +def test_colocation_pyaro(pyaro_testconfig, fake_aod_MSCWCtm_data_monthly_2010, setup) -> None: config = 
pyaro_testconfig[0] setup["obs_config"] = config setup["model_id"] = "EMEP" setup["gridded_reader_id"] = {"model": "ReadMscwCtm"} - setup["model_data_dir"] = fake_MSCWCtm_data_monthly_2015 - setup["obs_vars"] = "concso4" + setup["model_data_dir"] = fake_aod_MSCWCtm_data_monthly_2010 + setup["obs_vars"] = "od550aer" # This obs does not exist in Aeronet col_stp = ColocationSetup(**setup) col = Colocator(col_stp) data = col.run() - cd = data["concso4"]["concso4"] + cd = data["od550aer"]["od550aer"] assert isinstance(cd, ColocatedData) assert cd.ts_type == "monthly" - assert str(cd.start) == "2015-01-15T00:00:00.000000000" - assert str(cd.stop) == "2015-12-15T00:00:00.000000000" + assert str(cd.start) == "2010-01-15T00:00:00.000000000" + assert str(cd.stop) == "2010-12-15T00:00:00.000000000" assert np.sum(np.isnan(cd.data[0, :].data)) == 0 assert cd.data[0, :].data.shape[0] == 12 - - -def test_colocation_pyaro_change_obs_id(pyaro_testconfig, fake_MSCWCtm_data_monthly_2015) -> None: - col = Colocator(save_coldata=False) - config = pyaro_testconfig[0] - col.obs_id = "undefined" - col.obs_config = config - col.obs_id = "undefined" - col.obs_config = config - - col.model_id = "EMEP" - col.gridded_reader_id = {"model": "ReadMscwCtm"} - col.model_data_dir = fake_MSCWCtm_data_monthly_2015 - col.obs_vars = "concso4" - col.ts_type = "monthly" - - data = col.run() - - cd = data["concso4"]["concso4"] - assert isinstance(cd, ColocatedData) - assert cd.ts_type == "monthly" From 9caeba1ac8dcb84d69ade67b0806a2af525061ba Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Wed, 29 May 2024 13:16:59 +0200 Subject: [PATCH 24/44] Aeroval testing fixing WIP --- pyaerocom/aeroval/_processing_base.py | 19 ++++++++++++------- pyaerocom/aeroval/setupclasses.py | 12 ++++++------ pyaerocom/colocation_auto.py | 4 ++-- pyaerocom/colocation_setup.py | 4 ++-- tests/aeroval/test__processing_base.py | 8 ++++---- tests/aeroval/test_setupclasses.py | 4 ++-- 6 files changed, 28 insertions(+), 23 
deletions(-) diff --git a/pyaerocom/aeroval/_processing_base.py b/pyaerocom/aeroval/_processing_base.py index 722b21919..0bbd5310c 100644 --- a/pyaerocom/aeroval/_processing_base.py +++ b/pyaerocom/aeroval/_processing_base.py @@ -114,22 +114,27 @@ def get_colocator(self, model_name: str = None, obs_name: str = None) -> Colocat Colocator """ + # LB: In general I don't like what this function is doing. Ideally define the Colocator object once and just use that. + col_cfg = {**self.cfg.colocation_opts.model_dump()} + outdir = self.cfg.path_manager.get_coldata_dir() + col_cfg["basedir_coldata"] = outdir if obs_name: obs_cfg = self.cfg.get_obs_entry(obs_name) pyaro_config = obs_cfg["obs_config"] if "obs_config" in obs_cfg else None - col_cfg = {**self.cfg.colocation_opts.model_dump()} col_cfg["obs_config"] = pyaro_config col_stp = ColocationSetup(**col_cfg) col = Colocator(col_stp) - col.import_from(obs_cfg) - col.add_glob_meta(diurnal_only=self._get_diurnal_only(obs_name)) + # col.import_from(obs_cfg) # LB: Not sure if needed or works anymore original from lowlevel_helpers.py + col.colocation_setup.add_glob_meta(diurnal_only=self._get_diurnal_only(obs_name)) else: - col = Colocator(self.cfg.colocation_opts) + col_stp = ColocationSetup(**col_cfg) + col = Colocator(col_stp) if model_name: mod_cfg = self.cfg.get_model_entry(model_name) - col.import_from(mod_cfg) - outdir = self.cfg.path_manager.get_coldata_dir() - col.basedir_coldata = outdir + col_cfg["model_cfg"] = mod_cfg # LB: this is untested and just a guess at this point + col_stp = ColocationSetup(**col_cfg) + col = Colocator(col_stp) + # col.import_from(mod_cfg) # LB: also not sure if needed or works anymore return col diff --git a/pyaerocom/aeroval/setupclasses.py b/pyaerocom/aeroval/setupclasses.py index 2b1dbc891..a7f18959d 100644 --- a/pyaerocom/aeroval/setupclasses.py +++ b/pyaerocom/aeroval/setupclasses.py @@ -571,8 +571,8 @@ def _import_aux_funs(self) -> None: def _check_time_config(self) -> None: periods 
= self.time_cfg.periods - colstart = self.colocation_opts["start"] - colstop = self.colocation_opts["stop"] + colstart = self.colocation_opts.start + colstop = self.colocation_opts.stop if len(periods) == 0: if colstart is None: @@ -597,15 +597,15 @@ def _check_time_config(self) -> None: if stop_yr == start_yr: stop_yr += 1 if colstart is None: - self.colocation_opts["start"] = start.strftime("%Y/%m/%d %H:%M:%S") + self.colocation_opts.start = start.strftime("%Y/%m/%d %H:%M:%S") if colstop is None: - self.colocation_opts["stop"] = stop.strftime( + self.colocation_opts.stop = stop.strftime( "%Y/%m/%d %H:%M:%S" ) # + 1 # add 1 year since we want to include stop year else: if colstart is None: - self.colocation_opts["start"] = start_yr + self.colocation_opts.start = start_yr if colstop is None: - self.colocation_opts["stop"] = ( + self.colocation_opts.stop = ( stop_yr + 1 ) # add 1 year since we want to include stop year diff --git a/pyaerocom/colocation_auto.py b/pyaerocom/colocation_auto.py index 7410e3112..1b60b76c9 100644 --- a/pyaerocom/colocation_auto.py +++ b/pyaerocom/colocation_auto.py @@ -866,8 +866,8 @@ def prepare_run(self, var_list: list = None) -> dict: # LB: SHould be covered by ColocationSetup validator # if isinstance(self.colocation_setup.obs_vars, str): # self.colocation_setup.obs_vars = [self.colocation_setup.obs_vars] - if not isinstance(self.colocation_setup.obs_vars, list): - raise AttributeError("obs_vars not defined or invalid, need list with strings...") + # if not isinstance(self.colocation_setup.obs_vars, list): + # raise AttributeError("obs_vars not defined or invalid, need list with strings...") self._check_obs_vars_available() self._check_obs_filters() self._check_model_add_vars() diff --git a/pyaerocom/colocation_setup.py b/pyaerocom/colocation_setup.py index d19ff2c9a..466eb12f1 100644 --- a/pyaerocom/colocation_setup.py +++ b/pyaerocom/colocation_setup.py @@ -287,7 +287,7 @@ class ColocationSetup(BaseModel): 
arbitrary_types_allowed=True, allow="extra", protected_namespaces=(), - frozen=True, # make immutable + # frozen=True, # make immutable # validate_assignment=True, # property_set_methods = {"obs_config": "set_obs_config"} ) @@ -415,7 +415,7 @@ def validate_basedirs(cls, v): flex_ts_type: bool = True # Options related to time resampling - min_num_obs: int | None = None + min_num_obs: dict | int | None = None resample_how: str | dict | None = "mean" # Options related to outlier removal diff --git a/tests/aeroval/test__processing_base.py b/tests/aeroval/test__processing_base.py index 5665f1be2..e67ede8de 100644 --- a/tests/aeroval/test__processing_base.py +++ b/tests/aeroval/test__processing_base.py @@ -31,11 +31,11 @@ def test_HasConfig_setup(config: HasConfig): def test_HasConfig_raise_exceptions(config: HasConfig): - assert config.raise_exceptions == False + assert not config.raise_exceptions def test_HasConfig_reanalyse_existing(config: HasConfig): - assert config.reanalyse_existing == True + assert config.reanalyse_existing @pytest.fixture(scope="module") @@ -45,8 +45,8 @@ def collocator(setup: EvalSetup) -> HasColocator: def test_HasColocator_get_diurnal_only(collocator: HasColocator): - assert collocator._get_diurnal_only("obs1") == False - assert collocator._get_diurnal_only("obs2") == True + assert not collocator._get_diurnal_only("obs1") + assert collocator._get_diurnal_only("obs2") @pytest.mark.parametrize("obs_name", [None, "obs1", "obs2"]) diff --git a/tests/aeroval/test_setupclasses.py b/tests/aeroval/test_setupclasses.py index c04ad525d..724d6ef33 100644 --- a/tests/aeroval/test_setupclasses.py +++ b/tests/aeroval/test_setupclasses.py @@ -129,8 +129,8 @@ def test_EvalSetup_TimeSetup(eval_setup: EvalSetup, cfg_exp1: dict): ) def test_EvalSetup__check_time_config(eval_setup: EvalSetup, start, stop): eval_setup._check_time_config() - assert str(eval_setup.colocation_opts["start"]) == start - assert str(eval_setup.colocation_opts["stop"]) == stop + assert 
str(eval_setup.colocation_opts.start) == start + assert str(eval_setup.colocation_opts.stop) == stop @pytest.mark.parametrize( From 999247466144c92a05653e24a8d173b06a67e8d5 Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Wed, 29 May 2024 13:39:13 +0200 Subject: [PATCH 25/44] WIP --- pyaerocom/aeroval/experiment_processor.py | 2 +- pyaerocom/colocation_auto.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pyaerocom/aeroval/experiment_processor.py b/pyaerocom/aeroval/experiment_processor.py index 8461328af..f0599d866 100644 --- a/pyaerocom/aeroval/experiment_processor.py +++ b/pyaerocom/aeroval/experiment_processor.py @@ -56,7 +56,7 @@ def _run_single_entry(self, model_name, obs_name, var_list): elif ocfg["only_json"]: if not ocfg["coldata_dir"]: raise Exception( - f"No coldata_dir provided for an obs network for which only_json=True. The assumption of setting only_json=True is that colocated files already exist, and so a directory for these files must be provided." + "No coldata_dir provided for an obs network for which only_json=True. The assumption of setting only_json=True is that colocated files already exist, and so a directory for these files must be provided." 
) else: preprocessed_coldata_dir = ocfg["coldata_dir"] diff --git a/pyaerocom/colocation_auto.py b/pyaerocom/colocation_auto.py index 1b60b76c9..305b070b4 100644 --- a/pyaerocom/colocation_auto.py +++ b/pyaerocom/colocation_auto.py @@ -1173,8 +1173,9 @@ def _check_add_model_read_aux(self, model_var): return True def _check_obs_vars_available(self): - if not len(self.colocation_setup.obs_vars) > 0: - raise ColocationSetupError("no observation variables specified...") + # LB: This is what I would like but not sure if it will work with current setup + # if not len(self.colocation_setup.obs_vars) > 0: + # raise ColocationSetupError("no observation variables specified...") oreader = self.obs_reader if self.obs_is_ungridded: avail = oreader.get_vars_supported( From ba38e9253e7aa9b4dc4b789e915993f6c6cdc1eb Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Wed, 29 May 2024 16:49:28 +0200 Subject: [PATCH 26/44] keep checker ing Colocator.run() --- pyaerocom/colocation_auto.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pyaerocom/colocation_auto.py b/pyaerocom/colocation_auto.py index 305b070b4..75dc55663 100644 --- a/pyaerocom/colocation_auto.py +++ b/pyaerocom/colocation_auto.py @@ -866,8 +866,10 @@ def prepare_run(self, var_list: list = None) -> dict: # LB: SHould be covered by ColocationSetup validator # if isinstance(self.colocation_setup.obs_vars, str): # self.colocation_setup.obs_vars = [self.colocation_setup.obs_vars] - # if not isinstance(self.colocation_setup.obs_vars, list): - # raise AttributeError("obs_vars not defined or invalid, need list with strings...") + + # LB: obs_vars should be defined by here + if not isinstance(self.colocation_setup.obs_vars, list): + raise AttributeError("obs_vars not defined or invalid, need list with strings...") self._check_obs_vars_available() self._check_obs_filters() self._check_model_add_vars() From 552ebb890d50982d2821af51db1405ed65b950b2 Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Wed, 29 May 
2024 18:50:00 +0200 Subject: [PATCH 27/44] obs_vars must be a tuple --- pyaerocom/colocation_auto.py | 4 ++-- pyaerocom/colocation_setup.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyaerocom/colocation_auto.py b/pyaerocom/colocation_auto.py index 75dc55663..38c041d16 100644 --- a/pyaerocom/colocation_auto.py +++ b/pyaerocom/colocation_auto.py @@ -868,8 +868,8 @@ def prepare_run(self, var_list: list = None) -> dict: # self.colocation_setup.obs_vars = [self.colocation_setup.obs_vars] # LB: obs_vars should be defined by here - if not isinstance(self.colocation_setup.obs_vars, list): - raise AttributeError("obs_vars not defined or invalid, need list with strings...") + if not isinstance(self.colocation_setup.obs_vars, tuple): + raise AttributeError("obs_vars not defined or invalid, need tuple with strings...") self._check_obs_vars_available() self._check_obs_filters() self._check_model_add_vars() diff --git a/pyaerocom/colocation_setup.py b/pyaerocom/colocation_setup.py index 466eb12f1..7968bea44 100644 --- a/pyaerocom/colocation_setup.py +++ b/pyaerocom/colocation_setup.py @@ -308,7 +308,7 @@ class ColocationSetup(BaseModel): # LB: remains to be seen if this can actually be required without chaning the code elsewhere model_id: str | None # = None obs_id: str | None # = None - obs_vars: list[str] | str | None # = None + obs_vars: tuple[str, ...] | str # = None @field_validator("obs_vars") @classmethod @@ -443,7 +443,7 @@ def __init__( model_id: str | None = None, obs_config: PyaroConfig | None = None, obs_id: str | None = None, - obs_vars: list[str] | None = None, + obs_vars: tuple[str, ...] 
| None = (), ts_type: str = "monthly", start: pd.Timestamp | int | None = None, stop: pd.Timestamp | int | None = None, From abbb98d9edfb89e5af12e8a9a6a753e1fee1ca11 Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Wed, 29 May 2024 19:21:48 +0200 Subject: [PATCH 28/44] update configs to use tuples for obs_vars --- tests/fixtures/aeroval/cfg_test_exp1.py | 2 +- tests/fixtures/aeroval/cfg_test_exp2.py | 2 +- tests/fixtures/aeroval/cfg_test_exp3.py | 2 +- tests/fixtures/aeroval/cfg_test_exp4.py | 6 +++--- tests/fixtures/aeroval/cfg_test_exp5.py | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/fixtures/aeroval/cfg_test_exp1.py b/tests/fixtures/aeroval/cfg_test_exp1.py index dae633458..c7b34ed79 100644 --- a/tests/fixtures/aeroval/cfg_test_exp1.py +++ b/tests/fixtures/aeroval/cfg_test_exp1.py @@ -4,7 +4,7 @@ OBS_GROUNDBASED = { "AERONET-Sun": dict( - obs_id="AeronetSunV3L2Subset.daily", obs_vars=["od550aer"], obs_vert_type="Column" + obs_id="AeronetSunV3L2Subset.daily", obs_vars=("od550aer",), obs_vert_type="Column" ) } diff --git a/tests/fixtures/aeroval/cfg_test_exp2.py b/tests/fixtures/aeroval/cfg_test_exp2.py index 17c6129e0..6f94560d3 100644 --- a/tests/fixtures/aeroval/cfg_test_exp2.py +++ b/tests/fixtures/aeroval/cfg_test_exp2.py @@ -12,7 +12,7 @@ ODCSFUN = "AeronetSDAV3L2Subset.daily;od550lt1aer+AeronetSDAV3L2Subset.daily;od550gt1aer" OBS_GROUNDBASED = { "AERONET-Sun": dict( - obs_id="AeronetSunV3L2Subset.daily", obs_vars=["od550aer"], obs_vert_type="Column" + obs_id="AeronetSunV3L2Subset.daily", obs_vars=("od550aer",), obs_vert_type="Column" ), "AERONET-SDA": dict( obs_id="AERONET-SDA", diff --git a/tests/fixtures/aeroval/cfg_test_exp3.py b/tests/fixtures/aeroval/cfg_test_exp3.py index fe3f15618..5ac642753 100644 --- a/tests/fixtures/aeroval/cfg_test_exp3.py +++ b/tests/fixtures/aeroval/cfg_test_exp3.py @@ -38,7 +38,7 @@ def fake_model_data(tmp_path: str | Path) -> dict: ) -OBS_GROUNDBASED = {"EBAS": dict(obs_id="EBASSubset", 
obs_vars=["vmro3"], obs_vert_type="Surface")} +OBS_GROUNDBASED = {"EBAS": dict(obs_id="EBASSubset", obs_vars=("vmro3",), obs_vert_type="Surface")} CFG = dict( model_cfg=dict(), # fake_model_data("PATH_TO_MODEL_DATA"), diff --git a/tests/fixtures/aeroval/cfg_test_exp4.py b/tests/fixtures/aeroval/cfg_test_exp4.py index 78e4b761f..4e515186c 100644 --- a/tests/fixtures/aeroval/cfg_test_exp4.py +++ b/tests/fixtures/aeroval/cfg_test_exp4.py @@ -9,20 +9,20 @@ OBS_GROUNDBASED = { "AERONET-Sun": dict( obs_id="AeronetSunV3L2Subset.daily", - obs_vars=["od550aer"], + obs_vars=("od550aer",), only_superobs=True, obs_vert_type="Column", ), "AERONET-SDA": dict( obs_id="AeronetSDAV3L2Subset.daily", - obs_vars=["od550aer"], + obs_vars=("od550aer",), only_superobs=True, obs_vert_type="Column", ), "SDA-and-Sun": dict( is_superobs=True, obs_id=("AERONET-Sun", "AERONET-SDA"), - obs_vars=["od550aer"], + obs_vars=("od550aer",), obs_vert_type="Column", ), } diff --git a/tests/fixtures/aeroval/cfg_test_exp5.py b/tests/fixtures/aeroval/cfg_test_exp5.py index f9773f914..aa5d69fe0 100644 --- a/tests/fixtures/aeroval/cfg_test_exp5.py +++ b/tests/fixtures/aeroval/cfg_test_exp5.py @@ -52,7 +52,7 @@ def fake_obs_data(tmp_path: str | Path) -> dict: return dict( DUMMY=dict( obs_id="DUMMY-OBS", - obs_vars=["prmm"], + obs_vars=("prmm",), obs_data_dir=obs_data_dir, obs_vert_type="Surface", ) From bc6b0ab0c4c98c7dc385c68913a9e2e969f8e924 Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Wed, 29 May 2024 19:30:16 +0200 Subject: [PATCH 29/44] WIP: hacked obs_cfg keys into col.colocation_setup --- pyaerocom/aeroval/_processing_base.py | 8 +++++++- pyaerocom/aeroval/experiment_processor.py | 2 +- pyaerocom/helpers.py | 11 +++++++---- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/pyaerocom/aeroval/_processing_base.py b/pyaerocom/aeroval/_processing_base.py index 0bbd5310c..caf598ee0 100644 --- a/pyaerocom/aeroval/_processing_base.py +++ b/pyaerocom/aeroval/_processing_base.py @@ -122,9 
+122,15 @@ def get_colocator(self, model_name: str = None, obs_name: str = None) -> Colocat obs_cfg = self.cfg.get_obs_entry(obs_name) pyaro_config = obs_cfg["obs_config"] if "obs_config" in obs_cfg else None col_cfg["obs_config"] = pyaro_config + + # LB: Hack and at what lowlevel_helpers's import_from was doing + for key, val in obs_cfg.items(): + if key in ColocationSetup.model_fields: + col_cfg[key] = val + col_stp = ColocationSetup(**col_cfg) col = Colocator(col_stp) - # col.import_from(obs_cfg) # LB: Not sure if needed or works anymore original from lowlevel_helpers.py + # col.import_from(obs_cfg) # LB: This is functionality might be needed. Want to get keys from the obs_cfg into ColocationSetup. col.colocation_setup.add_glob_meta(diurnal_only=self._get_diurnal_only(obs_name)) else: col_stp = ColocationSetup(**col_cfg) diff --git a/pyaerocom/aeroval/experiment_processor.py b/pyaerocom/aeroval/experiment_processor.py index f0599d866..f19e6d474 100644 --- a/pyaerocom/aeroval/experiment_processor.py +++ b/pyaerocom/aeroval/experiment_processor.py @@ -114,7 +114,7 @@ def run(self, model_name=None, obs_name=None, var_list=None, update_interface=Tr to json files. 
""" if isinstance(var_list, str): - var_list = [var_list] + var_list = (var_list,) self.cfg._check_time_config() diff --git a/pyaerocom/helpers.py b/pyaerocom/helpers.py index b06fa38e1..acccd374b 100644 --- a/pyaerocom/helpers.py +++ b/pyaerocom/helpers.py @@ -64,14 +64,17 @@ def varlist_aerocom(varlist): if isinstance(varlist, str): - varlist = [varlist] - elif not isinstance(varlist, list): - raise ValueError("Need string or list") + # varlist = [varlist] + varlist = (varlist,) + # elif not isinstance(varlist, list): + # raise ValueError("Need string or list") + elif not isinstance(varlist, tuple): + raise ValueError("Need string or tuple") output = [] for var in varlist: try: _var = const.VARS[var].var_name_aerocom - if not _var in output: + if _var not in output: output.append(_var) except VariableDefinitionError as e: logger.warning(repr(e)) From 011987524fd9eb8f96dd37087473aa2ae37b98e8 Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Thu, 30 May 2024 14:02:14 +0200 Subject: [PATCH 30/44] WIP --- pyaerocom/aeroval/_processing_base.py | 4 +++- pyaerocom/colocation_setup.py | 2 +- pyaerocom/helpers.py | 2 +- tests/test_colocation_auto.py | 4 ++-- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/pyaerocom/aeroval/_processing_base.py b/pyaerocom/aeroval/_processing_base.py index caf598ee0..4379b59c3 100644 --- a/pyaerocom/aeroval/_processing_base.py +++ b/pyaerocom/aeroval/_processing_base.py @@ -115,7 +115,9 @@ def get_colocator(self, model_name: str = None, obs_name: str = None) -> Colocat """ # LB: In general I don't like what this function is doing. Ideally define the Colocator object once and just use that. 
- col_cfg = {**self.cfg.colocation_opts.model_dump()} + col_cfg = { + **self.cfg.colocation_opts.model_dump() + } # LB: obs_vars is a list, should be a tuple outdir = self.cfg.path_manager.get_coldata_dir() col_cfg["basedir_coldata"] = outdir if obs_name: diff --git a/pyaerocom/colocation_setup.py b/pyaerocom/colocation_setup.py index 7968bea44..9266689b8 100644 --- a/pyaerocom/colocation_setup.py +++ b/pyaerocom/colocation_setup.py @@ -397,7 +397,7 @@ def validate_basedirs(cls, v): model_use_vars: dict[str, str] | None = {} model_rename_vars: dict[str, str] | None = {} - model_add_vars: dict[str, list] | None = {} + model_add_vars: dict[str, str | tuple[str, ...]] | None = {} # LB: WIP / guess model_to_stp: bool = False model_ts_type_read: str | dict | None = None diff --git a/pyaerocom/helpers.py b/pyaerocom/helpers.py index acccd374b..008ebc671 100644 --- a/pyaerocom/helpers.py +++ b/pyaerocom/helpers.py @@ -80,7 +80,7 @@ def varlist_aerocom(varlist): logger.warning(repr(e)) if len(output) == 0: raise ValueError("None of the input variables appears to be valid") - return output + return tuple(output) def delete_all_coords_cube(cube, inplace=True): diff --git a/tests/test_colocation_auto.py b/tests/test_colocation_auto.py index fc8ca110c..7d5bfd1cb 100644 --- a/tests/test_colocation_auto.py +++ b/tests/test_colocation_auto.py @@ -341,8 +341,8 @@ def test_colocator__find_var_matches_model_add_vars(setup): ovar = "od550aer" setup["model_id"] = "TM5-met2010_CTRL-TEST" setup["obs_id"] = "AeronetSunV3L2Subset.daily" - setup["obs_vars"] = [ovar] - setup["model_add_vars"] = {ovar: ["abs550aer"]} + setup["obs_vars"] = (ovar,) + setup["model_add_vars"] = {ovar: ("abs550aer",)} col_stp = ColocationSetup(**setup) col = Colocator(col_stp) var_matches = col._find_var_matches() From 16b8b93953028e2fb677b4c953893c19151d8ad2 Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Thu, 30 May 2024 15:30:09 +0200 Subject: [PATCH 31/44] do same for model cfg as obs_cfg in get_colocator 
--- pyaerocom/aeroval/_processing_base.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/pyaerocom/aeroval/_processing_base.py b/pyaerocom/aeroval/_processing_base.py index 4379b59c3..1407083ae 100644 --- a/pyaerocom/aeroval/_processing_base.py +++ b/pyaerocom/aeroval/_processing_base.py @@ -134,15 +134,22 @@ def get_colocator(self, model_name: str = None, obs_name: str = None) -> Colocat col = Colocator(col_stp) # col.import_from(obs_cfg) # LB: This is functionality might be needed. Want to get keys from the obs_cfg into ColocationSetup. col.colocation_setup.add_glob_meta(diurnal_only=self._get_diurnal_only(obs_name)) - else: - col_stp = ColocationSetup(**col_cfg) - col = Colocator(col_stp) - if model_name: + elif model_name: mod_cfg = self.cfg.get_model_entry(model_name) col_cfg["model_cfg"] = mod_cfg # LB: this is untested and just a guess at this point + + # LB: Hack and at what lowlevel_helpers's import_from was doing + for key, val in mod_cfg.items(): + if key in ColocationSetup.model_fields: + col_cfg[key] = val + col_stp = ColocationSetup(**col_cfg) col = Colocator(col_stp) # col.import_from(mod_cfg) # LB: also not sure if needed or works anymore + else: + col_stp = ColocationSetup(**col_cfg) + col = Colocator(col_stp) + return col From 95e1394252fbcbe5f8cd382a54c89352a7e54951 Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Thu, 30 May 2024 15:50:54 +0200 Subject: [PATCH 32/44] _init_log: self.colocation_setup.basedir_coldata --- pyaerocom/colocation_auto.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyaerocom/colocation_auto.py b/pyaerocom/colocation_auto.py index 38c041d16..23280b1f2 100644 --- a/pyaerocom/colocation_auto.py +++ b/pyaerocom/colocation_auto.py @@ -1644,7 +1644,7 @@ def _print_coloc_info(self, var_matches): logger.info(f"{key}\t{val}") def _init_log(self): - logdir = chk_make_subdir(self.basedir_logfiles, self.get_model_name()) + logdir = 
chk_make_subdir(self.colocation_setup.basedir_coldata, self.get_model_name()) oname = self.get_obs_name() datestr = datetime.today().strftime("%Y%m%d") datetimestr = datetime.today().strftime("%d-%m-%Y %H:%M") From 66fc25e42bb3bf626169d2c133706c006d8a0670 Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Thu, 30 May 2024 17:45:34 +0200 Subject: [PATCH 33/44] almost all aeroval tests passing --- pyaerocom/aeroval/_processing_base.py | 37 +++++++++++++++------------ pyaerocom/colocation_auto.py | 16 +++++++----- pyaerocom/colocation_setup.py | 2 +- pyaerocom/helpers.py | 9 +++---- 4 files changed, 33 insertions(+), 31 deletions(-) diff --git a/pyaerocom/aeroval/_processing_base.py b/pyaerocom/aeroval/_processing_base.py index 1407083ae..dbee7b04e 100644 --- a/pyaerocom/aeroval/_processing_base.py +++ b/pyaerocom/aeroval/_processing_base.py @@ -120,6 +120,19 @@ def get_colocator(self, model_name: str = None, obs_name: str = None) -> Colocat } # LB: obs_vars is a list, should be a tuple outdir = self.cfg.path_manager.get_coldata_dir() col_cfg["basedir_coldata"] = outdir + + if not model_name and obs_name: + col_stp = ColocationSetup(**col_cfg) + return Colocator(col_stp) + + if model_name: + mod_cfg = self.cfg.get_model_entry(model_name) + col_cfg["model_cfg"] = mod_cfg # LB: this is untested and just a guess at this point + + # LB: Hack and at what lowlevel_helpers's import_from was doing + for key, val in mod_cfg.items(): + if key in ColocationSetup.model_fields: + col_cfg[key] = val if obs_name: obs_cfg = self.cfg.get_obs_entry(obs_name) pyaro_config = obs_cfg["obs_config"] if "obs_config" in obs_cfg else None @@ -130,25 +143,15 @@ def get_colocator(self, model_name: str = None, obs_name: str = None) -> Colocat if key in ColocationSetup.model_fields: col_cfg[key] = val - col_stp = ColocationSetup(**col_cfg) - col = Colocator(col_stp) + # col_stp = ColocationSetup(**col_cfg) + # col = Colocator(col_stp) + # col_cfg.add_meta # col.import_from(obs_cfg) # LB: This is 
functionality might be needed. Want to get keys from the obs_cfg into ColocationSetup. - col.colocation_setup.add_glob_meta(diurnal_only=self._get_diurnal_only(obs_name)) - elif model_name: - mod_cfg = self.cfg.get_model_entry(model_name) - col_cfg["model_cfg"] = mod_cfg # LB: this is untested and just a guess at this point - - # LB: Hack and at what lowlevel_helpers's import_from was doing - for key, val in mod_cfg.items(): - if key in ColocationSetup.model_fields: - col_cfg[key] = val + # col.colocation_setup.add_glob_meta(diurnal_only=self._get_diurnal_only(obs_name)) + col_cfg["add_meta"] = dict(diurnal_only=self._get_diurnal_only(obs_name)) - col_stp = ColocationSetup(**col_cfg) - col = Colocator(col_stp) - # col.import_from(mod_cfg) # LB: also not sure if needed or works anymore - else: - col_stp = ColocationSetup(**col_cfg) - col = Colocator(col_stp) + col_stp = ColocationSetup(**col_cfg) + col = Colocator(col_stp) return col diff --git a/pyaerocom/colocation_auto.py b/pyaerocom/colocation_auto.py index 23280b1f2..06a2bb2e3 100644 --- a/pyaerocom/colocation_auto.py +++ b/pyaerocom/colocation_auto.py @@ -642,7 +642,7 @@ def model_vars(self): list of all model variables specified in this setup. 
""" - ovars = self.self.colocation_setup.obs_vars + ovars = self.colocation_setup.obs_vars model_vars = [] for ovar in ovars: if ovar in self.colocation_setup.model_add_vars: @@ -738,8 +738,8 @@ def output_dir(self): """ str: Output directory for colocated data NetCDF files """ - self._check_basedir_coldata() - loc = os.path.join(self.basedir_coldata, self.get_model_name()) + # self._check_basedir_coldata() # LB: don't need as force construction in ColocationSetup + loc = os.path.join(self.colocation_setup.basedir_coldata, self.get_model_name()) if not os.path.exists(loc): logger.info(f"Creating dir {loc}") os.mkdir(loc) @@ -972,7 +972,9 @@ def check_meta_match(meta, **kwargs): mname = self.get_model_name() oname = self.get_obs_name() model_vars = self.model_vars - obs_vars = self.obs_vars + # LB: Not entirely sure. There may be a deeper problem here. + # obs_vars = self.obs_vars + obs_vars = self.colocation_setup.obs_vars start, stop = self.get_start_str(), self.get_stop_str() valid = [] all_files = self.get_nc_files_in_coldatadir() @@ -1399,13 +1401,13 @@ def _eval_obs_filters(self, var_name): def _save_coldata(self, coldata): """Helper for saving colocateddata""" obs_var, mod_var = coldata.metadata["var_name_input"] - if mod_var in self.model_rename_vars: - mvar = self.model_rename_vars[mod_var] + if mod_var in self.colocation_setup.model_rename_vars: + mvar = self.colocation_setup.model_rename_vars[mod_var] logger.info( f"Renaming model variable from {mod_var} to {mvar} in " f"ColocatedData before saving to NetCDF." 
) - coldata.rename_variable(mod_var, mvar, self.model_id) + coldata.rename_variable(mod_var, mvar, self.colocation_setup.model_id) else: mvar = mod_var diff --git a/pyaerocom/colocation_setup.py b/pyaerocom/colocation_setup.py index 9266689b8..cdbd72708 100644 --- a/pyaerocom/colocation_setup.py +++ b/pyaerocom/colocation_setup.py @@ -397,7 +397,7 @@ def validate_basedirs(cls, v): model_use_vars: dict[str, str] | None = {} model_rename_vars: dict[str, str] | None = {} - model_add_vars: dict[str, str | tuple[str, ...]] | None = {} # LB: WIP / guess + model_add_vars: dict[str, tuple[str, ...]] | None = {} # LB: WIP / guess model_to_stp: bool = False model_ts_type_read: str | dict | None = None diff --git a/pyaerocom/helpers.py b/pyaerocom/helpers.py index 008ebc671..adeef86c7 100644 --- a/pyaerocom/helpers.py +++ b/pyaerocom/helpers.py @@ -64,12 +64,9 @@ def varlist_aerocom(varlist): if isinstance(varlist, str): - # varlist = [varlist] - varlist = (varlist,) - # elif not isinstance(varlist, list): - # raise ValueError("Need string or list") - elif not isinstance(varlist, tuple): - raise ValueError("Need string or tuple") + varlist = [varlist] + elif not (isinstance(varlist, list) or isinstance(varlist, tuple)): + raise ValueError("Need string or list or tuple") output = [] for var in varlist: try: From eff7244aee319ecdcdd6c2017be763e7b44836d1 Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Fri, 31 May 2024 10:10:11 +0200 Subject: [PATCH 34/44] maybe need to update add_meta dict --- pyaerocom/aeroval/_processing_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyaerocom/aeroval/_processing_base.py b/pyaerocom/aeroval/_processing_base.py index dbee7b04e..7ba16d155 100644 --- a/pyaerocom/aeroval/_processing_base.py +++ b/pyaerocom/aeroval/_processing_base.py @@ -148,7 +148,7 @@ def get_colocator(self, model_name: str = None, obs_name: str = None) -> Colocat # col_cfg.add_meta # col.import_from(obs_cfg) # LB: This is functionality might be 
needed. Want to get keys from the obs_cfg into ColocationSetup. # col.colocation_setup.add_glob_meta(diurnal_only=self._get_diurnal_only(obs_name)) - col_cfg["add_meta"] = dict(diurnal_only=self._get_diurnal_only(obs_name)) + col_cfg["add_meta"].update(diurnal_only=self._get_diurnal_only(obs_name)) col_stp = ColocationSetup(**col_cfg) col = Colocator(col_stp) From 9168ae7e6fb1b5cd3181fbcd6b53f0eed32007ae Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Fri, 31 May 2024 11:50:01 +0200 Subject: [PATCH 35/44] cams2_83 tests passing --- pyaerocom/aeroval/_processing_base.py | 2 +- pyaerocom/colocation_auto.py | 4 ++-- pyaerocom/colocation_setup.py | 12 ++++++++++-- pyaerocom/scripts/cams2_83/processer.py | 6 +++--- 4 files changed, 16 insertions(+), 8 deletions(-) diff --git a/pyaerocom/aeroval/_processing_base.py b/pyaerocom/aeroval/_processing_base.py index 7ba16d155..a22570d54 100644 --- a/pyaerocom/aeroval/_processing_base.py +++ b/pyaerocom/aeroval/_processing_base.py @@ -121,7 +121,7 @@ def get_colocator(self, model_name: str = None, obs_name: str = None) -> Colocat outdir = self.cfg.path_manager.get_coldata_dir() col_cfg["basedir_coldata"] = outdir - if not model_name and obs_name: + if not model_name and not obs_name: col_stp = ColocationSetup(**col_cfg) return Colocator(col_stp) diff --git a/pyaerocom/colocation_auto.py b/pyaerocom/colocation_auto.py index 06a2bb2e3..dc6d5619c 100644 --- a/pyaerocom/colocation_auto.py +++ b/pyaerocom/colocation_auto.py @@ -1118,8 +1118,8 @@ def _filter_var_matches_files_not_exist(self, var_matches, ts_types): def _check_model_add_vars(self): for ovar, mvars in self.colocation_setup.model_add_vars.items(): - if not isinstance(mvars, list): - raise ValueError("Values of model_add_vars need to be list") + if not isinstance(mvars, (list, tuple)): + raise ValueError("Values of model_add_vars need to be list or tuple") elif not all([isinstance(x, str) for x in mvars]): raise ValueError("Values of model_add_vars need to be list of 
strings") diff --git a/pyaerocom/colocation_setup.py b/pyaerocom/colocation_setup.py index cdbd72708..d78f3ee6d 100644 --- a/pyaerocom/colocation_setup.py +++ b/pyaerocom/colocation_setup.py @@ -318,8 +318,16 @@ def validate_obs_vars(cls, v): return v ts_type: str # = None - start: pd.Timestamp | int | None # = None - stop: pd.Timestamp | int | None # = None + start: pd.Timestamp | int | str | None # = None + stop: pd.Timestamp | int | str | None # = None + + @field_validator("start", "stop") + @classmethod + def validate_basedirs(cls, v): + if isinstance(v, int): + return v + if isinstance(v, str): + return pd.Timestamp(v) obs_config: PyaroConfig | None = None diff --git a/pyaerocom/scripts/cams2_83/processer.py b/pyaerocom/scripts/cams2_83/processer.py index 75da9e42f..e54765d39 100755 --- a/pyaerocom/scripts/cams2_83/processer.py +++ b/pyaerocom/scripts/cams2_83/processer.py @@ -22,11 +22,11 @@ def _run_single_entry(self, model_name, obs_name, var_list, analysis=False): files_to_convert = [] for leap in range(forecast_days): runtype = "AN" if analysis else "FC" - model = col.model_id.split(".")[1] + model = col.colocation_setup.model_id.split(".")[1] model_id = f"CAMS2-83.{model}.day{leap}.{runtype}" model_name = f"CAMS2-83-{model}-day{leap}-{runtype}" - col.model_id = model_id - col.model_name = model_name + col.colocation_setup.model_id = model_id + col.colocation_setup.model_name = model_name col.run(var_list) files_to_convert = col.files_written From f7bf7007d6d65ee9a384a6a2441b5748365c7aa7 Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Fri, 31 May 2024 11:53:06 +0200 Subject: [PATCH 36/44] don't force vars_to_retrieve and varlist_aerocom to be tuples --- pyaerocom/aeroval/experiment_processor.py | 2 +- pyaerocom/helpers.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyaerocom/aeroval/experiment_processor.py b/pyaerocom/aeroval/experiment_processor.py index f19e6d474..f0599d866 100644 --- a/pyaerocom/aeroval/experiment_processor.py 
+++ b/pyaerocom/aeroval/experiment_processor.py @@ -114,7 +114,7 @@ def run(self, model_name=None, obs_name=None, var_list=None, update_interface=Tr to json files. """ if isinstance(var_list, str): - var_list = (var_list,) + var_list = [var_list] self.cfg._check_time_config() diff --git a/pyaerocom/helpers.py b/pyaerocom/helpers.py index adeef86c7..4a212b536 100644 --- a/pyaerocom/helpers.py +++ b/pyaerocom/helpers.py @@ -77,7 +77,7 @@ def varlist_aerocom(varlist): logger.warning(repr(e)) if len(output) == 0: raise ValueError("None of the input variables appears to be valid") - return tuple(output) + return output def delete_all_coords_cube(cube, inplace=True): From ee1c6483103d073fe52c701583cfd507920361f4 Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Fri, 31 May 2024 12:46:39 +0200 Subject: [PATCH 37/44] fixing up the last of tests --- pyaerocom/colocation_auto.py | 14 +++++++------- tests/test_colocation_auto.py | 2 +- tests/test_colocation_setup.py | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pyaerocom/colocation_auto.py b/pyaerocom/colocation_auto.py index dc6d5619c..f6bd6d7f2 100644 --- a/pyaerocom/colocation_auto.py +++ b/pyaerocom/colocation_auto.py @@ -864,12 +864,12 @@ def prepare_run(self, var_list: list = None) -> dict: self.logging = False # LB: SHould be covered by ColocationSetup validator - # if isinstance(self.colocation_setup.obs_vars, str): - # self.colocation_setup.obs_vars = [self.colocation_setup.obs_vars] + if isinstance(self.colocation_setup.obs_vars, str): + self.colocation_setup.obs_vars = (self.colocation_setup.obs_vars,) - # LB: obs_vars should be defined by here - if not isinstance(self.colocation_setup.obs_vars, tuple): - raise AttributeError("obs_vars not defined or invalid, need tuple with strings...") + # # LB: obs_vars should be defined by here + # if not isinstance(self.colocation_setup.obs_vars, tuple): + # raise AttributeError("obs_vars not defined or invalid, need tuple with strings...") 
self._check_obs_vars_available() self._check_obs_filters() self._check_model_add_vars() @@ -1178,8 +1178,8 @@ def _check_add_model_read_aux(self, model_var): def _check_obs_vars_available(self): # LB: This is what I would like but not sure if it will work with current setup - # if not len(self.colocation_setup.obs_vars) > 0: - # raise ColocationSetupError("no observation variables specified...") + if not len(self.colocation_setup.obs_vars) > 0: + raise ColocationSetupError("no observation variables specified...") oreader = self.obs_reader if self.obs_is_ungridded: avail = oreader.get_vars_supported( diff --git a/tests/test_colocation_auto.py b/tests/test_colocation_auto.py index 7d5bfd1cb..ef335afc1 100644 --- a/tests/test_colocation_auto.py +++ b/tests/test_colocation_auto.py @@ -18,7 +18,7 @@ default_setup = { "model_id": None, "obs_id": None, - "obs_vars": None, + "obs_vars": (), "ts_type": "monthly", "start": None, "stop": None, diff --git a/tests/test_colocation_setup.py b/tests/test_colocation_setup.py index 41cdfa93a..2100cff45 100644 --- a/tests/test_colocation_setup.py +++ b/tests/test_colocation_setup.py @@ -11,7 +11,7 @@ default_setup = { "model_id": None, "obs_id": None, - "obs_vars": None, + "obs_vars": (), # "obs_config": None, "ts_type": "monthly", "start": None, From 5f47157d311ceb12780ab626905c5ff723c4a2bc Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Fri, 31 May 2024 12:55:26 +0200 Subject: [PATCH 38/44] clean up --- pyaerocom/colocation_auto.py | 570 +--------------------------------- pyaerocom/colocation_setup.py | 68 +--- tests/test_colocation_auto.py | 1 - 3 files changed, 15 insertions(+), 624 deletions(-) diff --git a/pyaerocom/colocation_auto.py b/pyaerocom/colocation_auto.py index f6bd6d7f2..790a7c640 100644 --- a/pyaerocom/colocation_auto.py +++ b/pyaerocom/colocation_auto.py @@ -7,15 +7,13 @@ import os import traceback from datetime import datetime -from pathlib import Path -from typing import Any, Callable, Optional +from typing 
import Any, Callable import pandas as pd from cf_units import Unit -from pydantic import ConfigDict from pyaerocom import const -from pyaerocom._lowlevel_helpers import BrowseDict, ListOfStrings, StrWithDefault, chk_make_subdir +from pyaerocom._lowlevel_helpers import chk_make_subdir from pyaerocom.colocateddata import ColocatedData from pyaerocom.colocation import ( colocate_gridded_gridded, @@ -23,8 +21,7 @@ correct_model_stp_coldata, ) from pyaerocom.colocation_3d import ColocatedDataLists, colocate_vertical_profile_gridded -from pyaerocom.colocation_setup import ColocationSetup # New -from pyaerocom.config import ALL_REGION_NAME +from pyaerocom.colocation_setup import ColocationSetup from pyaerocom.exceptions import ColocationError, ColocationSetupError, DataCoverageError from pyaerocom.helpers import ( get_lowest_resolution, @@ -33,535 +30,20 @@ to_pandas_timestamp, ) from pyaerocom.io import ReadCAMS2_83, ReadGridded, ReadUngridded -from pyaerocom.io.cams2_83.models import ModelName from pyaerocom.io.helpers import get_all_supported_ids_ungridded from pyaerocom.io.mscw_ctm.reader import ReadMscwCtm -from pyaerocom.io.pyaro.pyaro_config import PyaroConfig logger = logging.getLogger(__name__) -# class ColocationSetup(BrowseDict): -# """ -# Setup class for high-level model / obs co-location. - -# An instance of this setup class can be used to run a colocation analysis -# between a model and an observation network and will create a number of -# :class:`pya.ColocatedData` instances, which can be saved automatically -# as NetCDF files. - -# Apart from co-location, this class also handles reading of the input data -# for co-location. Supported co-location options are: - -# 1. gridded vs. ungridded data -# For instance 3D model data (instance of :class:`GriddedData`) with lat, -# lon and time dimension that is co-located with station based observations -# which are represented in pyaerocom through :class:`UngriddedData` objects. 
-# The co-location function used is -# :func:`pyaerocom.colocation.colocated_gridded_ungridded`. For this type of -# co-location, the output co-located data object will be 3-dimensional, -# with dimensions `data_source` (index 0: obs, index 1: model), `time` and -# `station_name`. - -# 2. gridded vs. gridded data -# For instance 3D model data that is co-located with 3D satellite data -# (both instances of :class:`GriddedData`), both objects with lat, -# lon and time dimensions. The co-location function used -# is :func:`pyaerocom.colocation.colocated_gridded_gridded`. -# For this type of co-location, the output co-located data object will be -# 4-dimensional, with dimensions `data_source` (index 0: obs, index 1: -# model), `time` and `latitude` and `longitude`. - - -# Attributes -# ---------- -# model_id : str -# ID of model to be used. - -# obs_config: PyaroConfig -# In the case Pyaro is used, a config must be provided. In that case obs_id(see below) -# is ignored and only the config is used. -# obs_id : str -# ID of observation network to be used. -# obs_vars : list -# Variables to be analysed (need to be available in input obs dataset). -# Variables that are not available in the model data output will be -# skipped. Alternatively, model variables to be used for a given obs -# variable can also be specified via attributes :attr:`model_use_vars` -# and :attr:`model_add_vars`. -# ts_type : str -# String specifying colocation output frequency. -# start -# Start time of colocation. Input can be integer denoting the year or -# anything that can be converted into :class:`pandas.Timestamp` using -# :func:`pyaerocom.helpers.to_pandas_timestamp`. If None, than the first -# available date in the model data is used. -# stop -# stop time of colocation. int or anything that can be converted into -# :class:`pandas.Timestamp` using -# :func:`pyaerocom.helpers.to_pandas_timestamp` or None. If None and if -# ``start`` is on resolution of year (e.g. 
``start=2010``) then ``stop`` -# will be automatically set to the end of that year. Else, it will be -# set to the last available timestamp in the model data. -# filter_name : str -# name of filter to be applied. If None, no filter is used -# (to be precise, if None, then -# :attr:`pyaerocom.const.DEFAULT_REG_FILTER` is used which should -# default to `ALL-wMOUNTAINS`, that is, no filtering). -# basedir_coldata : str -# Base directory for storing of colocated data files. -# save_coldata : bool -# if True, colocated data objects are saved as NetCDF file. -# obs_name : str, optional -# if provided, this string will be used in colocated data filename to -# specify obsnetwork, else obs_id will be used. -# obs_data_dir : str, optional -# location of obs data. If None, attempt to infer obs location based on -# obs ID. -# obs_use_climatology : bool -# BETA if True, pyaerocom default climatology is computed from observation -# stations (so far only possible for unrgidded / gridded colocation). -# obs_vert_type : str -# AeroCom vertical code encoded in the model filenames (only AeroCom 3 -# and later). Specifies which model file should be read in case there are -# multiple options (e.g. surface level data can be read from a -# *Surface*.nc file as well as from a *ModelLevel*.nc file). If input is -# string (e.g. 'Surface'), then the corresponding vertical type code is -# used for reading of all variables that are colocated (i.e. that are -# specified in :attr:`obs_vars`). -# obs_ts_type_read : str or dict, optional -# may be specified to explicitly define the reading frequency of the -# observation data (so far, this does only apply to gridded obsdata such -# as satellites), either as str (same for all obs variables) or variable -# specific as dict. For ungridded reading, the frequency may be specified -# via :attr:`obs_id`, where applicable (e.g. AeronetSunV3Lev2.daily). -# Not to be confused with :attr:`ts_type`, which specifies the -# frequency used for colocation. 
Can be specified variable specific in -# form of dictionary. -# obs_filters : dict -# filters applied to the observational dataset before co-location. -# In case of gridded / gridded, these are filters that can be passed to -# :func:`pyaerocom.io.ReadGridded.read_var`, for instance, `flex_ts_type`, -# or `constraints`. In case the obsdata is ungridded (gridded / ungridded -# co-locations) these are filters that are handled through keyword -# `filter_post` in :func:`pyaerocom.io.ReadUngridded.read`. These filters -# are applied to the :class:`UngriddedData` objects after reading and -# caching the data, so changing them, will not invalidate the latest -# cache of the :class:`UngriddedData`. -# read_opts_ungridded : dict, optional -# dictionary that specifies reading constraints for ungridded reading, -# and are passed as `**kwargs` to :func:`pyaerocom.io.ReadUngridded.read`. -# Note that - other than for `obs_filters` these filters are applied -# during the reading of the :class:`UngriddedData` objects and specifying -# them will deactivate caching. -# model_name : str, optional -# if provided, this string will be used in colocated data filename to -# specify model, else obs_id will be used. -# model_data_dir : str, optional -# Location of model data. If None, attempt to infer model location based -# on model ID. -# model_read_opts : dict, optional -# options for model reading (passed as keyword args to -# :func:`pyaerocom.io.ReadUngridded.read`). -# model_use_vars : dict, optional -# dictionary that specifies mapping of model variables. Keys are -# observation variables, values are the corresponding model variables -# (e.g. model_use_vars=dict(od550aer='od550csaer')). Example: your -# observation has var *od550aer* but your model model uses a different -# variable name for that variable, say *od550*. Then, you can specify -# this via `model_use_vars = {'od550aer' : 'od550'}`. 
NOTE: in this case, -# a model variable *od550aer* will be ignored, even if it exists -# (cf :attr:`model_add_vars`). -# model_rename_vars : dict, optional -# rename certain model variables **after** co-location, before storing -# the associated :class:`ColocatedData` object on disk. Keys are model -# variables, values are new names -# (e.g. `model_rename_vars={'od550aer':'MyAOD'}`). -# Note: this does not impact which variables are read from the model. -# model_add_vars : dict, optional -# additional model variables to be processed for one obs variable. E.g. -# `model_add_vars={'od550aer': ['od550so4', 'od550gt1aer']}` would -# co-locate both model SO4 AOD (od550so4) and model coarse mode AOD -# (od550gt1aer) with total AOD (od550aer) from obs (in addition to -# od550aer vs od550aer if applicable). -# model_to_stp : bool -# ALPHA (please do not use): convert model data values to STP conditions -# after co-location. Note: this only works for very particular settings -# at the moment and needs revision, as it relies on access to -# meteorological data. -# model_ts_type_read : str or dict, optional -# may be specified to explicitly define the reading frequency of the -# model data, either as str (same for all obs variables) or variable -# specific as dict. Not to be confused with :attr:`ts_type`, which -# specifies the output frequency of the co-located data. -# model_read_aux : dict, optional -# may be used to specify additional computation methods of variables from -# models. Keys are variables to be computed, values are dictionaries with -# keys `vars_required` (list of required variables for computation of var -# and `fun` (method that takes list of read data objects and computes -# and returns var). -# model_use_climatology : bool -# if True, attempt to use climatological model data field. 
Note: this -# only works if model data is in AeroCom conventions (climatological -# fields are indicated with 9999 as year in the filename) and if this is -# active, only single year analysis are supported (i.e. provide int to -# :attr:`start` to specify the year and leave :attr:`stop` empty). -# gridded_reader_id : dict -# BETA: dictionary specifying which gridded reader is supposed to be used -# for model (and gridded obs) reading. Note: this is a workaround -# solution and will likely be removed in the future when the gridded -# reading API is more harmonised -# (see https://github.com/metno/pyaerocom/issues/174). -# flex_ts_type : bool -# Bboolean specifying whether reading frequency of gridded data is -# allowed to be flexible. This includes all gridded data, whether it is -# model or gridded observation (e.g. satellites). Defaults to True. -# min_num_obs : dict or int, optional -# time resampling constraints applied, defaults to None, in which case -# no constraints are applied. For instance, say your input is in daily -# resolution and you want output in monthly and you want to make sure to -# have roughly 50% daily coverage for the monthly averages. Then you may -# specify `min_num_obs=15` which will ensure that at least 15 daily -# averages are available to compute a monthly average. However, you may -# also define a hierarchical scheme that first goes from daily to -# weekly and then from weekly to monthly, via a dict. E.g. -# `min_num_obs=dict(monthly=dict(weekly=4), weekly=dict(daily=3))` would -# ensure that each week has at least 3 daily values, as well as that each -# month has at least 4 weekly values. -# resample_how : str or dict, optional -# string specifying how data should be aggregated when resampling in time. -# Default is "mean". Can also be a nested dictionary, e.g. -# `resample_how={'conco3': 'daily': {'hourly' : 'max'}}` would use the -# maximum value to aggregate from hourly to daily for variable conco3, -# rather than the mean. 
-# obs_remove_outliers : bool -# if True, outliers are removed from obs data before colocation, -# else not. Default is False. -# Custom outlier ranges for each variable can be specified via -# :attr:`obs_outlier_ranges`, and for all other variables, the pyaerocom -# default outlier ranges are used. The latter are specified in -# `variables.ini` file via `minimum` and `maximum` attributes and can -# also be accessed through :attr:`pyaerocom.variable.Variable.minimum` -# and :attr:`pyaerocom.variable.Variable.maximum`, respectively. -# model_remove_outliers : bool -# if True, outliers are removed from model data (normally this should be -# set to False, as the models are supposed to be assessed, including -# outlier cases). Default is False. -# Custom outlier ranges for each variable can be specified via -# :attr:`model_outlier_ranges`, and for all other variables, the pyaerocom -# default outlier ranges are used. The latter are specified in -# `variables.ini` file via `minimum` and `maximum` attributes and can -# also be accessed through :attr:`pyaerocom.variable.Variable.minimum` -# and :attr:`pyaerocom.variable.Variable.maximum`, respectively. -# obs_outlier_ranges : dict, optional -# dictionary specifying outlier ranges for individual obs variables. -# (e.g. dict(od550aer = [-0.05, 10], ang4487aer=[0,4])). Only relevant -# if :attr:`obs_remove_outliers` is True. -# model_outlier_ranges : dict, optional -# like :attr:`obs_outlier_ranges` but for model variables. Only relevant -# if :attr:`model_remove_outliers` is True. -# zeros_to_nan : bool -# If True, zero's in output co-located data object will be converted to -# NaN. Default is False. -# harmonise_units : bool -# if True, units are attempted to be harmonised during co-location -# (note: raises Exception if True and in case units cannot be harmonised). -# regrid_res_deg : int, optional -# resolution in degrees for regridding of model grid (done before -# co-location). Default is None. 
-# colocate_time : bool -# if True and if obs and model sampling frequency (e.g. daily) are higher -# than output colocation frequency (e.g. monthly), then the datasets are -# first colocated in time (e.g. on a daily basis), before the monthly -# averages are calculated. Default is False. -# reanalyse_existing : bool -# if True, always redo co-location, even if there is already an existing -# co-located NetCDF file (under the output location specified by -# :attr:`basedir_coldata` ) for the given variable combination to be -# co-located. If False and output already exists, then co-location is -# skipped for the associated variable. Default is True. -# raise_exceptions : bool -# if True, Exceptions that may occur for individual variables to be -# processed, are raised, else the analysis is skipped for such cases. -# keep_data : bool -# if True, then all colocated data objects computed when running -# :func:`run` will be stored in :attr:`data`. Defaults to True. -# add_meta : dict -# additional metadata that is supposed to be added to each output -# :class:`ColocatedData` object. -# """ - -# #: Dictionary specifying alternative vertical types that may be used to -# #: read model data. E.g. consider the variable is ec550aer, -# #: obs_vert_type='Surface' and obs_vert_type_alt=dict(Surface='ModelLevel'). -# #: Now, if a model that is used for the analysis does not contain a data -# #: file for ec550aer at the surface ('*ec550aer*Surface*.nc'), then, the -# #: colocation routine will look for '*ec550aer*ModelLevel*.nc' and if this -# #: exists, it will load it and extract the surface level. 
-# OBS_VERT_TYPES_ALT = {"Surface": "ModelLevel", "2D": "2D"} - -# #: do not raise Exception if invalid item is attempted to be assigned -# #: (Overwritten from base class) -# CRASH_ON_INVALID = False - -# FORBIDDEN_KEYS = [ -# "var_outlier_ranges", # deprecated since v0.12.0 -# "var_ref_outlier_ranges", # deprecated since v0.12.0 -# "remove_outliers", # deprecated since v0.12.0 -# ] - -# ts_type = StrWithDefault("monthly") -# obs_vars = ListOfStrings() - -# def __init__( -# self, -# model_id=None, -# obs_config: Optional[PyaroConfig] = None, -# obs_id=None, -# obs_vars=None, -# ts_type=None, -# start=None, -# stop=None, -# basedir_coldata=None, -# save_coldata=False, -# **kwargs, -# ): -# self.model_id = model_id -# self._obs_id = None -# self._obs_config = None - -# self.obs_id = obs_id -# self.obs_config = obs_config - -# self.obs_vars = obs_vars - -# self.ts_type = ts_type -# self.start = start -# self.stop = stop - -# # crashes if input filter name is invalid -# self.filter_name = f"{ALL_REGION_NAME}-wMOUNTAINS" - -# if basedir_coldata is not None: -# basedir_coldata = self._check_input_basedir_coldata(basedir_coldata) -# else: -# basedir_coldata = const.COLOCATEDDATADIR -# self.basedir_coldata = basedir_coldata -# self.save_coldata = save_coldata - -# # END OF ASSIGNMENT OF MOST COMMON PARAMETERS - BELOW ARE FURTHER -# # CONFIG ATTRIBUTES, THAT ARE OPTIONAL AND LESS FREQUENTLY USED - -# # Options related to obs reading and processing -# self.obs_name = None -# self.obs_data_dir = None - -# self.obs_use_climatology = False - -# self._obs_cache_only = False # only relevant if obs is ungridded -# self.obs_vert_type = None -# self.obs_ts_type_read = None -# self.obs_filters = {} -# self._obs_is_vertical_profile = False -# self.colocation_layer_limits = None -# self.profile_layer_limits = None - -# self.read_opts_ungridded = {} - -# # Attributes related to model data -# self.model_name = None -# self.model_data_dir = None - -# self.model_read_opts = {} - -# 
self.model_use_vars = {} -# self.model_rename_vars = {} -# self.model_add_vars = {} -# self.model_to_stp = False - -# self.model_ts_type_read = None -# self.model_read_aux = {} -# self.model_use_climatology = False - -# self.model_kwargs = {} - -# self.gridded_reader_id = {"model": "ReadGridded", "obs": "ReadGridded"} - -# self.flex_ts_type = True - -# # Options related to time resampling -# self.min_num_obs = None -# self.resample_how = "mean" - -# # Options related to outlier removal -# self.obs_remove_outliers = False -# self.model_remove_outliers = False - -# # Custom outlier ranges for model and obs -# self.obs_outlier_ranges = {} -# self.model_outlier_ranges = {} - -# self.zeros_to_nan = False -# self.harmonise_units = False -# self.regrid_res_deg = None -# self.colocate_time = False - -# self.reanalyse_existing = True -# self.raise_exceptions = False -# self.keep_data = True - -# self.add_meta = {} -# self.update(**kwargs) - -# def _check_input_basedir_coldata(self, basedir_coldata): -# """ -# Make sure input basedir_coldata is str and exists - -# Parameters -# ---------- -# basedir_coldata : str or Path -# basic output directory for colocated data - -# Raises -# ------ -# ValueError -# If input is invalid. - -# Returns -# ------- -# str -# valid output directory - -# """ -# if isinstance(basedir_coldata, Path): -# basedir_coldata = str(basedir_coldata) -# if isinstance(basedir_coldata, str): -# if not os.path.exists(basedir_coldata): -# os.mkdir(basedir_coldata) -# return basedir_coldata -# raise ValueError(f"Invalid input for basedir_coldata: {basedir_coldata}") - -# def _check_basedir_coldata(self): -# """ -# Make sure output directory for colocated data files exists - -# Raises -# ------ -# FileNotFoundError -# If :attr:`basedir_coldata` does not exist and cannot be created. 
- -# Returns -# ------- -# str -# current value of :attr:`basedir_coldata` - -# """ -# basedir_coldata = self.basedir_coldata -# if basedir_coldata is None: -# basedir_coldata = const.COLOCATEDDATADIR -# if not os.path.exists(basedir_coldata): -# logger.info(f"Creating directory: {basedir_coldata}") -# os.mkdir(basedir_coldata) -# elif isinstance(basedir_coldata, Path): -# basedir_coldata = str(basedir_coldata) -# if isinstance(basedir_coldata, str) and not os.path.exists(basedir_coldata): -# os.mkdir(basedir_coldata) -# if not os.path.exists(basedir_coldata): -# raise FileNotFoundError( -# f"Output directory for colocated data files {basedir_coldata} does not exist" -# ) -# self.basedir_coldata = basedir_coldata -# return basedir_coldata - -# @property -# def basedir_logfiles(self): -# """Base directory for storing logfiles""" -# p = chk_make_subdir(self.basedir_coldata, "logfiles") -# return p - -# @property -# def obs_id(self) -> str: -# return self._obs_id - -# @obs_id.setter -# def obs_id(self, val: Optional[str]) -> None: -# if self.obs_config is not None and val != self.obs_config.name: -# logger.info( -# f"Data ID in Pyaro config {self.obs_config.name} does not match obs_id {val}. Setting Pyaro config to None!" -# ) -# self.obs_config = None - -# self._obs_id = val - -# @property -# def obs_config(self) -> PyaroConfig: -# return self._obs_config - -# @obs_config.setter -# def obs_config(self, val: Optional[PyaroConfig]) -> None: -# if val is not None: -# if isinstance(val, dict): -# logger.info(f"Obs config was given as dict. Will try to convert to PyaroConfig") -# val = PyaroConfig(**val) -# if self.obs_id is not None and val.name != self.obs_id: -# logger.info( -# f"Data ID in Pyaro config {val.name} does not match obs_id {self.obs_id}. Setting Obs ID to match Pyaro Config!" 
-# ) -# self.obs_id = val.name -# if self.obs_id is None: -# self.obs_id = val.name -# self._obs_config = val - -# def add_glob_meta(self, **kwargs): -# """ -# Add global metadata to :attr:`add_meta` - -# Parameters -# ---------- -# kwargs -# metadata to be added - -# Returns -# ------- -# None - -# """ -# self.add_meta.update(**kwargs) - -# def __setitem__(self, key, val): -# if key == "basedir_coldata": -# val = self._check_input_basedir_coldata(val) -# super().__setitem__(key, val) - -# def _period_from_start_stop(self) -> str: -# start, stop = start_stop(self.start, self.stop, stop_sub_sec=False) -# y0, y1 = start.year, stop.year -# assert y0 <= y1 -# if y0 == y1: -# return str(y0) -# else: -# return f"{y0}-{y1}" - - class Colocator: """High level class for running co-location Note ---- - This object inherits from :class:`ColocationSetup` and is also instantiated - as such. For setup attributes, please see base class. + This object requires and instance from :class:`ColocationSetup`. 
""" - # ########################## - # # Pydantic ConfigDict - # ########################## - # model_config = ConfigDict( - # arbitrary_types_allowed=True, - # allow="extra", - # protected_namespaces=(), - # validate_assignment=True, - # ) - SUPPORTED_GRIDDED_READERS: dict = { "ReadGridded": ReadGridded, "ReadMscwCtm": ReadMscwCtm, @@ -593,40 +75,6 @@ def __init__(self, colocation_setup: ColocationSetup, **kwargs): self._obs_reader: Any | None = None self.obs_filters: dict = colocation_setup.obs_filters.copy() - # # Logging in this class needs serious work - # _log: Callable | None = None - # logging: bool = True - # _loaded_model_data: dict | None = {} - # data: dict = {} # think about this typing - # _processing_status: list[str] = [] - # files_written: list[str] = [] - # _model_reader: ReadGridded | ReadMscwCtm | ReadCAMS2_83 | None = None - # _obs_reader: Any | None = None # LB: Should be improved - - # def __init__( - # self, - # _log: Callable | None = None, - # logging: bool = True, - # _loaded_model_data: dict | None = {}, - # data: dict = {}, - # _processing_status: list[str] = [], - # files_written: list[str] = [], - # _model_reader: ReadGridded | ReadMscwCtm | ReadCAMS2_83 | None = None, - # _obs_reader: Any | None = None, - # **kwargs, - # ) -> None: - # super(Colocator, self).__init__( - # _log=_log, - # logging=logging, - # _loaded_model_data=_loaded_model_data, - # data=data, - # _processing_status=_processing_status, - # files_written=files_written, - # _model_reader=_model_reader, - # _obs_reader=_obs_reader, - # **kwargs, - # ) - @property def model_vars(self): """ @@ -738,7 +186,6 @@ def output_dir(self): """ str: Output directory for colocated data NetCDF files """ - # self._check_basedir_coldata() # LB: don't need as force construction in ColocationSetup loc = os.path.join(self.colocation_setup.basedir_coldata, self.get_model_name()) if not os.path.exists(loc): logger.info(f"Creating dir {loc}") @@ -863,13 +310,9 @@ def prepare_run(self, 
var_list: list = None) -> dict: logger.warning("Deactivating logging in Colocator") self.logging = False - # LB: SHould be covered by ColocationSetup validator if isinstance(self.colocation_setup.obs_vars, str): self.colocation_setup.obs_vars = (self.colocation_setup.obs_vars,) - # # LB: obs_vars should be defined by here - # if not isinstance(self.colocation_setup.obs_vars, tuple): - # raise AttributeError("obs_vars not defined or invalid, need tuple with strings...") self._check_obs_vars_available() self._check_obs_filters() self._check_model_add_vars() @@ -906,8 +349,6 @@ def run(self, var_list: list = None): dictionaries comprising key / value pairs of obs variables and associated instances of :class:`ColocatedData`. """ - # LB: Do not allow changing the ColocationSetup after declaration. - # self.update(opts) data_out = {} # ToDo: see if the following could be solved via custom context manager try: @@ -972,8 +413,6 @@ def check_meta_match(meta, **kwargs): mname = self.get_model_name() oname = self.get_obs_name() model_vars = self.model_vars - # LB: Not entirely sure. There may be a deeper problem here. 
- # obs_vars = self.obs_vars obs_vars = self.colocation_setup.obs_vars start, stop = self.get_start_str(), self.get_stop_str() valid = [] @@ -1177,7 +616,6 @@ def _check_add_model_read_aux(self, model_var): return True def _check_obs_vars_available(self): - # LB: This is what I would like but not sure if it will work with current setup if not len(self.colocation_setup.obs_vars) > 0: raise ColocationSetupError("no observation variables specified...") oreader = self.obs_reader diff --git a/pyaerocom/colocation_setup.py b/pyaerocom/colocation_setup.py index d78f3ee6d..3e26e72fe 100644 --- a/pyaerocom/colocation_setup.py +++ b/pyaerocom/colocation_setup.py @@ -3,7 +3,7 @@ import sys from functools import cached_property from pathlib import Path -from typing import Callable, Iterable, Literal +from typing import Callable, Literal import pandas as pd from pydantic import ( @@ -292,23 +292,13 @@ class ColocationSetup(BaseModel): # property_set_methods = {"obs_config": "set_obs_config"} ) - # @model_validator('*', mode="before") - # def convert_to_none(cls, v): - # if isinstance(v, str) and v.strip() == "": - # return None - # if isinstance(v, Iterable) and len(v) == 0: - # return None - # else: - # return v - ######################### # Init Input ######################### - # LB: remains to be seen if this can actually be required without chaning the code elsewhere - model_id: str | None # = None - obs_id: str | None # = None - obs_vars: tuple[str, ...] | str # = None + model_id: str | None + obs_id: str | None + obs_vars: tuple[str, ...] 
| str @field_validator("obs_vars") @classmethod @@ -317,9 +307,9 @@ def validate_obs_vars(cls, v): return [v] return v - ts_type: str # = None - start: pd.Timestamp | int | str | None # = None - stop: pd.Timestamp | int | str | None # = None + ts_type: str + start: pd.Timestamp | int | str | None + stop: pd.Timestamp | int | str | None @field_validator("start", "stop") @classmethod @@ -331,16 +321,6 @@ def validate_basedirs(cls, v): obs_config: PyaroConfig | None = None - # def set_obs_config(self, obs_config): - # self.obs_config = obs_config - - # def __setattr__(self, key, val): - # method = self.__config__.property_set_methods.get(key) - # if method is None: - # super().__setattr__(key, val) - # else: - # getattr(self, method)(val) - ############################### # Attributes with defaults ############################### @@ -379,9 +359,6 @@ def validate_basedirs(cls, v): save_coldata: bool = False - # # END OF ASSIGNMENT OF MOST COMMON PARAMETERS - BELOW ARE FURTHER - # # CONFIG ATTRIBUTES, THAT ARE OPTIONAL AND LESS FREQUENTLY USED - # Options related to obs reading and processing obs_name: str | None = None obs_data_dir: Path | str | None = None @@ -405,11 +382,10 @@ def validate_basedirs(cls, v): model_use_vars: dict[str, str] | None = {} model_rename_vars: dict[str, str] | None = {} - model_add_vars: dict[str, tuple[str, ...]] | None = {} # LB: WIP / guess + model_add_vars: dict[str, tuple[str, ...]] | None = {} model_to_stp: bool = False model_ts_type_read: str | dict | None = None - # LB: need to check this declaration model_read_aux: dict[ str, dict[Literal["vars_required", "fun"], list[str] | Callable] ] | None = {} @@ -417,7 +393,6 @@ def validate_basedirs(cls, v): model_kwargs: dict = {} - # LB: check this as well gridded_reader_id: dict[str, str] = {"model": "ReadGridded", "obs": "ReadGridded"} flex_ts_type: bool = True @@ -442,9 +417,6 @@ def validate_basedirs(cls, v): keep_data: bool = True add_meta: dict | None = {} - # TODO: implelent field 
validators - # self.update(**kwargs) - # Override __init__ to allow for positional arguments def __init__( self, @@ -476,34 +448,16 @@ def __init__( @model_validator(mode="after") def validate_no_forbidden_keys(self): for key in self.FORBIDDEN_KEYS: - if key in self.model_fields: # LB: Check this is where they will be found + if key in self.model_fields: raise ValidationError - # TODO: validator for extra arguments. what are they? - @cached_property def basedir_logfiles(self): p = Path(self.basedir_coldata) / "logfiles" if not p.exists(): p.mkdir(parents=True, exist_ok=True) - return str(p) # LB: not sure why pyaerocom insists these be strings as this point - - # #@field_validator("obs_id") - # @model_validator(mode="after") - # @classmethod - # def validate_obs_id(cls, v: str): - # if cls.obs_config is not None and v != cls.obs.config.name: - # logger.info( - # f"Data ID in Pyaro config {cls.obs_config.name} does not match obs_id {v}. Setting Pyaro config to None!" - # ) - # cls.obs_config = None - - # cls.obs_id = v - - # LB: Think we need a validator on the PyaroConfig, not the obs_id. - # Combining the validation logic from those two things here. needs testing. - # LB: this needs serious work - # @field_validator("obs_config") + return str(p) + @model_validator(mode="after") @classmethod def validate_obs_config(cls, v: PyaroConfig): diff --git a/tests/test_colocation_auto.py b/tests/test_colocation_auto.py index ef335afc1..1d55c216c 100644 --- a/tests/test_colocation_auto.py +++ b/tests/test_colocation_auto.py @@ -349,7 +349,6 @@ def test_colocator__find_var_matches_model_add_vars(setup): assert var_matches == {"abs550aer": ovar, ovar: ovar} -# LB: This test breaks the way I want this class to work because it implies allowing adding of attributes. 
def test_colocator_instantiate_gridded_reader(setup, path_emep): model_id = "model" setup["gridded_reader_id"] = {"model": "ReadMscwCtm", "obs": "ReadGridded"} From 2773379435a26f5b3264915e33f3320346067ae4 Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Fri, 31 May 2024 13:12:01 +0200 Subject: [PATCH 39/44] clean up and docstrings --- pyaerocom/aeroval/_processing_base.py | 12 ++---------- pyaerocom/aeroval/setupclasses.py | 2 +- pyaerocom/colocation_setup.py | 4 ++-- 3 files changed, 5 insertions(+), 13 deletions(-) diff --git a/pyaerocom/aeroval/_processing_base.py b/pyaerocom/aeroval/_processing_base.py index a22570d54..45f1ec544 100644 --- a/pyaerocom/aeroval/_processing_base.py +++ b/pyaerocom/aeroval/_processing_base.py @@ -114,10 +114,7 @@ def get_colocator(self, model_name: str = None, obs_name: str = None) -> Colocat Colocator """ - # LB: In general I don't like what this function is doing. Ideally define the Colocator object once and just use that. - col_cfg = { - **self.cfg.colocation_opts.model_dump() - } # LB: obs_vars is a list, should be a tuple + col_cfg = {**self.cfg.colocation_opts.model_dump()} outdir = self.cfg.path_manager.get_coldata_dir() col_cfg["basedir_coldata"] = outdir @@ -127,7 +124,7 @@ def get_colocator(self, model_name: str = None, obs_name: str = None) -> Colocat if model_name: mod_cfg = self.cfg.get_model_entry(model_name) - col_cfg["model_cfg"] = mod_cfg # LB: this is untested and just a guess at this point + col_cfg["model_cfg"] = mod_cfg # LB: Hack and at what lowlevel_helpers's import_from was doing for key, val in mod_cfg.items(): @@ -143,11 +140,6 @@ def get_colocator(self, model_name: str = None, obs_name: str = None) -> Colocat if key in ColocationSetup.model_fields: col_cfg[key] = val - # col_stp = ColocationSetup(**col_cfg) - # col = Colocator(col_stp) - # col_cfg.add_meta - # col.import_from(obs_cfg) # LB: This is functionality might be needed. Want to get keys from the obs_cfg into ColocationSetup. 
- # col.colocation_setup.add_glob_meta(diurnal_only=self._get_diurnal_only(obs_name)) col_cfg["add_meta"].update(diurnal_only=self._get_diurnal_only(obs_name)) col_stp = ColocationSetup(**col_cfg) diff --git a/pyaerocom/aeroval/setupclasses.py b/pyaerocom/aeroval/setupclasses.py index a7f18959d..6652aae78 100644 --- a/pyaerocom/aeroval/setupclasses.py +++ b/pyaerocom/aeroval/setupclasses.py @@ -52,7 +52,7 @@ class OutputPaths(BaseModel): project ID exp_id : str experiment ID - json_basedir : str + json_basedir : str, Path """ diff --git a/pyaerocom/colocation_setup.py b/pyaerocom/colocation_setup.py index 3e26e72fe..ba010536c 100644 --- a/pyaerocom/colocation_setup.py +++ b/pyaerocom/colocation_setup.py @@ -72,7 +72,7 @@ class ColocationSetup(BaseModel): is ignored and only the config is used. obs_id : str ID of observation network to be used. - obs_vars : list + obs_vars : tuple[str, ...] Variables to be analysed (need to be available in input obs dataset). Variables that are not available in the model data output will be skipped. Alternatively, model variables to be used for a given obs @@ -97,7 +97,7 @@ class ColocationSetup(BaseModel): (to be precise, if None, then :attr:`pyaerocom.const.DEFAULT_REG_FILTER` is used which should default to `ALL-wMOUNTAINS`, that is, no filtering). - basedir_coldata : str + basedir_coldata : str | Path Base directory for storing of colocated data files. save_coldata : bool if True, colocated data objects are saved as NetCDF file. 
From 4242cfc17cae76150b7e989ab3b7000c4682808e Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Fri, 31 May 2024 14:05:06 +0200 Subject: [PATCH 40/44] don't break api --- pyaerocom/colocation_auto.py | 14 ++++++++++++-- pyaerocom/colocation_setup.py | 2 +- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/pyaerocom/colocation_auto.py b/pyaerocom/colocation_auto.py index 790a7c640..2a97c1f32 100644 --- a/pyaerocom/colocation_auto.py +++ b/pyaerocom/colocation_auto.py @@ -6,6 +6,7 @@ import logging import os import traceback +import warnings from datetime import datetime from typing import Any, Callable @@ -58,9 +59,18 @@ class Colocator: 5: "NOT OK: Colocation failed", } - def __init__(self, colocation_setup: ColocationSetup, **kwargs): + def __init__(self, colocation_setup: ColocationSetup | dict, **kwargs): if not colocation_setup: - raise ValueError("An instance ColocationSetup must be provided to Colocator.") + raise ValueError( + "An instance ColocationSetup or a dict must be provided to Colocator." + ) + if not isinstance(colocation_setup, ColocationSetup): + colocation_setup = ColocationSetup(**colocation_setup) + warnings.warn( + DeprecationWarning( + "Future versions of Pyaerocom will require Colocator to injest an instance of ColocationSetup." + ) + ) self.colocation_setup = colocation_setup self._log: Callable | None = None diff --git a/pyaerocom/colocation_setup.py b/pyaerocom/colocation_setup.py index ba010536c..1a705ad74 100644 --- a/pyaerocom/colocation_setup.py +++ b/pyaerocom/colocation_setup.py @@ -313,7 +313,7 @@ def validate_obs_vars(cls, v): @field_validator("start", "stop") @classmethod - def validate_basedirs(cls, v): + def validate_start_stop(cls, v): if isinstance(v, int): return v if isinstance(v, str): From 146b710d4000115eb5d6c3118507de93aa9f34da Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Fri, 31 May 2024 16:27:43 +0200 Subject: [PATCH 41/44] passes locally. 
thanks daniel --- pyaerocom/aeroval/experiment_processor.py | 3 +-- pyaerocom/colocation_auto.py | 7 +++---- pyaerocom/colocation_setup.py | 6 ++++++ 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/pyaerocom/aeroval/experiment_processor.py b/pyaerocom/aeroval/experiment_processor.py index 2601d1020..8168cd55d 100644 --- a/pyaerocom/aeroval/experiment_processor.py +++ b/pyaerocom/aeroval/experiment_processor.py @@ -70,8 +70,7 @@ def _run_single_entry(self, model_name, obs_name, var_list): if self.cfg.processing_opts.only_json: files_to_convert = col.get_available_coldata_files(var_list) else: - model_read_kwargs = self.cfg.model_cfg[model_name]["kwargs"] - col.run(var_list, model_read_kwargs=model_read_kwargs) + col.run(var_list) files_to_convert = col.files_written if self.cfg.processing_opts.only_colocation: diff --git a/pyaerocom/colocation_auto.py b/pyaerocom/colocation_auto.py index 8620ffc3f..8cc63bc92 100644 --- a/pyaerocom/colocation_auto.py +++ b/pyaerocom/colocation_auto.py @@ -350,9 +350,6 @@ def run(self, var_list: list = None): var_list : list, optional list of variables supposed to be analysed. The default is None, in which case all defined variables are attempted to be colocated. 
- **opts - keyword args that may be specified to change the current setup - before colocation Returns ------- @@ -595,7 +592,9 @@ def _instantiate_gridded_reader(self, what): data_dir = self.colocation_setup.obs_data_dir reader_class = self._get_gridded_reader_class(what=what) if what == "model" and reader_class in self.MODELS_WITH_KWARGS: - reader = reader_class(data_id=data_id, data_dir=data_dir, **self.model_read_kwargs) + reader = reader_class( + data_id=data_id, data_dir=data_dir, **self.colocation_setup.model_read_kwargs + ) else: reader = reader_class(data_id=data_id, data_dir=data_dir) return reader diff --git a/pyaerocom/colocation_setup.py b/pyaerocom/colocation_setup.py index 1a705ad74..c394f0765 100644 --- a/pyaerocom/colocation_setup.py +++ b/pyaerocom/colocation_setup.py @@ -195,6 +195,10 @@ class ColocationSetup(BaseModel): fields are indicated with 9999 as year in the filename) and if this is active, only single year analysis are supported (i.e. provide int to :attr:`start` to specify the year and leave :attr:`stop` empty). + model_kwargs: dict + Key word arguments to be given to the model reader class's read_var function + model_read_kwargs: dict + Key word arguments to be given to the model reader class's init function gridded_reader_id : dict BETA: dictionary specifying which gridded reader is supposed to be used for model (and gridded obs) reading. 
Note: this is a workaround @@ -392,6 +396,8 @@ def validate_basedirs(cls, v): model_use_climatology: bool = False model_kwargs: dict = {} + # model_read_kwargs are arguments that are sent to the model reader + model_read_kwargs: dict = {} gridded_reader_id: dict[str, str] = {"model": "ReadGridded", "obs": "ReadGridded"} From cb65477971512bc363d1a63eed67205bce841d79 Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Fri, 31 May 2024 16:58:52 +0200 Subject: [PATCH 42/44] clean up --- pyaerocom/aeroval/setupclasses.py | 27 --------------------------- pyaerocom/colocation_auto.py | 11 ----------- pyaerocom/colocation_setup.py | 5 ++--- 3 files changed, 2 insertions(+), 41 deletions(-) diff --git a/pyaerocom/aeroval/setupclasses.py b/pyaerocom/aeroval/setupclasses.py index 5c1a8da19..99d8e32d7 100644 --- a/pyaerocom/aeroval/setupclasses.py +++ b/pyaerocom/aeroval/setupclasses.py @@ -449,33 +449,6 @@ def colocation_opts(self) -> ColocationSetup: # These attributes require special attention b/c they're not based on Pydantic's BaseModel class. 
- # TODO: Use Pydantic for ColocationSetup - # @computed_field - # @cached_property - # def colocation_opts(self) -> ColocationSetup: - # if not hasattr(self, "model_extra") or self.model_extra is None: - # return ColocationSetup(save_coldata=True, keep_data=False, resample_how="mean") - - # model_args = { - # key: val - # for key, val in self.model_extra.items() - # if key in ColocationSetup().__dict__.keys() - # } - # # need to pass some default values to the ColocationSetup if not provided in config - # default_dict = {"save_coldata": True, "keep_data": False, "resample_how": "mean"} - # for key in default_dict: - # if key not in model_args: - # model_args[key] = default_dict[key] - - # return ColocationSetup(**model_args) - - # @field_serializer("colocation_opts") - # def serialize_colocation_opts(self, colocation_opts: ColocationSetup): - # return colocation_opts.json_repr() - - # ObsCollection and ModelCollection - # TODO Use Pydantic for ObsCollection and ModelCollection - obs_cfg: ObsCollection | dict = ObsCollection() @field_validator("obs_cfg") diff --git a/pyaerocom/colocation_auto.py b/pyaerocom/colocation_auto.py index 8cc63bc92..f4b41b17f 100644 --- a/pyaerocom/colocation_auto.py +++ b/pyaerocom/colocation_auto.py @@ -837,17 +837,6 @@ def _eval_obs_filters(self, var_name): f"Detected obs_filters attribute in Colocator class, " f"which is not a dictionary: {obs_filters}" ) - # remaining = {} - # for key, val in obs_filters.items(): - # # keep ts_type filter in remaining (added on 17.2.21, 0.10.0 -> 0.10.1) - # if key in self.colocation_setup.obs_filters and not key == "ts_type": # can be handled - # #if isinstance(self[key], dict) and isinstance(val, dict): - # #self[key].update(val) - # #else: - # #self[key] = val - # pass - # else: - # remaining[key] = val return obs_filters if len(obs_filters) > 0 else {} def _save_coldata(self, coldata): diff --git a/pyaerocom/colocation_setup.py b/pyaerocom/colocation_setup.py index c394f0765..6d8c379c4 100644 --- 
a/pyaerocom/colocation_setup.py +++ b/pyaerocom/colocation_setup.py @@ -291,9 +291,9 @@ class ColocationSetup(BaseModel): arbitrary_types_allowed=True, allow="extra", protected_namespaces=(), + # TODO # frozen=True, # make immutable # validate_assignment=True, - # property_set_methods = {"obs_config": "set_obs_config"} ) ######################### @@ -517,8 +517,7 @@ def update(self, data: dict) -> Self: update = self.model_dump() update.update(data) self.model_validate(update) - # for k, v in self.model_dump(exclude_defaults=False).items(): - # assign values from + # assign values from data for k, v in data.items(): logger.debug(f"updating value of '{k}' from '{getattr(self, k, None)}' to '{v}'") setattr(self, k, v) From fa1b79c4242f4bc4d101995776c8650afdd2a5a1 Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Fri, 31 May 2024 17:02:24 +0200 Subject: [PATCH 43/44] heiko comments / clean up --- pyaerocom/aeroval/setupclasses.py | 4 +--- tests/fixtures/mscw_ctm.py | 20 -------------------- 2 files changed, 1 insertion(+), 23 deletions(-) diff --git a/pyaerocom/aeroval/setupclasses.py b/pyaerocom/aeroval/setupclasses.py index 99d8e32d7..3e3ee805a 100644 --- a/pyaerocom/aeroval/setupclasses.py +++ b/pyaerocom/aeroval/setupclasses.py @@ -33,9 +33,7 @@ check_if_year, ) from pyaerocom.aeroval.json_utils import read_json, set_float_serialization_precision, write_json - -# from pyaerocom.colocation_auto import ColocationSetup # Old -from pyaerocom.colocation_setup import ColocationSetup # New +from pyaerocom.colocation_setup import ColocationSetup logger = logging.getLogger(__name__) diff --git a/tests/fixtures/mscw_ctm.py b/tests/fixtures/mscw_ctm.py index 3ea06796e..9a34067cb 100644 --- a/tests/fixtures/mscw_ctm.py +++ b/tests/fixtures/mscw_ctm.py @@ -56,26 +56,6 @@ def create_fake_MSCWCtm_data(year="2019", numval=1, tst=None): return arr -# @pytest.fixture -# def fake_so4_MSCWCtm_data_monthly_2015(tmp_path) -> str: -# path = tmp_path / "EMEP_fake" / "2015" - -# if not 
path.exists(): -# path.mkdir(parents=True) -# data = create_fake_MSCWCtm_data(year=2015, numval=1) - -# var_name = "SURF_ug_SO4" -# units = "ug m-3" -# ds = xr.Dataset() - -# ds[var_name] = data -# ds[var_name].attrs.update(units=units, var_name=var_name) - -# ds.to_netcdf(path=path / "Base_month.nc") - -# return str(path) - - @pytest.fixture def fake_aod_MSCWCtm_data_monthly_2010(tmp_path) -> str: path = tmp_path / "EMEP_fake" / "2010" From a88fe876553decc61abdc3f0f964b510d80bd857 Mon Sep 17 00:00:00 2001 From: Lewis Blake Date: Fri, 31 May 2024 17:24:36 +0200 Subject: [PATCH 44/44] more clean up --- pyaerocom/aeroval/helpers.py | 2 -- tests/test_colocation_auto.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/pyaerocom/aeroval/helpers.py b/pyaerocom/aeroval/helpers.py index 9fc6348f5..35a6640d1 100644 --- a/pyaerocom/aeroval/helpers.py +++ b/pyaerocom/aeroval/helpers.py @@ -13,8 +13,6 @@ start_stop, to_pandas_timestamp, ) -from pyaerocom.io import ReadGridded -from pyaerocom.tstype import TsType from pyaerocom.variable import Variable logger = logging.getLogger(__name__) diff --git a/tests/test_colocation_auto.py b/tests/test_colocation_auto.py index 1d55c216c..c04f0c3a5 100644 --- a/tests/test_colocation_auto.py +++ b/tests/test_colocation_auto.py @@ -171,7 +171,7 @@ def test_Colocator__infer_start_stop_yr_from_model_reader(tm5_aero_col_stp): col.model_id = "TM5-met2010_CTRL-TEST" col._infer_start_stop_yr_from_model_reader() assert col.start == 2010 - assert col.stop == None + assert col.stop is None def test_Colocator__coldata_savename(setup):