From 53c806c72f83fa9ce78aa8150091762db6a77230 Mon Sep 17 00:00:00 2001
From: Daniel Huppmann
Date: Mon, 28 Aug 2023 08:56:10 +0200
Subject: [PATCH] Harmonize logging of ixmp4 and pyam in Jupyter notebooks (#774)

---
 docs/api.rst       | 24 ++++++---------
 pyam/__init__.py   | 19 +++++-------
 pyam/core.py       | 75 ++++++++++++++++++++++------------------
 pyam/logging.py    | 66 +++++++---------------------------------
 pyam/utils.py      | 54 ++++++++++++---------------------
 tests/test_core.py |  4 +--
 tests/test_io.py   |  2 +-
 7 files changed, 85 insertions(+), 159 deletions(-)

diff --git a/docs/api.rst b/docs/api.rst
index 32697b833..da1b3c3f8 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -23,29 +23,23 @@ and methods.
    api/timeseries
    api/variables
 
-**Notebook logging behaviour**
+Logging behaviour in Jupyter notebooks
+--------------------------------------
 
-|pyam| wants to provide sensible defaults for users unfamiliar with
-`setting up python's logging library `_,
-and therefore will provide a basic configuration by invoking
+The |pyam| package wants to provide sensible defaults for users unfamiliar with setting
+up python's logging library (`read more`_), and therefore will add a streamhandler if
+(and only if) it determines to be running within a notebook.
 
-.. code-block:: python
-
-    import logging
-    logging.basicConfig(level="INFO", format="%(name)s - %(levelname)s: %(message)s")
-
-if (and only if):
-
-1. it determines to be running within a notebook, and
-2. logging is still *unconfigured by the time the first logging message by |pyam| is to be emitted*.
+.. _`read more` : https://realpython.com/python-logging/#basic-configurations
 
-**Intersphinx mapping**
+Intersphinx mapping
+-------------------
 
 To use sphinx.ext.intersphinx_ for generating automatic links from your project
 to the documentation of |pyam| classes and functions, please add the following
 to your project's :code:`conf.py`:
 
-.. _sphinx.ext.intersphinx: https://www.sphinx-doc.org/en/master/usage/extensions/intersphinx.html
+.. _sphinx.ext.intersphinx : https://www.sphinx-doc.org/en/master/usage/extensions/intersphinx.html
 
 .. code-block:: python
diff --git a/pyam/__init__.py b/pyam/__init__.py
index 6abb0787b..f0d12089f 100644
--- a/pyam/__init__.py
+++ b/pyam/__init__.py
@@ -15,7 +15,6 @@
     validate,
 )
 from pyam.statistics import Statistics
-from pyam.logging import *
 from pyam.iiasa import read_iiasa, lazy_read_iiasa  # noqa: F401
 from pyam.datareader import read_worldbank  # noqa: F401
 from pyam.unfccc import read_unfccc  # noqa: F401
@@ -23,8 +22,6 @@
 from pyam.run_control import run_control  # noqa: F401
 from pyam.utils import IAMC_IDX  # noqa: F401
 
-from pyam.logging import defer_logging_config
-
 logger = logging.getLogger(__name__)
 
 # get version number either from git (preferred) or metadata
@@ -44,15 +41,15 @@
     shell = get_ipython()
 
     if isinstance(shell, ZMQInteractiveShell):
-        # set up basic logging if running in a notebook
-        log_msg = "Running in a notebook, setting up a basic logging at level INFO"
+        # harmonize formatting of ixmp4 and pyam logging
+        ixmp4_logger = logging.getLogger("ixmp4")
+        ixmp4_logger.removeHandler(ixmp4_logger.handlers[0])
 
-        defer_logging_config(
-            logger,
-            log_msg,
-            level="INFO",
-            format="%(name)s - %(levelname)s: %(message)s",
-        )
+        handler = logging.StreamHandler()
+        handler.setFormatter(logging.Formatter("%(name)s - %(levelname)s: %(message)s"))
+
+        for _logger in [logger, ixmp4_logger]:
+            _logger.addHandler(handler)
 
         # deactivate in-cell scrolling in a Jupyter notebook
         shell.run_cell_magic(
diff --git a/pyam/core.py b/pyam/core.py
index 74df64ac3..3f38a365b 100755
--- a/pyam/core.py
+++ b/pyam/core.py
@@ -69,7 +69,7 @@
     append_index_col,
 )
 from pyam.time import swap_time_for_year, swap_year_for_time
-from pyam.logging import raise_data_error, deprecation_warning
+from pyam.logging import raise_data_error, deprecation_warning, format_log_message
 from pyam.validation import _exclude_on_fail
 
 logger = logging.getLogger(__name__)
@@ -160,7 +160,6 @@ def _init(self, data, meta=None, index=DEFAULT_META_INDEX, **kwargs):
 
         # read from file
         if isinstance(data, Path):
-            data = Path(data)  # casting str or LocalPath to Path
             if not data.is_file():
                 raise FileNotFoundError(f"No such file: '{data}'")
             logger.info(f"Reading file {data}")
@@ -2488,9 +2487,7 @@ def to_datapackage(self, path):
         # return the package (needs to reloaded because `tmp` was deleted)
         return Package(path)
 
-    def load_meta(
-        self, path, sheet_name="meta", ignore_conflict=False, *args, **kwargs
-    ):
+    def load_meta(self, path, sheet_name="meta", ignore_conflict=False, **kwargs):
         """Load 'meta' indicators from file
 
         Parameters
         ----------
@@ -2505,59 +2502,57 @@
         kwargs
             Passed to :func:`pandas.read_excel` or :func:`pandas.read_csv`
         """
+        # load from file
         path = path if isinstance(path, pd.ExcelFile) else Path(path)
-        df = read_pandas(path, sheet_name=sheet_name, **kwargs)
-
-        # cast model-scenario column headers to lower-case (if necessary)
-        df = df.rename(columns=dict([(i.capitalize(), i) for i in META_IDX]))
+        meta = read_pandas(path, sheet_name=sheet_name, **kwargs)
 
-        # check that required index columns exist
-        missing_cols = [c for c in self.index.names if c not in df.columns]
-        if missing_cols:
+        # cast index-column headers to lower-case, check that required index exists
+        meta = meta.rename(columns=dict([(i.capitalize(), i) for i in META_IDX]))
+        if missing_cols := [c for c in self.index.names if c not in meta.columns]:
             raise ValueError(
-                f"File {Path(path)} (sheet {sheet_name}) "
-                f"missing required index columns {missing_cols}!"
+ f"Missing index columns for meta indicators: {missing_cols}" ) - # set index, filter to relevant scenarios from imported file - n = len(df) - df.set_index(self.index.names, inplace=True) - df = df.loc[self.meta.index.intersection(df.index)] - - # skip import of meta indicators if np - if not n: - logger.info(f"No scenarios found in sheet {sheet_name}") + # skip import of meta indicators if no rows in meta + if not len(meta.index): + logger.warning(f"No scenarios found in sheet {sheet_name}") return - msg = "Reading meta indicators" - # indicate if not all scenarios are included in the meta file - if len(df) < len(self.meta): - i = len(self.meta) - msg += f" for {len(df)} out of {i} scenario{s(i)}" - - # indicate if more scenarios exist in meta file than in self - invalid = n - len(df) - if invalid: - msg += f", ignoring {invalid} scenario{s(invalid)} from file" - logger.warning(msg) - else: - logger.info(msg) + # set index, check consistency between existing index and meta + meta.set_index(self.index.names, inplace=True) + + missing = self.index.difference(meta.index) + invalid = meta.index.difference(self.index) + + if not missing.empty: + logger.warning( + format_log_message( + "No meta indicators for the following scenarios", missing + ) + ) + if not invalid.empty: + logger.warning( + format_log_message( + "Ignoring meta indicators for the following scenarios", invalid + ) + ) + meta = meta.loc[self.meta.index.intersection(meta.index)] # in pyam < 2.0, an "exclude" columns was part of the `meta` attribute # this section ensures compatibility with xlsx files created with pyam < 2.0 - if "exclude" in df.columns: + if "exclude" in meta.columns: logger.info( f"Found column 'exclude' in sheet '{sheet_name}', " "moved to attribute `IamDataFrame.exclude`." 
             )
             self._exclude = merge_exclude(
-                df.exclude, self.exclude, ignore_conflict=ignore_conflict
+                meta.exclude, self.exclude, ignore_conflict=ignore_conflict
             )
-            df.drop(columns="exclude", inplace=True)
+            meta.drop(columns="exclude", inplace=True)
 
         # merge imported meta indicators
-        self.meta = merge_meta(df, self.meta, ignore_conflict=ignore_conflict)
+        self.meta = merge_meta(meta, self.meta, ignore_conflict=ignore_conflict)
 
     def map_regions(
         self,
@@ -2849,7 +2844,7 @@ def filter_by_meta(data, df, join_meta=False, **kwargs):
         to nan if `(model, scenario)` not in `df.meta.index`)
     """
     if not set(META_IDX).issubset(data.index.names + list(data.columns)):
-        raise ValueError("Missing required index dimensions or columns!")
+        raise ValueError("Missing required index dimensions or data columns.")
 
     meta = pd.DataFrame(df.meta[list(set(kwargs) - set(META_IDX))].copy())
 
diff --git a/pyam/logging.py b/pyam/logging.py
index 6587eff8f..ee3f4be39 100644
--- a/pyam/logging.py
+++ b/pyam/logging.py
@@ -1,17 +1,18 @@
 from contextlib import contextmanager
-import logging
+from logging import *
+
 import pandas as pd
 import warnings
 
-logger = logging.getLogger(__name__)
+logger = getLogger(__name__)
 
 
 @contextmanager
 def adjust_log_level(logger="pyam", level="ERROR"):
     """Context manager to change log level"""
     if isinstance(logger, str):
-        logger = logging.getLogger(logger)
+        logger = getLogger(logger)
     old_level = logger.getEffectiveLevel()
     logger.setLevel(level)
     yield
@@ -26,58 +27,13 @@ def deprecation_warning(msg, item="This method", stacklevel=3):
 
 
 def raise_data_error(msg, data):
-    """Utils function to format error message from data formatting"""
+    """Format error message with (head of) data table and raise"""
+    raise ValueError(format_log_message(msg, data))
+
+
+def format_log_message(msg, data):
+    """Utils function to format message with (head of) data table"""
     if isinstance(data, pd.MultiIndex):
         data = data.to_frame(index=False)
     data = data.drop_duplicates()
-    msg = f"{msg}:\n{data.head()}" + ("\n..." if len(data) > 5 else "")
-    raise ValueError(msg)
-
-
-class ConfigPseudoHandler(logging.Handler):
-    """Pseudo logging handler to defer configuring logging until the first message
-
-    Registers itself as a handler for the provided logger and temporarily
-    sets the logger as sensitive to INFO messages. Upon receival of the first
-    message (of at least INFO level), it configures logging with the provided
-    `config_kwargs` and prints `log_msg`
-
-    Parameters
-    ----------
-    logger : logging.Logger
-        Logger to listen for the first message
-    log_msg : str, optional
-        Message to print once logging is configured, by default None
-    **config_kwargs
-        Arguments to pass on to logging.basicConfig
-    """
-
-    def __init__(self, logger, log_msg=None, **config_kwargs):
-        super().__init__()
-
-        self.logger = logger
-        self.log_msg = log_msg
-        self.config_kwargs = config_kwargs
-
-        self.logger.addHandler(self)
-
-        # temporarily set the logging level to a non-standard value,
-        # slightly below logging.INFO == 20 and use that as a sentinel
-        # to switch back to logging.NOTSET later
-        self.logger.setLevel(19)
-
-    def emit(self, record):
-        self.logger.removeHandler(self)
-
-        if self.logger.level == 19:
-            self.logger.setLevel(logging.NOTSET)
-
-        if not self.logger.root.hasHandlers():
-            logging.basicConfig(**self.config_kwargs)
-
-        if self.log_msg is not None:
-            self.logger.info(self.log_msg)
-
-
-# Give the Handler a function like alias
-defer_logging_config = ConfigPseudoHandler
+    return f"{msg}:\n{data.head()}" + ("\n..." if len(data) > 5 else "")
diff --git a/pyam/utils.py b/pyam/utils.py
index 2e96ac540..d63b005c3 100644
--- a/pyam/utils.py
+++ b/pyam/utils.py
@@ -51,28 +51,6 @@
 }
 
 
-def requires_package(pkg, msg, error_type=ImportError):
-    """Decorator when a function requires an optional dependency
-
-    Parameters
-    ----------
-    pkg : imported package object
-    msg : string
-        Message to show to user with error_type
-    error_type : python error class
-    """
-
-    def _requires_package(func):
-        def wrapper(*args, **kwargs):
-            if pkg is None:
-                raise error_type(msg)
-            return func(*args, **kwargs)
-
-        return wrapper
-
-    return _requires_package
-
-
 def isstr(x):
     # TODO deprecated, remove for release >= 2.1
     deprecation_warning("Please use `pyam.str.is_str()`.", "The function `isstr()`")
@@ -152,7 +130,7 @@ def read_pandas(path, sheet_name=["data*", "Data*"], *args, **kwargs):
         # apply pattern-matching for sheet names (use * as wildcard)
         sheets = sheet_names[pattern_match(sheet_names, values=sheets)]
         if sheets.empty:
-            raise ValueError(f"No sheets {sheet_name} in file {path}!")
+            raise ValueError(f"Sheet(s) '{sheet_name}' not found in file '{path}'.")
 
         df = pd.concat([xl.parse(s, *args, **kwargs) for s in sheets])
 
@@ -217,7 +195,9 @@ def _knead_data(df, **kwargs):
         dfs = []
         for v in value:
             if v not in df.columns:
-                raise ValueError("column `{}` does not exist!".format(v))
+                raise ValueError(
+                    f"Column `{v}` not in timeseries data, found: {df.columns}"
+                )
             vdf = _df[v].to_frame().rename(columns={v: "value"})
             vdf["variable"] = v
             dfs.append(vdf.reset_index())
@@ -226,7 +206,9 @@ def _knead_data(df, **kwargs):
     # otherwise, rename columns or concat to IAMC-style or do a fill-by-value
     for col, value in kwargs.items():
         if col in df:
-            raise ValueError(f"Conflict of kwarg with column `{col}` in dataframe!")
+            raise ValueError(
+                f"Conflict of kwarg with column `{col}` in timeseries data."
+            )
 
         if is_str(value) and value in df:
             df.rename(columns={value: col}, inplace=True)
@@ -236,7 +218,7 @@ def _knead_data(df, **kwargs):
         elif is_str(value):
             df[col] = value
         else:
-            raise ValueError(f"Invalid argument for casting `{col}: {value}`")
+            raise ValueError(f"Invalid argument for casting data: `{col}: {value}`")
 
     return df
 
@@ -244,7 +226,7 @@ def _knead_data(df, **kwargs):
 
 def _format_from_legacy_database(df):
     """Process data from legacy databases (SSP and earlier)"""
-    logger.info("Ignoring notes column in `data`")
+    logger.info("Ignoring notes column in `data`.")
     df.drop(columns="notes", inplace=True)
     col = df.columns[0]  # first column has database copyright notice
     df = df[~df[col].str.contains("database", case=False)]
@@ -270,7 +252,7 @@ def _intuit_column_groups(df, index, include_index=False):
 
     # check that there is no column in the timeseries data with reserved names
     if None in existing_cols:
-        raise ValueError("Unnamed column in `data`: None")
+        raise ValueError("Unnamed column in timeseries data: None")
 
     # check that there is no column in the timeseries data with reserved/illegal names
     conflict_cols = [i for i in existing_cols if i in ILLEGAL_COLS]
@@ -286,11 +268,13 @@ def _intuit_column_groups(df, index, include_index=False):
     # check that index and required columns exist
     missing_index = [c for c in index if c not in existing_cols]
     if missing_index:
-        raise ValueError(f"Missing index columns: {missing_index}")
+        raise ValueError(f"Missing index columns in timeseries data: {missing_index}")
 
     missing_required_col = [c for c in REQUIRED_COLS if c not in existing_cols]
     if missing_required_col:
-        raise ValueError(f"Missing required columns: {missing_required_col}")
+        raise ValueError(
+            f"Missing required columns in timeseries data: {missing_required_col}"
+        )
 
     # check whether data in wide format (standard IAMC) or long format (`value` column)
     if "value" in existing_cols:
@@ -300,7 +284,7 @@ def _intuit_column_groups(df, index, include_index=False):
         elif "time" in existing_cols and "year" not in existing_cols:
             time_col = "time"
         else:
-            raise ValueError("Invalid time domain, must have either `year` or `time`!")
+            raise ValueError("Invalid time domain, must have either `year` or `time`.")
         extra_cols = [
             c
            for c in existing_cols
@@ -334,7 +318,7 @@ def _intuit_column_groups(df, index, include_index=False):
             time_col = "time"
         data_cols = sorted(year_cols) + sorted(time_cols)
         if not data_cols:
-            raise ValueError("Missing time domain")
+            raise ValueError("No time domain in the data.")
 
     return time_col, extra_cols, data_cols
 
@@ -438,7 +422,7 @@ def format_data(df, index, **kwargs):
         )
         del rows
     if df.empty:
-        logger.warning("Formatted data is empty!")
+        logger.warning("Formatted data is empty.")
 
     return df.sort_index(), index, time_col, extra_cols
 
@@ -469,7 +453,7 @@ def merge_meta(left, right, ignore_conflict=False):
     diff = right.index.difference(left.index)
     sect = right.index.intersection(left.index)
 
-    # merge `right` into `left` for overlapping scenarios ( `sect`)
+    # merge `right` into `left` for overlapping scenarios (`sect`)
     if not sect.empty:
         # if not ignored, check that overlapping `meta` columns are equal
         if not ignore_conflict:
@@ -480,7 +464,7 @@ def merge_meta(left, right, ignore_conflict=False):
                 .drop_duplicates()
                 .index.drop_duplicates()
             )
-            msg = "conflict in `meta` for scenarios {}".format(
+            msg = "Conflict in `meta` for scenarios {}".format(
                 [i for i in pd.DataFrame(index=conflict_idx).index]
            )
             raise ValueError(msg)
diff --git a/tests/test_core.py b/tests/test_core.py
index 7fafb992c..f10b2a08b 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -206,7 +206,7 @@ def test_init_df_with_custom_index(test_pd_df):
 
 def test_init_empty_message(caplog):
     IamDataFrame(data=df_empty)
-    drop_message = "Formatted data is empty!"
+    drop_message = "Formatted data is empty."
     message_idx = caplog.messages.index(drop_message)
     assert caplog.records[message_idx].levelno == logging.WARNING
 
@@ -216,7 +216,7 @@ def test_init_with_unnamed_column(test_pd_df):
     test_pd_df[None] = "foo"
 
     # check that initialising an instance with an unnamed column raises
-    with pytest.raises(ValueError, match="Unnamed column in `data`: None"):
+    with pytest.raises(ValueError, match="Unnamed column in timeseries data: None"):
        IamDataFrame(test_pd_df)
 
diff --git a/tests/test_io.py b/tests/test_io.py
index 330df8fe4..c358e9531 100644
--- a/tests/test_io.py
+++ b/tests/test_io.py
@@ -198,7 +198,7 @@ def test_load_meta_wrong_index(test_df_year, tmpdir):
     file = tmpdir / "testing_meta_empty.xlsx"
     pd.DataFrame(columns=["model", "foo"]).to_excel(file, index=False)
 
-    match = ".* \(sheet meta\) missing required index columns \['scenario'\]\!"
+    match = "Missing index columns for meta indicators: \['scenario'\]"
     with pytest.raises(ValueError, match=match):
         test_df_year.load_meta(file)
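
Note: the patch attaches a shared StreamHandler with the format "%(name)s - %(levelname)s: %(message)s" to both the pyam and ixmp4 loggers when running in a notebook. As a minimal sketch (not part of this patch) of how a user could reproduce or extend that behaviour in their own notebook, using only the standard library; the logger name "some_package" is only a placeholder:

    import logging

    # formatter mirroring the one pyam attaches in a notebook (see pyam/__init__.py above)
    formatter = logging.Formatter("%(name)s - %(levelname)s: %(message)s")

    handler = logging.StreamHandler()
    handler.setFormatter(formatter)

    # attach the same handler to another library's logger ("some_package" is hypothetical)
    logging.getLogger("some_package").addHandler(handler)

    # optionally adjust the verbosity of the pyam logger
    logging.getLogger("pyam").setLevel(logging.WARNING)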
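
The new format_log_message() helper introduced in pyam/logging.py is what load_meta() uses for its warnings. A small usage sketch, assuming a pyam version that includes this change; the model/scenario names are illustrative only:

    import pandas as pd
    from pyam.logging import format_log_message

    # a MultiIndex standing in for scenarios without meta indicators
    missing = pd.MultiIndex.from_tuples(
        [("model_a", "scen_a")], names=["model", "scenario"]
    )
    # prints the message followed by the head of the scenario table
    print(format_log_message("No meta indicators for the following scenarios", missing))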