From 53c806c72f83fa9ce78aa8150091762db6a77230 Mon Sep 17 00:00:00 2001
From: Daniel Huppmann
Date: Mon, 28 Aug 2023 08:56:10 +0200
Subject: [PATCH] Harmonize logging of ixmp4 and pyam in Jupyter notebooks (#774)

---
 docs/api.rst       | 24 ++++++---------
 pyam/__init__.py   | 19 +++++-------
 pyam/core.py       | 75 ++++++++++++++++++++++------------------
 pyam/logging.py    | 66 +++++++---------------------------------
 pyam/utils.py      | 54 ++++++++++++---------------------
 tests/test_core.py |  4 +--
 tests/test_io.py   |  2 +-
 7 files changed, 85 insertions(+), 159 deletions(-)

diff --git a/docs/api.rst b/docs/api.rst
index 32697b833..da1b3c3f8 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -23,29 +23,23 @@ and methods.
    api/timeseries
    api/variables
 
-**Notebook logging behaviour**
+Logging behaviour in Jupyter notebooks
+--------------------------------------
 
-|pyam| wants to provide sensible defaults for users unfamiliar with
-`setting up python's logging library `_,
-and therefore will provide a basic configuration by invoking
+The |pyam| package wants to provide sensible defaults for users unfamiliar with setting
+up python's logging library (`read more`_), and therefore will add a streamhandler if
+(and only if) it determines to be running within a notebook.
 
-.. code-block:: python
-
-    import logging
-    logging.basicConfig(level="INFO", format="%(name)s - %(levelname)s: %(message)s")
-
-if (and only if):
-
-1. it determines to be running within a notebook, and
-2. logging is still *unconfigured by the time the first logging message by |pyam| is to be emitted*.
+.. _`read more` : https://realpython.com/python-logging/#basic-configurations
 
-**Intersphinx mapping**
+Intersphinx mapping
+-------------------
 
 To use sphinx.ext.intersphinx_ for generating automatic links from your project
 to the documentation of |pyam| classes and functions, please add the following
 to your project's :code:`conf.py`:
 
-.. _sphinx.ext.intersphinx: https://www.sphinx-doc.org/en/master/usage/extensions/intersphinx.html
+.. _sphinx.ext.intersphinx : https://www.sphinx-doc.org/en/master/usage/extensions/intersphinx.html
 
 .. code-block:: python
diff --git a/pyam/__init__.py b/pyam/__init__.py
index 6abb0787b..f0d12089f 100644
--- a/pyam/__init__.py
+++ b/pyam/__init__.py
@@ -15,7 +15,6 @@
     validate,
 )
 from pyam.statistics import Statistics
-from pyam.logging import *
 from pyam.iiasa import read_iiasa, lazy_read_iiasa  # noqa: F401
 from pyam.datareader import read_worldbank  # noqa: F401
 from pyam.unfccc import read_unfccc  # noqa: F401
@@ -23,8 +22,6 @@
 from pyam.run_control import run_control  # noqa: F401
 from pyam.utils import IAMC_IDX  # noqa: F401
 
-from pyam.logging import defer_logging_config
-
 logger = logging.getLogger(__name__)
 
 # get version number either from git (preferred) or metadata
@@ -44,15 +41,15 @@
     shell = get_ipython()
 
     if isinstance(shell, ZMQInteractiveShell):
-        # set up basic logging if running in a notebook
-        log_msg = "Running in a notebook, setting up a basic logging at level INFO"
+        # harmonize formatting of ixmp4 and pyam logging
+        ixmp4_logger = logging.getLogger("ixmp4")
+        ixmp4_logger.removeHandler(ixmp4_logger.handlers[0])
 
-        defer_logging_config(
-            logger,
-            log_msg,
-            level="INFO",
-            format="%(name)s - %(levelname)s: %(message)s",
-        )
+        handler = logging.StreamHandler()
+        handler.setFormatter(logging.Formatter("%(name)s - %(levelname)s: %(message)s"))
+
+        for _logger in [logger, ixmp4_logger]:
+            _logger.addHandler(handler)
 
         # deactivate in-cell scrolling in a Jupyter notebook
         shell.run_cell_magic(
diff --git a/pyam/core.py b/pyam/core.py
index 74df64ac3..3f38a365b 100755
--- a/pyam/core.py
+++ b/pyam/core.py
@@ -69,7 +69,7 @@
     append_index_col,
 )
 from pyam.time import swap_time_for_year, swap_year_for_time
-from pyam.logging import raise_data_error, deprecation_warning
+from pyam.logging import raise_data_error, deprecation_warning, format_log_message
 from pyam.validation import _exclude_on_fail
 
 logger = logging.getLogger(__name__)
@@ -160,7 +160,6 @@ def _init(self, data, meta=None, index=DEFAULT_META_INDEX, **kwargs):
 
         # read from file
         if isinstance(data, Path):
-            data = Path(data)  # casting str or LocalPath to Path
             if not data.is_file():
                 raise FileNotFoundError(f"No such file: '{data}'")
             logger.info(f"Reading file {data}")
@@ -2488,9 +2487,7 @@ def to_datapackage(self, path):
         # return the package (needs to reloaded because `tmp` was deleted)
         return Package(path)
 
-    def load_meta(
-        self, path, sheet_name="meta", ignore_conflict=False, *args, **kwargs
-    ):
+    def load_meta(self, path, sheet_name="meta", ignore_conflict=False, **kwargs):
         """Load 'meta' indicators from file
 
         Parameters
         ----------
@@ -2505,59 +2502,57 @@
         kwargs
             Passed to :func:`pandas.read_excel` or :func:`pandas.read_csv`
         """
+        # load from file
         path = path if isinstance(path, pd.ExcelFile) else Path(path)
-        df = read_pandas(path, sheet_name=sheet_name, **kwargs)
-
-        # cast model-scenario column headers to lower-case (if necessary)
-        df = df.rename(columns=dict([(i.capitalize(), i) for i in META_IDX]))
+        meta = read_pandas(path, sheet_name=sheet_name, **kwargs)
 
-        # check that required index columns exist
-        missing_cols = [c for c in self.index.names if c not in df.columns]
-        if missing_cols:
+        # cast index-column headers to lower-case, check that required index exists
+        meta = meta.rename(columns=dict([(i.capitalize(), i) for i in META_IDX]))
+        if missing_cols := [c for c in self.index.names if c not in meta.columns]:
             raise ValueError(
-                f"File {Path(path)} (sheet {sheet_name}) "
-                f"missing required index columns {missing_cols}!"
+ f"Missing index columns for meta indicators: {missing_cols}" ) - # set index, filter to relevant scenarios from imported file - n = len(df) - df.set_index(self.index.names, inplace=True) - df = df.loc[self.meta.index.intersection(df.index)] - - # skip import of meta indicators if np - if not n: - logger.info(f"No scenarios found in sheet {sheet_name}") + # skip import of meta indicators if no rows in meta + if not len(meta.index): + logger.warning(f"No scenarios found in sheet {sheet_name}") return - msg = "Reading meta indicators" - # indicate if not all scenarios are included in the meta file - if len(df) < len(self.meta): - i = len(self.meta) - msg += f" for {len(df)} out of {i} scenario{s(i)}" - - # indicate if more scenarios exist in meta file than in self - invalid = n - len(df) - if invalid: - msg += f", ignoring {invalid} scenario{s(invalid)} from file" - logger.warning(msg) - else: - logger.info(msg) + # set index, check consistency between existing index and meta + meta.set_index(self.index.names, inplace=True) + + missing = self.index.difference(meta.index) + invalid = meta.index.difference(self.index) + + if not missing.empty: + logger.warning( + format_log_message( + "No meta indicators for the following scenarios", missing + ) + ) + if not invalid.empty: + logger.warning( + format_log_message( + "Ignoring meta indicators for the following scenarios", invalid + ) + ) + meta = meta.loc[self.meta.index.intersection(meta.index)] # in pyam < 2.0, an "exclude" columns was part of the `meta` attribute # this section ensures compatibility with xlsx files created with pyam < 2.0 - if "exclude" in df.columns: + if "exclude" in meta.columns: logger.info( f"Found column 'exclude' in sheet '{sheet_name}', " "moved to attribute `IamDataFrame.exclude`." 
             )
             self._exclude = merge_exclude(
-                df.exclude, self.exclude, ignore_conflict=ignore_conflict
+                meta.exclude, self.exclude, ignore_conflict=ignore_conflict
             )
-            df.drop(columns="exclude", inplace=True)
+            meta.drop(columns="exclude", inplace=True)
 
         # merge imported meta indicators
-        self.meta = merge_meta(df, self.meta, ignore_conflict=ignore_conflict)
+        self.meta = merge_meta(meta, self.meta, ignore_conflict=ignore_conflict)
 
     def map_regions(
         self,
@@ -2849,7 +2844,7 @@ def filter_by_meta(data, df, join_meta=False, **kwargs):
         to nan if `(model, scenario)` not in `df.meta.index`)
     """
     if not set(META_IDX).issubset(data.index.names + list(data.columns)):
-        raise ValueError("Missing required index dimensions or columns!")
+        raise ValueError("Missing required index dimensions or data columns.")
 
     meta = pd.DataFrame(df.meta[list(set(kwargs) - set(META_IDX))].copy())
 
diff --git a/pyam/logging.py b/pyam/logging.py
index 6587eff8f..ee3f4be39 100644
--- a/pyam/logging.py
+++ b/pyam/logging.py
@@ -1,17 +1,18 @@
 from contextlib import contextmanager
-import logging
+from logging import *
+
 import pandas as pd
 import warnings
 
-logger = logging.getLogger(__name__)
+logger = getLogger(__name__)
 
 
 @contextmanager
 def adjust_log_level(logger="pyam", level="ERROR"):
     """Context manager to change log level"""
     if isinstance(logger, str):
-        logger = logging.getLogger(logger)
+        logger = getLogger(logger)
     old_level = logger.getEffectiveLevel()
     logger.setLevel(level)
     yield
@@ -26,58 +27,13 @@ def deprecation_warning(msg, item="This method", stacklevel=3):
 
 
 def raise_data_error(msg, data):
-    """Utils function to format error message from data formatting"""
+    """Format error message with (head of) data table and raise"""
+    raise ValueError(format_log_message(msg, data))
+
+
+def format_log_message(msg, data):
+    """Utils function to format message with (head of) data table"""
     if isinstance(data, pd.MultiIndex):
         data = data.to_frame(index=False)
     data = data.drop_duplicates()
-    msg = f"{msg}:\n{data.head()}" + ("\n..." if len(data) > 5 else "")
-    raise ValueError(msg)
-
-
-class ConfigPseudoHandler(logging.Handler):
-    """Pseudo logging handler to defer configuring logging until the first message
-
-    Registers itself as a handler for the provided logger and temporarily
-    sets the logger as sensitive to INFO messages. Upon receival of the first
-    message (of at least INFO level), it configures logging with the provided
-    `config_kwargs` and prints `log_msg`
-
-    Parameters
-    ----------
-    logger : logging.Logger
-        Logger to listen for the first message
-    log_msg : str, optional
-        Message to print once logging is configured, by default None
-    **config_kwargs
-        Arguments to pass on to logging.basicConfig
-    """
-
-    def __init__(self, logger, log_msg=None, **config_kwargs):
-        super().__init__()
-
-        self.logger = logger
-        self.log_msg = log_msg
-        self.config_kwargs = config_kwargs
-
-        self.logger.addHandler(self)
-
-        # temporarily set the logging level to a non-standard value,
-        # slightly below logging.INFO == 20 and use that as a sentinel
-        # to switch back to logging.NOTSET later
-        self.logger.setLevel(19)
-
-    def emit(self, record):
-        self.logger.removeHandler(self)
-
-        if self.logger.level == 19:
-            self.logger.setLevel(logging.NOTSET)
-
-        if not self.logger.root.hasHandlers():
-            logging.basicConfig(**self.config_kwargs)
-
-        if self.log_msg is not None:
-            self.logger.info(self.log_msg)
-
-
-# Give the Handler a function like alias
-defer_logging_config = ConfigPseudoHandler
+    return f"{msg}:\n{data.head()}" + ("\n..." if len(data) > 5 else "")
diff --git a/pyam/utils.py b/pyam/utils.py
index 2e96ac540..d63b005c3 100644
--- a/pyam/utils.py
+++ b/pyam/utils.py
@@ -51,28 +51,6 @@
 }
 
 
-def requires_package(pkg, msg, error_type=ImportError):
-    """Decorator when a function requires an optional dependency
-
-    Parameters
-    ----------
-    pkg : imported package object
-    msg : string
-        Message to show to user with error_type
-    error_type : python error class
-    """
-
-    def _requires_package(func):
-        def wrapper(*args, **kwargs):
-            if pkg is None:
-                raise error_type(msg)
-            return func(*args, **kwargs)
-
-        return wrapper
-
-    return _requires_package
-
-
 def isstr(x):
     # TODO deprecated, remove for release >= 2.1
     deprecation_warning("Please use `pyam.str.is_str()`.", "The function `isstr()`")
@@ -152,7 +130,7 @@ def read_pandas(path, sheet_name=["data*", "Data*"], *args, **kwargs):
         # apply pattern-matching for sheet names (use * as wildcard)
         sheets = sheet_names[pattern_match(sheet_names, values=sheets)]
         if sheets.empty:
-            raise ValueError(f"No sheets {sheet_name} in file {path}!")
+            raise ValueError(f"Sheet(s) '{sheet_name}' not found in file '{path}'.")
 
         df = pd.concat([xl.parse(s, *args, **kwargs) for s in sheets])
 
@@ -217,7 +195,9 @@ def _knead_data(df, **kwargs):
         dfs = []
         for v in value:
             if v not in df.columns:
-                raise ValueError("column `{}` does not exist!".format(v))
+                raise ValueError(
+                    f"Column `{v}` not in timeseries data, found: {df.columns}"
+                )
             vdf = _df[v].to_frame().rename(columns={v: "value"})
             vdf["variable"] = v
             dfs.append(vdf.reset_index())
@@ -226,7 +206,9 @@ def _knead_data(df, **kwargs):
     # otherwise, rename columns or concat to IAMC-style or do a fill-by-value
     for col, value in kwargs.items():
         if col in df:
-            raise ValueError(f"Conflict of kwarg with column `{col}` in dataframe!")
+            raise ValueError(
+                f"Conflict of kwarg with column `{col}` in timeseries data."
+            )
 
         if is_str(value) and value in df:
             df.rename(columns={value: col}, inplace=True)
@@ -236,7 +218,7 @@ def _knead_data(df, **kwargs):
         elif is_str(value):
             df[col] = value
         else:
-            raise ValueError(f"Invalid argument for casting `{col}: {value}`")
+            raise ValueError(f"Invalid argument for casting data: `{col}: {value}`")
 
     return df
 
@@ -244,7 +226,7 @@ def _knead_data(df, **kwargs):
 
 def _format_from_legacy_database(df):
     """Process data from legacy databases (SSP and earlier)"""
-    logger.info("Ignoring notes column in `data`")
+    logger.info("Ignoring notes column in `data`.")
     df.drop(columns="notes", inplace=True)
     col = df.columns[0]  # first column has database copyright notice
     df = df[~df[col].str.contains("database", case=False)]
@@ -270,7 +252,7 @@ def _intuit_column_groups(df, index, include_index=False):
 
     # check that there is no column in the timeseries data with reserved names
     if None in existing_cols:
-        raise ValueError("Unnamed column in `data`: None")
+        raise ValueError("Unnamed column in timeseries data: None")
 
     # check that there is no column in the timeseries data with reserved/illegal names
     conflict_cols = [i for i in existing_cols if i in ILLEGAL_COLS]
@@ -286,11 +268,13 @@ def _intuit_column_groups(df, index, include_index=False):
     # check that index and required columns exist
     missing_index = [c for c in index if c not in existing_cols]
     if missing_index:
-        raise ValueError(f"Missing index columns: {missing_index}")
+        raise ValueError(f"Missing index columns in timeseries data: {missing_index}")
 
     missing_required_col = [c for c in REQUIRED_COLS if c not in existing_cols]
     if missing_required_col:
-        raise ValueError(f"Missing required columns: {missing_required_col}")
+        raise ValueError(
+            f"Missing required columns in timeseries data: {missing_required_col}"
+        )
 
     # check whether data in wide format (standard IAMC) or long format (`value` column)
     if "value" in existing_cols:
@@ -300,7 +284,7 @@ def _intuit_column_groups(df, index, include_index=False):
         elif "time" in existing_cols and "year" not in existing_cols:
             time_col = "time"
         else:
-            raise ValueError("Invalid time domain, must have either `year` or `time`!")
+            raise ValueError("Invalid time domain, must have either `year` or `time`.")
         extra_cols = [
             c
            for c in existing_cols
@@ -334,7 +318,7 @@ def _intuit_column_groups(df, index, include_index=False):
             time_col = "time"
         data_cols = sorted(year_cols) + sorted(time_cols)
         if not data_cols:
-            raise ValueError("Missing time domain")
+            raise ValueError("No time domain in the data.")
 
     return time_col, extra_cols, data_cols
 
@@ -438,7 +422,7 @@ def format_data(df, index, **kwargs):
         )
         del rows
     if df.empty:
-        logger.warning("Formatted data is empty!")
+        logger.warning("Formatted data is empty.")
 
     return df.sort_index(), index, time_col, extra_cols
 
@@ -469,7 +453,7 @@ def merge_meta(left, right, ignore_conflict=False):
     diff = right.index.difference(left.index)
     sect = right.index.intersection(left.index)
 
-    # merge `right` into `left` for overlapping scenarios ( `sect`)
+    # merge `right` into `left` for overlapping scenarios (`sect`)
     if not sect.empty:
         # if not ignored, check that overlapping `meta` columns are equal
         if not ignore_conflict:
@@ -480,7 +464,7 @@ def merge_meta(left, right, ignore_conflict=False):
                 .drop_duplicates()
                 .index.drop_duplicates()
             )
-            msg = "conflict in `meta` for scenarios {}".format(
+            msg = "Conflict in `meta` for scenarios {}".format(
                 [i for i in pd.DataFrame(index=conflict_idx).index]
            )
             raise ValueError(msg)
diff --git a/tests/test_core.py b/tests/test_core.py
index 7fafb992c..f10b2a08b 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -206,7 +206,7 @@ def test_init_df_with_custom_index(test_pd_df):
 
 def test_init_empty_message(caplog):
     IamDataFrame(data=df_empty)
-    drop_message = "Formatted data is empty!"
+    drop_message = "Formatted data is empty."
     message_idx = caplog.messages.index(drop_message)
     assert caplog.records[message_idx].levelno == logging.WARNING
 
@@ -216,7 +216,7 @@ def test_init_with_unnamed_column(test_pd_df):
     test_pd_df[None] = "foo"
 
     # check that initialising an instance with an unnamed column raises
-    with pytest.raises(ValueError, match="Unnamed column in `data`: None"):
+    with pytest.raises(ValueError, match="Unnamed column in timeseries data: None"):
        IamDataFrame(test_pd_df)
 
diff --git a/tests/test_io.py b/tests/test_io.py
index 330df8fe4..c358e9531 100644
--- a/tests/test_io.py
+++ b/tests/test_io.py
@@ -198,7 +198,7 @@ def test_load_meta_wrong_index(test_df_year, tmpdir):
     file = tmpdir / "testing_meta_empty.xlsx"
     pd.DataFrame(columns=["model", "foo"]).to_excel(file, index=False)
 
-    match = ".* \(sheet meta\) missing required index columns \['scenario'\]\!"
+    match = "Missing index columns for meta indicators: \['scenario'\]"
     with pytest.raises(ValueError, match=match):
         test_df_year.load_meta(file)
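
Note: the patch attaches a shared StreamHandler with the format "%(name)s - %(levelname)s: %(message)s" to both the pyam and ixmp4 loggers when running in a notebook. As a minimal sketch (not part of this patch) of how a user could reproduce or extend that behaviour in their own notebook, using only the standard library; the logger name "some_package" is only a placeholder:

    import logging

    # formatter mirroring the one pyam attaches in a notebook (see pyam/__init__.py above)
    formatter = logging.Formatter("%(name)s - %(levelname)s: %(message)s")

    handler = logging.StreamHandler()
    handler.setFormatter(formatter)

    # attach the same handler to another library's logger ("some_package" is hypothetical)
    logging.getLogger("some_package").addHandler(handler)

    # optionally adjust the verbosity of the pyam logger
    logging.getLogger("pyam").setLevel(logging.WARNING)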
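
The new format_log_message() helper introduced in pyam/logging.py is what load_meta() uses for its warnings. A small usage sketch, assuming a pyam version that includes this change; the model/scenario names are illustrative only:

    import pandas as pd
    from pyam.logging import format_log_message

    # a MultiIndex standing in for scenarios without meta indicators
    missing = pd.MultiIndex.from_tuples(
        [("model_a", "scen_a")], names=["model", "scenario"]
    )
    # prints the message followed by the head of the scenario table
    print(format_log_message("No meta indicators for the following scenarios", missing))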