From 89c4c81e9e81096d4dd2642c553bf1ca2b272a13 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 24 Apr 2024 15:13:44 +0200 Subject: [PATCH 01/22] Speed up caching of audbcards.Dataset --- audbcards/core/datacard.py | 3 ++- audbcards/core/dataset.py | 18 ++++-------------- 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/audbcards/core/datacard.py b/audbcards/core/datacard.py index 38b8311..ffdd515 100644 --- a/audbcards/core/datacard.py +++ b/audbcards/core/datacard.py @@ -114,7 +114,8 @@ def example_media(self) -> typing.Optional[str]: ) # Download of example data might fail try: - media = self.dataset.deps.media[index] + files = audb.info.files(self.dataset.name, version=self.dataset.version) + media = files[index] audb.load_media( self.dataset.name, media, diff --git a/audbcards/core/dataset.py b/audbcards/core/dataset.py index a6d2269..00451e1 100644 --- a/audbcards/core/dataset.py +++ b/audbcards/core/dataset.py @@ -4,7 +4,6 @@ import pickle import typing -import dohq_artifactory import jinja2 import pandas as pd @@ -18,19 +17,6 @@ from audbcards.core.utils import limit_presented_samples -def _getstate(self): - return self.name - - -def _setstate(self, state): - self.name = state - - -# Ensure we can pickle the repository -dohq_artifactory.GenericRepository.__getstate__ = _getstate -dohq_artifactory.GenericRepository.__setstate__ = _setstate - - class _Dataset: @classmethod def create( @@ -95,6 +81,10 @@ def __init__( for other_version in other_versions: audeer.rmdir(audeer.path(self.cache_root, name, other_version)) + def __get_state__(self): + r"""Returns attributes to be pickled.""" + return self.properties + @staticmethod def _dataset_cache_path(name: str, version: str, cache_root: str) -> str: r"""Generate the name of the cache file.""" From 164d482728141372759a92509d457d0b49b6898e Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 24 Apr 2024 15:22:09 +0200 Subject: [PATCH 02/22] Try to fix --- audbcards/core/dataset.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/audbcards/core/dataset.py b/audbcards/core/dataset.py index 00451e1..6e30b6f 100644 --- a/audbcards/core/dataset.py +++ b/audbcards/core/dataset.py @@ -4,6 +4,7 @@ import pickle import typing +# import dohq_artifactory import jinja2 import pandas as pd @@ -17,6 +18,19 @@ from audbcards.core.utils import limit_presented_samples +# def _getstate(self): +# return self.name +# +# +# def _setstate(self, state): +# self.name = state +# +# +# # Ensure we can pickle the repository +# dohq_artifactory.GenericRepository.__getstate__ = _getstate +# dohq_artifactory.GenericRepository.__setstate__ = _setstate + + class _Dataset: @classmethod def create( @@ -81,7 +95,7 @@ def __init__( for other_version in other_versions: audeer.rmdir(audeer.path(self.cache_root, name, other_version)) - def __get_state__(self): + def __getstate__(self): r"""Returns attributes to be pickled.""" return self.properties From a0e1d1be695927835e686fb6b7a300ed78e4e474 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 24 Apr 2024 15:24:23 +0200 Subject: [PATCH 03/22] Another fix --- audbcards/core/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/audbcards/core/dataset.py b/audbcards/core/dataset.py index 6e30b6f..a7e4a31 100644 --- a/audbcards/core/dataset.py +++ b/audbcards/core/dataset.py @@ -97,7 +97,7 @@ def __init__( def __getstate__(self): r"""Returns attributes to be pickled.""" - return self.properties + return self.properties() @staticmethod def _dataset_cache_path(name: str, version: str, cache_root: str) -> str: From 784921b0a0c5e01d4a9029e248af4d2c4c182d63 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 24 Apr 2024 15:42:17 +0200 Subject: [PATCH 04/22] Improve execution after loading from cache --- audbcards/core/dataset.py | 53 ++++++++++++++++++++++++--------------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/audbcards/core/dataset.py b/audbcards/core/dataset.py index a7e4a31..7feb77c 100644 --- a/audbcards/core/dataset.py +++ b/audbcards/core/dataset.py @@ -4,7 +4,6 @@ import pickle import typing -# import dohq_artifactory import jinja2 import pandas as pd @@ -18,19 +17,6 @@ from audbcards.core.utils import limit_presented_samples -# def _getstate(self): -# return self.name -# -# -# def _setstate(self, state): -# self.name = state -# -# -# # Ensure we can pickle the repository -# dohq_artifactory.GenericRepository.__getstate__ = _getstate -# dohq_artifactory.GenericRepository.__setstate__ = _setstate - - class _Dataset: @classmethod def create( @@ -62,19 +48,24 @@ def __init__( version: str, cache_root: str = None, ): - self.cache_root = audeer.mkdir(audeer.path(cache_root)) - self.header = audb.info.header( + self.cache_root = audeer.mkdir(cache_root) + + # Store name and version in private attributes here, + # ``self.name`` and ``self.version`` + # are implemented as cached properties below + self._name = name + self._version = version + + self._header = audb.info.header( name, version=version, load_tables=True, # ensure misc tables are loaded ) - self.deps = audb.dependencies( + self._deps = audb.dependencies( name, version=version, verbose=False, ) - - self._version = version self._repository = audb.repository(name, version) self._backend = audbackend.access( name=self._repository.backend, @@ -127,6 +118,28 @@ def _save_pickled(obj, path: str): with open(path, "wb") as f: pickle.dump(obj, f, protocol=4) + @property + def deps(self) -> audb.Dependencies: + r"""Dataset dependency table.""" + if self._deps is None: # when loaded from cache + self._deps = audb.dependencies( + self.name, + version=self.version, + verbose=False, + ) + return self._deps + + @property + def header(self) -> audformat.Database: + r"""Dataset header.""" + if self._header is None: # when loaded from cache + self._header = audb.info.header( + self.name, + version=self.version, + load_tables=True, # ensure misc tables are loaded + ) + return self._header + @functools.cached_property def archives(self) -> int: r"""Number of archives of media files in dataset.""" @@ -232,7 +245,7 @@ def license_link(self) -> typing.Optional[str]: @functools.cached_property def name(self) -> str: r"""Name of dataset.""" - return self.header.name + return self._name @functools.cached_property def publication_date(self) -> str: From 99e42090c485fa256e4934194f719e69d20ed790 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 24 Apr 2024 15:50:05 +0200 Subject: [PATCH 05/22] Use hidden attributes --- audbcards/core/dataset.py | 66 +++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/audbcards/core/dataset.py b/audbcards/core/dataset.py index 7feb77c..7726911 100644 --- a/audbcards/core/dataset.py +++ b/audbcards/core/dataset.py @@ -48,7 +48,7 @@ def __init__( version: str, cache_root: str = None, ): - self.cache_root = audeer.mkdir(cache_root) + audeer.mkdir(cache_root) # Store name and version in private attributes here, # ``self.name`` and ``self.version`` @@ -79,12 +79,12 @@ def __init__( # by removing all other versions of the same dataset # to reduce its storage size in CI runners versions = audeer.list_dir_names( - audeer.path(self.cache_root, name), + audeer.path(cache_root, name), basenames=True, ) other_versions = [v for v in versions if v != version] for other_version in other_versions: - audeer.rmdir(audeer.path(self.cache_root, name, other_version)) + audeer.rmdir(cache_root, name, other_version) def __getstate__(self): r"""Returns attributes to be pickled.""" @@ -93,7 +93,7 @@ def __getstate__(self): @staticmethod def _dataset_cache_path(name: str, version: str, cache_root: str) -> str: r"""Generate the name of the cache file.""" - cache_dir = audeer.mkdir(audeer.path(cache_root, name, version)) + cache_dir = audeer.mkdir(cache_root, name, version) cache_filename = audeer.path( cache_dir, @@ -143,12 +143,12 @@ def header(self) -> audformat.Database: @functools.cached_property def archives(self) -> int: r"""Number of archives of media files in dataset.""" - return len(set([self.deps.archive(file) for file in self.deps.media])) + return len(set([self._deps.archive(file) for file in self._deps.media])) @functools.cached_property def author(self) -> typing.List[str]: r"""Authors of the database.""" - return self.header.author + return self._header.author @functools.cached_property def bit_depths(self) -> typing.List[int]: @@ -157,9 +157,9 @@ def bit_depths(self) -> typing.List[int]: list( set( [ - self.deps.bit_depth(file) - for file in self.deps.media - if self.deps.bit_depth(file) + self._deps.bit_depth(file) + for file in self._deps.media + if self._deps.bit_depth(file) ] ) ) @@ -172,9 +172,9 @@ def channels(self) -> typing.List[int]: list( set( [ - self.deps.channels(file) - for file in self.deps.media - if self.deps.channels(file) + self._deps.channels(file) + for file in self._deps.media + if self._deps.channels(file) ] ) ) @@ -183,12 +183,12 @@ def channels(self) -> typing.List[int]: @functools.cached_property def description(self) -> str: r"""Source of the database.""" - return self.header.description + return self._header.description @functools.cached_property def duration(self) -> pd.Timedelta: r"""Total duration of media files in dataset.""" - durations = [self.deps.duration(file) for file in self.deps.media] + durations = [self._deps.duration(file) for file in self._deps.media] return pd.to_timedelta( sum([d for d in durations if d is not None]), unit="s", @@ -197,22 +197,22 @@ def duration(self) -> pd.Timedelta: @functools.cached_property def files(self) -> int: r"""Number of media files in dataset.""" - return len(self.deps.media) + return len(self._deps.media) @functools.cached_property def file_durations(self) -> typing.List: r"""File durations in dataset in seconds.""" - return [self.deps.duration(file) for file in self.deps.media] + return [self._deps.duration(file) for file in self._deps.media] @functools.cached_property def formats(self) -> typing.List[str]: r"""File formats of media files in dataset.""" - return sorted(list(set([self.deps.format(file) for file in self.deps.media]))) + return sorted(list(set([self._deps.format(file) for file in self._deps.media]))) @functools.cached_property def languages(self) -> typing.List[str]: r"""Languages of the database.""" - return self.header.languages + return self._header.languages @functools.cached_property def iso_languages(self) -> typing.List[str]: @@ -227,7 +227,7 @@ def license(self) -> str: ``'Unknown'`` is returned. """ - return self.header.license or "Unknown" + return self._header.license or "Unknown" @functools.cached_property def license_link(self) -> typing.Optional[str]: @@ -237,10 +237,10 @@ def license_link(self) -> typing.Optional[str]: ``None`` is returned. """ - if self.header.license_url is None or len(self.header.license_url) == 0: + if self._header.license_url is None or len(self._header.license_url) == 0: return None else: - return self.header.license_url + return self._header.license_url @functools.cached_property def name(self) -> str: @@ -293,9 +293,9 @@ def sampling_rates(self) -> typing.List[int]: list( set( [ - self.deps.sampling_rate(file) - for file in self.deps.media - if self.deps.sampling_rate(file) + self._deps.sampling_rate(file) + for file in self._deps.media + if self._deps.sampling_rate(file) ] ) ) @@ -304,7 +304,7 @@ def sampling_rates(self) -> typing.List[int]: @functools.cached_property def schemes(self) -> typing.List[str]: r"""Schemes of dataset.""" - return list(self.header.schemes) + return list(self._header.schemes) @functools.cached_property def schemes_table(self) -> typing.List[typing.List[str]]: @@ -314,7 +314,7 @@ def schemes_table(self) -> typing.List[typing.List[str]]: with column names as keys. """ - db = self.header + db = self._header dataset_schemes = [] for scheme_id in db.schemes: dataset_scheme = self._scheme_to_list(scheme_id) @@ -333,7 +333,7 @@ def schemes_table(self) -> typing.List[typing.List[str]]: def short_description(self) -> str: r"""Description of dataset shortened to 150 chars.""" length = 150 - description = self.header.description or "" + description = self._header.description or "" # Fix RST used signs description = description.replace("`", "'") if len(description) > length: @@ -343,12 +343,12 @@ def short_description(self) -> str: @functools.cached_property def source(self) -> str: r"""Source of the database.""" - return self.header.source + return self._header.source @functools.cached_property def tables(self) -> typing.List[str]: """Tables of the dataset.""" - db = self.header + db = self._header tables = list(db) return tables @@ -356,7 +356,7 @@ def tables(self) -> typing.List[str]: def tables_table(self) -> typing.List[str]: """Tables of the dataset.""" table_list = [["ID", "Type", "Columns"]] - db = self.header + db = self._header for table_id in self.tables: table = db[table_id] if isinstance(table, audformat.MiscTable): @@ -371,7 +371,7 @@ def tables_table(self) -> typing.List[str]: @functools.cached_property def usage(self) -> str: r"""Usage of the database.""" - return self.header.usage + return self._header.usage @functools.cached_property def version(self) -> str: @@ -390,7 +390,7 @@ def _scheme_table_columns(self) -> typing.List[str]: ``'Mappings'``. """ - schemes = self.header.schemes + schemes = self._header.schemes if len(schemes) == 0: return [] @@ -410,7 +410,7 @@ def _scheme_table_columns(self) -> typing.List[str]: return columns def _scheme_to_list(self, scheme_id): - db = self.header + db = self._header scheme_info = self._scheme_table_columns scheme = db.schemes[scheme_id] From dc03a6349b5f0a8cdaf1f1ef5efb6d5fc718d872 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 24 Apr 2024 16:36:25 +0200 Subject: [PATCH 06/22] Add audbcards.Dataset.schemes_summary --- audbcards/core/dataset.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/audbcards/core/dataset.py b/audbcards/core/dataset.py index 7726911..8455fd1 100644 --- a/audbcards/core/dataset.py +++ b/audbcards/core/dataset.py @@ -306,6 +306,18 @@ def schemes(self) -> typing.List[str]: r"""Schemes of dataset.""" return list(self._header.schemes) + @functools.cached_property + def schemes_summary(self) -> str: + r"""Summary of dataset schemes. + + It lists all schemes in a string, + showing additional information + on schemes named ``'emotion'`` and ``'speaker'``, + e.g. ``'speaker: [age, gender, language]'``. + + """ + return format_schemes(self._header.schemes) + @functools.cached_property def schemes_table(self) -> typing.List[typing.List[str]]: """Schemes table with name, type, min, max, labels, mappings. @@ -606,7 +618,7 @@ def create_datasets_page( dataset.short_description, f"`{dataset.license} <{dataset.license_link}>`__", dataset.version, - format_schemes(dataset.header.schemes), + dataset.schemes_summary, ) for dataset in datasets ] From 81bed5254ef29b8ac5494910ff8a086964f5e6df Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 24 Apr 2024 16:44:21 +0200 Subject: [PATCH 07/22] Ensure header and deps are loaded on the fly --- audbcards/core/datacard.py | 3 +- audbcards/core/dataset.py | 64 +++++++++++++++++++------------------- 2 files changed, 33 insertions(+), 34 deletions(-) diff --git a/audbcards/core/datacard.py b/audbcards/core/datacard.py index ffdd515..38b8311 100644 --- a/audbcards/core/datacard.py +++ b/audbcards/core/datacard.py @@ -114,8 +114,7 @@ def example_media(self) -> typing.Optional[str]: ) # Download of example data might fail try: - files = audb.info.files(self.dataset.name, version=self.dataset.version) - media = files[index] + media = self.dataset.deps.media[index] audb.load_media( self.dataset.name, media, diff --git a/audbcards/core/dataset.py b/audbcards/core/dataset.py index 8455fd1..08e32b1 100644 --- a/audbcards/core/dataset.py +++ b/audbcards/core/dataset.py @@ -121,7 +121,7 @@ def _save_pickled(obj, path: str): @property def deps(self) -> audb.Dependencies: r"""Dataset dependency table.""" - if self._deps is None: # when loaded from cache + if not hasattr(self, "_deps"): # when loaded from cache self._deps = audb.dependencies( self.name, version=self.version, @@ -132,7 +132,7 @@ def deps(self) -> audb.Dependencies: @property def header(self) -> audformat.Database: r"""Dataset header.""" - if self._header is None: # when loaded from cache + if not hasattr(self, "_header"): # when loaded from cache self._header = audb.info.header( self.name, version=self.version, @@ -143,12 +143,12 @@ def header(self) -> audformat.Database: @functools.cached_property def archives(self) -> int: r"""Number of archives of media files in dataset.""" - return len(set([self._deps.archive(file) for file in self._deps.media])) + return len(set([self.deps.archive(file) for file in self.deps.media])) @functools.cached_property def author(self) -> typing.List[str]: r"""Authors of the database.""" - return self._header.author + return self.header.author @functools.cached_property def bit_depths(self) -> typing.List[int]: @@ -157,9 +157,9 @@ def bit_depths(self) -> typing.List[int]: list( set( [ - self._deps.bit_depth(file) - for file in self._deps.media - if self._deps.bit_depth(file) + self.deps.bit_depth(file) + for file in self.deps.media + if self.deps.bit_depth(file) ] ) ) @@ -172,9 +172,9 @@ def channels(self) -> typing.List[int]: list( set( [ - self._deps.channels(file) - for file in self._deps.media - if self._deps.channels(file) + self.deps.channels(file) + for file in self.deps.media + if self.deps.channels(file) ] ) ) @@ -183,12 +183,12 @@ def channels(self) -> typing.List[int]: @functools.cached_property def description(self) -> str: r"""Source of the database.""" - return self._header.description + return self.header.description @functools.cached_property def duration(self) -> pd.Timedelta: r"""Total duration of media files in dataset.""" - durations = [self._deps.duration(file) for file in self._deps.media] + durations = [self.deps.duration(file) for file in self.deps.media] return pd.to_timedelta( sum([d for d in durations if d is not None]), unit="s", @@ -197,22 +197,22 @@ def duration(self) -> pd.Timedelta: @functools.cached_property def files(self) -> int: r"""Number of media files in dataset.""" - return len(self._deps.media) + return len(self.deps.media) @functools.cached_property def file_durations(self) -> typing.List: r"""File durations in dataset in seconds.""" - return [self._deps.duration(file) for file in self._deps.media] + return [self.deps.duration(file) for file in self.deps.media] @functools.cached_property def formats(self) -> typing.List[str]: r"""File formats of media files in dataset.""" - return sorted(list(set([self._deps.format(file) for file in self._deps.media]))) + return sorted(list(set([self.deps.format(file) for file in self.deps.media]))) @functools.cached_property def languages(self) -> typing.List[str]: r"""Languages of the database.""" - return self._header.languages + return self.header.languages @functools.cached_property def iso_languages(self) -> typing.List[str]: @@ -227,7 +227,7 @@ def license(self) -> str: ``'Unknown'`` is returned. """ - return self._header.license or "Unknown" + return self.header.license or "Unknown" @functools.cached_property def license_link(self) -> typing.Optional[str]: @@ -237,10 +237,10 @@ def license_link(self) -> typing.Optional[str]: ``None`` is returned. """ - if self._header.license_url is None or len(self._header.license_url) == 0: + if self.header.license_url is None or len(self.header.license_url) == 0: return None else: - return self._header.license_url + return self.header.license_url @functools.cached_property def name(self) -> str: @@ -293,9 +293,9 @@ def sampling_rates(self) -> typing.List[int]: list( set( [ - self._deps.sampling_rate(file) - for file in self._deps.media - if self._deps.sampling_rate(file) + self.deps.sampling_rate(file) + for file in self.deps.media + if self.deps.sampling_rate(file) ] ) ) @@ -304,7 +304,7 @@ def sampling_rates(self) -> typing.List[int]: @functools.cached_property def schemes(self) -> typing.List[str]: r"""Schemes of dataset.""" - return list(self._header.schemes) + return list(self.header.schemes) @functools.cached_property def schemes_summary(self) -> str: @@ -316,7 +316,7 @@ def schemes_summary(self) -> str: e.g. ``'speaker: [age, gender, language]'``. """ - return format_schemes(self._header.schemes) + return format_schemes(self.header.schemes) @functools.cached_property def schemes_table(self) -> typing.List[typing.List[str]]: @@ -326,7 +326,7 @@ def schemes_table(self) -> typing.List[typing.List[str]]: with column names as keys. """ - db = self._header + db = self.header dataset_schemes = [] for scheme_id in db.schemes: dataset_scheme = self._scheme_to_list(scheme_id) @@ -345,7 +345,7 @@ def schemes_table(self) -> typing.List[typing.List[str]]: def short_description(self) -> str: r"""Description of dataset shortened to 150 chars.""" length = 150 - description = self._header.description or "" + description = self.header.description or "" # Fix RST used signs description = description.replace("`", "'") if len(description) > length: @@ -355,12 +355,12 @@ def short_description(self) -> str: @functools.cached_property def source(self) -> str: r"""Source of the database.""" - return self._header.source + return self.header.source @functools.cached_property def tables(self) -> typing.List[str]: """Tables of the dataset.""" - db = self._header + db = self.header tables = list(db) return tables @@ -368,7 +368,7 @@ def tables(self) -> typing.List[str]: def tables_table(self) -> typing.List[str]: """Tables of the dataset.""" table_list = [["ID", "Type", "Columns"]] - db = self._header + db = self.header for table_id in self.tables: table = db[table_id] if isinstance(table, audformat.MiscTable): @@ -383,7 +383,7 @@ def tables_table(self) -> typing.List[str]: @functools.cached_property def usage(self) -> str: r"""Usage of the database.""" - return self._header.usage + return self.header.usage @functools.cached_property def version(self) -> str: @@ -402,7 +402,7 @@ def _scheme_table_columns(self) -> typing.List[str]: ``'Mappings'``. """ - schemes = self._header.schemes + schemes = self.header.schemes if len(schemes) == 0: return [] @@ -422,7 +422,7 @@ def _scheme_table_columns(self) -> typing.List[str]: return columns def _scheme_to_list(self, scheme_id): - db = self._header + db = self.header scheme_info = self._scheme_table_columns scheme = db.schemes[scheme_id] From 53ecbbb06c414aa13cce2f692f08d5aea39026e8 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 24 Apr 2024 16:58:36 +0200 Subject: [PATCH 08/22] Add docstring example --- audbcards/core/dataset.py | 5 +++++ audbcards/sphinx/__init__.py | 1 + pyproject.toml | 4 ++++ 3 files changed, 10 insertions(+) diff --git a/audbcards/core/dataset.py b/audbcards/core/dataset.py index 08e32b1..d729abc 100644 --- a/audbcards/core/dataset.py +++ b/audbcards/core/dataset.py @@ -315,6 +315,11 @@ def schemes_summary(self) -> str: on schemes named ``'emotion'`` and ``'speaker'``, e.g. ``'speaker: [age, gender, language]'``. + Example: + >>> ds = Dataset("emodb", "1.4.1") + >>> ds.schemes_summary + 'emotion: [anger, ..., neutral], speaker: [age, ..., language], ...' + """ return format_schemes(self.header.schemes) diff --git a/audbcards/sphinx/__init__.py b/audbcards/sphinx/__init__.py index 349f474..89e4f94 100644 --- a/audbcards/sphinx/__init__.py +++ b/audbcards/sphinx/__init__.py @@ -1,6 +1,7 @@ import os import sphinx +import sphinx.application import audb import audeer diff --git a/pyproject.toml b/pyproject.toml index 85f76c7..965c494 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,6 +70,10 @@ skip = './audbcards.egg-info,./build' [tool.pytest.ini_options] cache_dir = '.cache/pytest' xfail_strict = true +addopts = ''' + --doctest-modules + --ignore=docs/ +''' # ----- ruff -------------------------------------------------------------- From e572acf29090621ba8419c5701b8ab99768cae77 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 24 Apr 2024 16:59:27 +0200 Subject: [PATCH 09/22] Readd self.cache_root --- audbcards/core/dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/audbcards/core/dataset.py b/audbcards/core/dataset.py index d729abc..dfbb5f4 100644 --- a/audbcards/core/dataset.py +++ b/audbcards/core/dataset.py @@ -48,7 +48,8 @@ def __init__( version: str, cache_root: str = None, ): - audeer.mkdir(cache_root) + self.cache_root = audeer.mkdir(cache_root) + r"""Cache root folder.""" # Store name and version in private attributes here, # ``self.name`` and ``self.version`` From 757c3faf92d5b6d9d9aa396fff90fa5f625ddb70 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 24 Apr 2024 17:02:24 +0200 Subject: [PATCH 10/22] Do not test docstrings of sphinx extension --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 965c494..5576887 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,6 +73,7 @@ xfail_strict = true addopts = ''' --doctest-modules --ignore=docs/ + --ignore=audbcards/sphinx/ ''' From baebd985756b406995a8775ff55d9d1d8507052e Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 25 Apr 2024 09:32:57 +0200 Subject: [PATCH 11/22] Ensure everything is loaded, starting from cache --- audbcards/core/dataset.py | 80 ++++++++++++++++++++++++--------------- tests/test_dataset.py | 3 +- 2 files changed, 51 insertions(+), 32 deletions(-) diff --git a/audbcards/core/dataset.py b/audbcards/core/dataset.py index dfbb5f4..8088e49 100644 --- a/audbcards/core/dataset.py +++ b/audbcards/core/dataset.py @@ -57,24 +57,12 @@ def __init__( self._name = name self._version = version - self._header = audb.info.header( - name, - version=version, - load_tables=True, # ensure misc tables are loaded - ) - self._deps = audb.dependencies( - name, - version=version, - verbose=False, - ) - self._repository = audb.repository(name, version) - self._backend = audbackend.access( - name=self._repository.backend, - host=self._repository.host, - repository=self._repository.name, - ) - if isinstance(self._backend, audbackend.Artifactory): - self._backend._use_legacy_file_structure() # pragma: nocover + # Private attributes, + # used inside corresponding properties. + self._header = self._load_header() + self._deps = self._load_dependencies() + self._repository_object = self._load_repository_object() # load before backend + self._backend = self._load_backend() # Clean up cache # by removing all other versions of the same dataset @@ -119,28 +107,34 @@ def _save_pickled(obj, path: str): with open(path, "wb") as f: pickle.dump(obj, f, protocol=4) + @property + def backend(self) -> audbackend.Backend: + r"""Dataset dependency table.""" + if not hasattr(self, "_backend"): # when loaded from cache + self._backend = self._load_backend() + return self._backend + @property def deps(self) -> audb.Dependencies: r"""Dataset dependency table.""" if not hasattr(self, "_deps"): # when loaded from cache - self._deps = audb.dependencies( - self.name, - version=self.version, - verbose=False, - ) + self._deps = self._load_dependencies() return self._deps @property def header(self) -> audformat.Database: r"""Dataset header.""" if not hasattr(self, "_header"): # when loaded from cache - self._header = audb.info.header( - self.name, - version=self.version, - load_tables=True, # ensure misc tables are loaded - ) + self._header = self._load_header() return self._header + @property + def repository_object(self) -> audb.Repository: + r"""Repository containing dataset.""" + if not hasattr(self, "_repository_object"): # when loaded from cache + self._repository_object = self._load_repository_object() + return self._repository_object + @functools.cached_property def archives(self) -> int: r"""Number of archives of media files in dataset.""" @@ -273,7 +267,7 @@ def properties(self): @functools.cached_property def repository(self) -> str: r"""Repository containing the dataset.""" - return f"{self._repository.name}" + return f"{self.repository_object.name}" @functools.cached_property def repository_link(self) -> str: @@ -281,9 +275,9 @@ def repository_link(self) -> str: # NOTE: this needs to be changed # as we want to support different backends return ( - f"{self._repository.host}/" + f"{self.repository_object.host}/" f"webapp/#/artifacts/browse/tree/General/" - f"{self._repository.name}/" + f"{self.repository}/" f"{self.name}" ) @@ -396,6 +390,30 @@ def version(self) -> str: r"""Version of dataset.""" return self._version + def _load_backend(self) -> audbackend.Backend: + r"""Load backend containing dataset.""" + backend = audbackend.access( + name=self._repository_object.backend, + host=self._repository_object.host, + repository=self.repository, + ) + if isinstance(backend, audbackend.Artifactory): + backend._use_legacy_file_structure() # pragma: nocover + return backend + + def _load_dependencies(self) -> audb.Dependencies: + r"""Load dataset dependencies.""" + return audb.dependencies(self.name, version=self.version, verbose=False) + + def _load_header(self) -> audformat.Database: + r"""Load dataset header.""" + # Ensure misc tables are loaded + return audb.info.header(self.name, version=self.version, load_tables=True) + + def _load_repository_object(self) -> audb.Repository: + r"""Load repository object containing dataset.""" + return audb.repository(self.name, self.version) + @functools.cached_property def _scheme_table_columns(self) -> typing.List[str]: """Column names for the scheme table. diff --git a/tests/test_dataset.py b/tests/test_dataset.py index dab2c45..4c9836c 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -73,7 +73,8 @@ def test_dataset(audb_cache, tmpdir, repository, db, request): # __init__ assert dataset.name == db.name assert dataset.version == pytest.VERSION - assert dataset._repository == repository + assert dataset.repository_object == repository + assert dataset.backend == backend expected_header = audb.info.header( db.name, version=pytest.VERSION, From a24e699686ff8ed8f3e3732d71bb9a5c23274d5f Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 25 Apr 2024 10:30:28 +0200 Subject: [PATCH 12/22] Improve code and tests --- audbcards/core/dataset.py | 12 +++++----- tests/requirements.txt | 1 + tests/test_dataset.py | 46 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 6 deletions(-) diff --git a/audbcards/core/dataset.py b/audbcards/core/dataset.py index 8088e49..2c4965f 100644 --- a/audbcards/core/dataset.py +++ b/audbcards/core/dataset.py @@ -245,14 +245,14 @@ def name(self) -> str: @functools.cached_property def publication_date(self) -> str: r"""Date dataset was uploaded to repository.""" - path = self._backend.join("/", self.name, "db.yaml") - return self._backend.date(path, self._version) + path = self.backend.join("/", self.name, "db.yaml") + return self.backend.date(path, self.version) @functools.cached_property def publication_owner(self) -> str: r"""User who uploaded dataset to repository.""" - path = self._backend.join("/", self.name, "db.yaml") - return self._backend.owner(path, self._version) + path = self.backend.join("/", self.name, "db.yaml") + return self.backend.owner(path, self.version) def properties(self): """Get list of properties of the object.""" @@ -393,8 +393,8 @@ def version(self) -> str: def _load_backend(self) -> audbackend.Backend: r"""Load backend containing dataset.""" backend = audbackend.access( - name=self._repository_object.backend, - host=self._repository_object.host, + name=self.repository_object.backend, + host=self.repository_object.host, repository=self.repository, ) if isinstance(backend, audbackend.Artifactory): diff --git a/tests/requirements.txt b/tests/requirements.txt index dfb4542..0949692 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -1,2 +1,3 @@ +audb >=1.6.5 # for audb.Dependencies.__eq__() audeer >=1.21.0 pytest diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 4c9836c..e69c5c0 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -365,3 +365,49 @@ def test_dataset_cache_path(): "emodb-1.2.1.pkl", ) assert cache_path_calculated == cache_path_expected + + +@pytest.mark.parametrize( + "db", + [ + "medium_db", + ], +) +def test_dataset_cache_loading(audb_cache, tmpdir, repository, db, request): + """Test cached properties after loading from cache. + + We no longer store all attributes/properties + in cache as pickle files, + but limit ourselves to the cached properties. + This test ensures, + that other attributes will be re-calculated. + + """ + db = request.getfixturevalue(db) + cache_root = audeer.mkdir(tmpdir, "cache") + dataset = audbcards.Dataset(db.name, pytest.VERSION, cache_root=cache_root) + del dataset + dataset = audbcards.Dataset(db.name, pytest.VERSION, cache_root=cache_root) + deps = audb.dependencies( + db.name, + version=pytest.VERSION, + cache_root=audb_cache, + ) + backend = audbackend.access( + name=repository.backend, + host=repository.host, + repository=repository.name, + ) + # header = audb.info.header( + # db.name, + # version=pytest.VERSION, + # cache_root=audb_cache, + # ) + assert dataset.backend == backend + assert dataset.deps == deps + # Disabled due to potential audformat issue: + # the `files` table cannot be found in cache, + # which is requested when using `load_tables=True` + # in `audb.info.header`. + # assert dataset.header == header + assert dataset.repository_object == repository From 831992553b50cbcc8212e0d60c53a7afb137504a Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 25 Apr 2024 11:50:27 +0200 Subject: [PATCH 13/22] Remove doctest --- audbcards/core/dataset.py | 5 ----- pyproject.toml | 5 ----- 2 files changed, 10 deletions(-) diff --git a/audbcards/core/dataset.py b/audbcards/core/dataset.py index 2c4965f..71c22f4 100644 --- a/audbcards/core/dataset.py +++ b/audbcards/core/dataset.py @@ -310,11 +310,6 @@ def schemes_summary(self) -> str: on schemes named ``'emotion'`` and ``'speaker'``, e.g. ``'speaker: [age, gender, language]'``. - Example: - >>> ds = Dataset("emodb", "1.4.1") - >>> ds.schemes_summary - 'emotion: [anger, ..., neutral], speaker: [age, ..., language], ...' - """ return format_schemes(self.header.schemes) diff --git a/pyproject.toml b/pyproject.toml index 5576887..85f76c7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,11 +70,6 @@ skip = './audbcards.egg-info,./build' [tool.pytest.ini_options] cache_dir = '.cache/pytest' xfail_strict = true -addopts = ''' - --doctest-modules - --ignore=docs/ - --ignore=audbcards/sphinx/ -''' # ----- ruff -------------------------------------------------------------- From c086b3fb1ecfcefd04115ea53dc39977f4fc40ec Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Fri, 26 Apr 2024 11:21:42 +0200 Subject: [PATCH 14/22] Re-enable test for same header --- tests/test_dataset.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/test_dataset.py b/tests/test_dataset.py index e69c5c0..636220d 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -398,16 +398,16 @@ def test_dataset_cache_loading(audb_cache, tmpdir, repository, db, request): host=repository.host, repository=repository.name, ) - # header = audb.info.header( - # db.name, - # version=pytest.VERSION, - # cache_root=audb_cache, - # ) + header = audb.info.header( + db.name, + version=pytest.VERSION, + load_tables=True, + cache_root=audb_cache, + ) assert dataset.backend == backend assert dataset.deps == deps - # Disabled due to potential audformat issue: - # the `files` table cannot be found in cache, - # which is requested when using `load_tables=True` - # in `audb.info.header`. - # assert dataset.header == header + # The dataset header is a not fully loaded `audformat.Database` object, + # so we cannot directly use `audformat.Database.__eq__()` + # to compare it. + assert str(dataset.header) == str(header) assert dataset.repository_object == repository From 4a4a4cb7534cafeeb59b262682d5e8ffc1dea174 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Fri, 26 Apr 2024 11:27:53 +0200 Subject: [PATCH 15/22] Rename Dataset.properties() to cached_properties() --- audbcards/core/datacard.py | 2 +- audbcards/core/dataset.py | 24 ++++++++++++------------ tests/test_dataset.py | 6 +++--- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/audbcards/core/datacard.py b/audbcards/core/datacard.py index 38b8311..5393d46 100644 --- a/audbcards/core/datacard.py +++ b/audbcards/core/datacard.py @@ -395,7 +395,7 @@ def _render_template(self) -> str: template = environment.get_template("datacard.j2") # Convert dataset object to dictionary - dataset = self.dataset.properties() + dataset = self.dataset.cached_properties() # Add additional datacard only properties dataset = self._expand_dataset(dataset) diff --git a/audbcards/core/dataset.py b/audbcards/core/dataset.py index 71c22f4..bf09a8c 100644 --- a/audbcards/core/dataset.py +++ b/audbcards/core/dataset.py @@ -37,7 +37,7 @@ def create( return obj obj = cls(name, version, cache_root) - _ = obj.properties() + _ = obj.cached_properties() cls._save_pickled(obj, dataset_cache_filename) return obj @@ -77,7 +77,7 @@ def __init__( def __getstate__(self): r"""Returns attributes to be pickled.""" - return self.properties() + return self.cached_properties() @staticmethod def _dataset_cache_path(name: str, version: str, cache_root: str) -> str: @@ -254,16 +254,6 @@ def publication_owner(self) -> str: path = self.backend.join("/", self.name, "db.yaml") return self.backend.owner(path, self.version) - def properties(self): - """Get list of properties of the object.""" - class_items = self.__class__.__dict__.items() - props = dict( - (k, getattr(self, k)) - for k, v in class_items - if isinstance(v, functools.cached_property) - ) - return props - @functools.cached_property def repository(self) -> str: r"""Repository containing the dataset.""" @@ -385,6 +375,16 @@ def version(self) -> str: r"""Version of dataset.""" return self._version + def cached_properties(self): + """Get list of cached properties of the object.""" + class_items = self.__class__.__dict__.items() + props = dict( + (k, getattr(self, k)) + for k, v in class_items + if isinstance(v, functools.cached_property) + ) + return props + def _load_backend(self) -> audbackend.Backend: r"""Load backend containing dataset.""" backend = audbackend.access( diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 636220d..93e2539 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -29,7 +29,7 @@ def test_dataset_property_scope(tmpdir, db, request): cache_root=dataset_cache, ) - props = [x for x in dataset.properties().keys()] + props = [x for x in dataset.cached_properties().keys()] # should not exist in local scope for prop in props: @@ -305,8 +305,8 @@ def test_cache_file_existence(self, constructor): def test_props_equal(self, constructor): """Cached and uncached datasets have equal props.""" ds_uncached, ds_cached, _ = constructor - props_uncached = ds_uncached.properties() - props_cached = ds_cached.properties() + props_uncached = ds_uncached.cached_properties() + props_cached = ds_cached.cached_properties() list_props_uncached = list(props_uncached.keys()) list_props_cached = list(props_cached.keys()) assert list_props_uncached == list_props_cached From 0e3cdf18834a9bcb01c000ad7f6fd1b097d67633 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Fri, 26 Apr 2024 11:32:25 +0200 Subject: [PATCH 16/22] Rename cached_properties() to _cached_properties() --- audbcards/core/datacard.py | 2 +- audbcards/core/dataset.py | 6 +++--- tests/test_dataset.py | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/audbcards/core/datacard.py b/audbcards/core/datacard.py index 5393d46..b89f7c8 100644 --- a/audbcards/core/datacard.py +++ b/audbcards/core/datacard.py @@ -395,7 +395,7 @@ def _render_template(self) -> str: template = environment.get_template("datacard.j2") # Convert dataset object to dictionary - dataset = self.dataset.cached_properties() + dataset = self.dataset._cached_properties() # Add additional datacard only properties dataset = self._expand_dataset(dataset) diff --git a/audbcards/core/dataset.py b/audbcards/core/dataset.py index bf09a8c..4d4f785 100644 --- a/audbcards/core/dataset.py +++ b/audbcards/core/dataset.py @@ -37,7 +37,7 @@ def create( return obj obj = cls(name, version, cache_root) - _ = obj.cached_properties() + _ = obj._cached_properties() cls._save_pickled(obj, dataset_cache_filename) return obj @@ -77,7 +77,7 @@ def __init__( def __getstate__(self): r"""Returns attributes to be pickled.""" - return self.cached_properties() + return self._cached_properties() @staticmethod def _dataset_cache_path(name: str, version: str, cache_root: str) -> str: @@ -375,7 +375,7 @@ def version(self) -> str: r"""Version of dataset.""" return self._version - def cached_properties(self): + def _cached_properties(self): """Get list of cached properties of the object.""" class_items = self.__class__.__dict__.items() props = dict( diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 93e2539..fe0f0d0 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -29,7 +29,7 @@ def test_dataset_property_scope(tmpdir, db, request): cache_root=dataset_cache, ) - props = [x for x in dataset.cached_properties().keys()] + props = [x for x in dataset._cached_properties().keys()] # should not exist in local scope for prop in props: @@ -305,8 +305,8 @@ def test_cache_file_existence(self, constructor): def test_props_equal(self, constructor): """Cached and uncached datasets have equal props.""" ds_uncached, ds_cached, _ = constructor - props_uncached = ds_uncached.cached_properties() - props_cached = ds_cached.cached_properties() + props_uncached = ds_uncached._cached_properties() + props_cached = ds_cached._cached_properties() list_props_uncached = list(props_uncached.keys()) list_props_cached = list(props_cached.keys()) assert list_props_uncached == list_props_cached From fa66009c0380583618f0d0768eb23dd16378e68b Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Fri, 26 Apr 2024 11:55:38 +0200 Subject: [PATCH 17/22] Fix inclusion of properties in API docs --- audbcards/core/dataset.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/audbcards/core/dataset.py b/audbcards/core/dataset.py index 4d4f785..c7e1a4c 100644 --- a/audbcards/core/dataset.py +++ b/audbcards/core/dataset.py @@ -571,12 +571,12 @@ def __new__( # Copy attributes and methods # to include in documentation - for prop in [ + for _prop in [ # use private variable `_prop` to avoid inclusion in API doc name for name, value in inspect.getmembers(_Dataset) - if isinstance(value, functools.cached_property) and not name.startswith("_") + if not name.startswith("_") and name not in ["create"] ]: - vars()[prop] = getattr(_Dataset, prop) + vars()[_prop] = getattr(_Dataset, _prop) @staticmethod def _map_iso_languages(*args): From 9d5ba4294c06bb3bccf7d0171efef687d13bb5b9 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Fri, 26 Apr 2024 12:06:13 +0200 Subject: [PATCH 18/22] Revert changes handled in #87 --- audbcards/core/dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/audbcards/core/dataset.py b/audbcards/core/dataset.py index c7e1a4c..18a82ec 100644 --- a/audbcards/core/dataset.py +++ b/audbcards/core/dataset.py @@ -571,12 +571,12 @@ def __new__( # Copy attributes and methods # to include in documentation - for _prop in [ # use private variable `_prop` to avoid inclusion in API doc + for prop in [ name for name, value in inspect.getmembers(_Dataset) if not name.startswith("_") and name not in ["create"] ]: - vars()[_prop] = getattr(_Dataset, _prop) + vars()[prop] = getattr(_Dataset, prop) @staticmethod def _map_iso_languages(*args): From 368b9772e0faab20567ce6004056b5baba882d1c Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Fri, 26 Apr 2024 12:18:35 +0200 Subject: [PATCH 19/22] Include cache_root in API docs --- audbcards/core/dataset.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/audbcards/core/dataset.py b/audbcards/core/dataset.py index 18a82ec..e1e1c51 100644 --- a/audbcards/core/dataset.py +++ b/audbcards/core/dataset.py @@ -569,6 +569,18 @@ def __new__( instance = _Dataset.create(name, version, cache_root=cache_root) return instance + # Add an __init__() function, + # to allow documenting instance variables + def __init__( + self, + name: str, + version: str, + *, + cache_root: str = None, + ): + self.cache_root = audeer.mkdir(cache_root) + r"""Cache root folder.""" + # Copy attributes and methods # to include in documentation for prop in [ From 27d54fbca80b5272455a9a869a24ac88101533c9 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Fri, 26 Apr 2024 12:19:59 +0200 Subject: [PATCH 20/22] Improve docstring of repository_object --- audbcards/core/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/audbcards/core/dataset.py b/audbcards/core/dataset.py index e1e1c51..60d18a6 100644 --- a/audbcards/core/dataset.py +++ b/audbcards/core/dataset.py @@ -130,7 +130,7 @@ def header(self) -> audformat.Database: @property def repository_object(self) -> audb.Repository: - r"""Repository containing dataset.""" + r"""Repository object containing dataset.""" if not hasattr(self, "_repository_object"): # when loaded from cache self._repository_object = self._load_repository_object() return self._repository_object From 64988636c0899d15df0564ba2d792602903f4048 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Fri, 26 Apr 2024 12:23:22 +0200 Subject: [PATCH 21/22] Fix docstring of backend --- audbcards/core/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/audbcards/core/dataset.py b/audbcards/core/dataset.py index 60d18a6..a100f80 100644 --- a/audbcards/core/dataset.py +++ b/audbcards/core/dataset.py @@ -109,7 +109,7 @@ def _save_pickled(obj, path: str): @property def backend(self) -> audbackend.Backend: - r"""Dataset dependency table.""" + r"""Dataset backend object.""" if not hasattr(self, "_backend"): # when loaded from cache self._backend = self._load_backend() return self._backend From a19b2540fbecc3b3806511c0acdfecd9674f7aee Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 30 Apr 2024 08:47:17 +0200 Subject: [PATCH 22/22] Remove Artifactory related pragma --- audbcards/core/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/audbcards/core/dataset.py b/audbcards/core/dataset.py index a100f80..04d9b92 100644 --- a/audbcards/core/dataset.py +++ b/audbcards/core/dataset.py @@ -393,7 +393,7 @@ def _load_backend(self) -> audbackend.Backend: repository=self.repository, ) if isinstance(backend, audbackend.Artifactory): - backend._use_legacy_file_structure() # pragma: nocover + backend._use_legacy_file_structure() return backend def _load_dependencies(self) -> audb.Dependencies: