diff --git a/audbcards/core/dataset.py b/audbcards/core/dataset.py index 3d497b1..cd9a6ff 100644 --- a/audbcards/core/dataset.py +++ b/audbcards/core/dataset.py @@ -13,9 +13,8 @@ import audeer import audformat +from audbcards.core import utils from audbcards.core.config import config -from audbcards.core.utils import format_schemes -from audbcards.core.utils import limit_presented_samples class _Dataset: @@ -350,7 +349,7 @@ def schemes_summary(self) -> str: e.g. ``'speaker: [age, gender, language]'``. """ - return format_schemes(self.header.schemes) + return utils.format_schemes(self.header.schemes) @functools.cached_property def schemes_table(self) -> typing.List[typing.List[str]]: @@ -405,7 +404,9 @@ def tables_preview(self) -> typing.Dict[str, typing.List[typing.List[str]]]: Shows the header and the first 5 lines for each table as a list of lists. - All table values are converted to strings. + All table values are converted to strings, + stripped from HTML tags or newlines, + and limited to a maximum length of 100 characters. 
def parse_text(text: str, *, max_characters: int = 100) -> str:
    r"""Remove unsupported characters and restrict length.

    Line breaks are replaced by their visible escape sequences
    (``\n`` and ``\r``),
    HTML tags are dropped,
    and entries longer than ``max_characters``
    are shortened and marked with a trailing ``...``.

    Args:
        text: input text.
            Missing values as detected by :func:`pandas.isna`
            are returned unchanged
        max_characters: maximum length of the returned text,
            including the trailing ``...``

    Returns:
        parsed text

    Raises:
        ValueError: if ``max_characters`` is negative

    """
    if max_characters < 0:
        raise ValueError(
            f"'max_characters' must not be negative, got {max_characters}."
        )
    # Missing text, nothing to clean
    if pd.isna(text):
        return text
    # Escape line breaks, so an entry always stays on a single line.
    # Carriage returns are handled as well,
    # otherwise Windows line endings would leave a raw "\r" behind
    text = text.replace("\r", "\\r").replace("\n", "\\n")
    # Remove HTML tags
    text = re.sub(r"<[^<]+?>", "", text)
    # Limit length
    if len(text) > max_characters:
        ellipsis = "..."
        if max_characters > len(ellipsis):
            # Reserve room for the ellipsis,
            # so the result is exactly max_characters long
            text = text[: max_characters - len(ellipsis)] + ellipsis
        else:
            # Not enough room for an ellipsis, plain cut
            text = text[:max_characters]
    return text