Skip to content

Commit

Permalink
Ensure text in table preview is valid
Browse files Browse the repository at this point in the history
  • Loading branch information
hagenw committed Jul 25, 2024
1 parent af7043d commit 67f3274
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 9 deletions.
19 changes: 11 additions & 8 deletions audbcards/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,8 @@
import audeer
import audformat

from audbcards.core import utils
from audbcards.core.config import config
from audbcards.core.utils import format_schemes
from audbcards.core.utils import limit_presented_samples


class _Dataset:
Expand Down Expand Up @@ -350,7 +349,7 @@ def schemes_summary(self) -> str:
e.g. ``'speaker: [age, gender, language]'``.
"""
return format_schemes(self.header.schemes)
return utils.format_schemes(self.header.schemes)

@functools.cached_property
def schemes_table(self) -> typing.List[typing.List[str]]:
Expand Down Expand Up @@ -405,7 +404,9 @@ def tables_preview(self) -> typing.Dict[str, typing.List[typing.List[str]]]:
Shows the header
and the first 5 lines for each table
as a list of lists.
All table values are converted to strings.
All table values are converted to strings,
with HTML tags removed and newlines escaped,
and limited to a maximum length of 100 characters.
Returns:
dictionary with table IDs as keys
Expand Down Expand Up @@ -434,9 +435,11 @@ def tables_preview(self) -> typing.Dict[str, typing.List[typing.List[str]]]:
verbose=False,
)
df = df.reset_index()
preview[table] = [df.columns.tolist()] + df.head(5).astype(
"string"
).values.tolist()
header = [df.columns.tolist()]
body = df.head(5).astype("string").values.tolist()
# Remove unwanted chars and limit length of each entry
body = [[utils.parse_text(column) for column in row] for row in body]
preview[table] = header + body
return preview

@functools.cached_property
Expand Down Expand Up @@ -562,7 +565,7 @@ def _scheme_to_list(self, scheme_id):
label[:-1] + r"\_" if label.endswith("_") else label
for label in labels
]
labels = limit_presented_samples(
labels = utils.limit_presented_samples(
labels,
15,
replacement_text="[...]",
Expand Down
26 changes: 26 additions & 0 deletions audbcards/core/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import re
import typing

import matplotlib.pyplot as plt
import pandas as pd

import audeer
import audformat
Expand Down Expand Up @@ -99,6 +101,30 @@ def limit_presented_samples(
return samples


def parse_text(text: str, *, max_characters_per_entry: int = 100) -> str:
    r"""Remove unsupported characters and restrict length.

    Line breaks are replaced by their escaped,
    printable form (``\n`` and ``\r``),
    HTML tags are removed,
    and text longer than ``max_characters_per_entry``
    is cut and terminated with ``"..."``.

    Args:
        text: input text.
            Missing values,
            as detected by :func:`pandas.isna`,
            are returned unchanged
        max_characters_per_entry: maximum number of characters
            of the returned text,
            including the trailing ``"..."``.
            Expected to be non-negative

    Returns:
        parsed text

    """
    # Missing text, e.g. None or pd.NA, has nothing to clean
    if pd.isna(text):
        return text
    # Escape line breaks instead of dropping them,
    # so the entry stays on a single line
    # while the original structure remains visible.
    # Carriage returns are handled as well
    # to cover "\r\n" and "\r" line endings
    text = text.replace("\r", "\\r").replace("\n", "\\n")
    # Remove HTML tags
    text = re.sub(r"<[^<]+?>", "", text)
    # Limit length
    if len(text) > max_characters_per_entry:
        ellipsis = "..."
        if max_characters_per_entry >= len(ellipsis):
            # Reserve space for the ellipsis
            # so the result never exceeds the limit
            text = text[: max_characters_per_entry - len(ellipsis)] + ellipsis
        else:
            # Limit too small to hold the ellipsis
            text = text[:max_characters_per_entry]
    return text


def set_plot_margins(
*,
left=0,
Expand Down
14 changes: 13 additions & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,19 @@ def test_format_schemes(scheme_names, scheme_dtypes, labels, expected):
],
)
def test_limit_presented_samples(sample, limit, replacement_text, expected):
limited_sample = audbcards.core.dataset.limit_presented_samples(
limited_sample = audbcards.core.utils.limit_presented_samples(
sample, limit, replacement_text
)
assert limited_sample == expected


@pytest.mark.parametrize(
    "text, expected",
    [
        # Empty text is returned as is
        ("", ""),
        # Newlines are escaped
        ("abc\ndef", "abc\\ndef"),
        # Text matching the limit of 100 characters is not shortened
        ("a" * 100, "a" * 100),
        # Longer text is cut and ends with an ellipsis
        ("a" * 101, "a" * 97 + "..."),
        # HTML tags are removed
        ('<a href="http://www.google.de">text link</a>', "text link"),
        # Missing values are passed through
        (None, None),
    ],
)
def test_parse_text(text, expected):
    """Test parsing of text for table previews.

    Args:
        text: input text
        expected: expected parsed text

    """
    assert audbcards.core.utils.parse_text(text) == expected

0 comments on commit 67f3274

Please sign in to comment.