Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

improve html representation of datasets #1100

Open
wants to merge 24 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
649f141
improve dev repr
h-mayorquin Apr 19, 2024
475cda9
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 19, 2024
7f3c94e
address ruff
h-mayorquin Apr 19, 2024
5128d53
add changelog
h-mayorquin Apr 23, 2024
21ae3cf
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 23, 2024
4eb2635
add table representation for hdf5 info
h-mayorquin Apr 26, 2024
08292c6
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 26, 2024
59083c2
add test
h-mayorquin Apr 29, 2024
06a064e
Merge remote-tracking branch 'refs/remotes/origin/improve_html_repr_o…
h-mayorquin Apr 29, 2024
7ce5b3f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 29, 2024
fc14d71
ruff
h-mayorquin Apr 29, 2024
a2931e2
Merge remote-tracking branch 'refs/remotes/origin/improve_html_repr_o…
h-mayorquin Apr 29, 2024
96456a4
Merge branch 'dev' into improve_html_repr_of_data
h-mayorquin Apr 29, 2024
133e28d
handle division by zer
h-mayorquin Apr 30, 2024
ae21b61
add zarr, array, hdf5 repr tests
stephprince May 1, 2024
28449a3
generalize array html table description
stephprince May 1, 2024
6e6a84c
remove zarr tests
stephprince May 1, 2024
89fd978
fix nbytes
h-mayorquin May 2, 2024
a0e1736
fix use of nbytes ahead
h-mayorquin May 2, 2024
538ba98
added TODO
h-mayorquin May 2, 2024
e0ad0a1
Merge branch 'dev' into improve_html_repr_of_data
h-mayorquin May 2, 2024
9cbcf64
add html test array data type
stephprince May 2, 2024
5b235e0
Merge branch 'dev' into improve_html_repr_of_data
rly Oct 2, 2024
3813723
Merge branch 'dev' into improve_html_repr_of_data
rly Oct 2, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
- Unwrap `TermSetWrapper` within the builder to support different backends more efficiently. @mavaylon1 [#1070](https://github.com/hdmf-dev/hdmf/pull/1070)
- Added docs page that lists limitations of support for the HDMF specification language. @rly [#1069](https://github.com/hdmf-dev/hdmf/pull/1069)
- Added warning when using `add_row` or `add_column` to add a ragged array to `DynamicTable` without an index parameter. @stephprince [#1066](https://github.com/hdmf-dev/hdmf/pull/1066)
- Improve html representation of data in `Containers` @h-mayorquin [#1100](https://github.com/hdmf-dev/hdmf/pull/1100)

## HDMF 3.12.2 (February 9, 2024)

Expand Down
83 changes: 75 additions & 8 deletions src/hdmf/container.py
Original file line number Diff line number Diff line change
Expand Up @@ -711,7 +711,9 @@ def _generate_html_repr(self, fields, level=0, access_code="", is_field=False):
for index, item in enumerate(fields):
access_code += f'[{index}]'
html_repr += self._generate_field_html(index, item, level, access_code)
elif isinstance(fields, np.ndarray):
elif isinstance(fields, (np.ndarray, h5py.Dataset)):
html_repr += self._generate_array_html(fields, level)
elif hasattr(fields, "store") and hasattr(fields, "shape"): # Duck typing for zarr array
html_repr += self._generate_array_html(fields, level)
else:
pass
Expand All @@ -728,18 +730,22 @@ def _generate_field_html(self, key, value, level, access_code):
return f'<div style="margin-left: {level * 20}px;" class="container-fields"><span class="field-key"' \
f' title="{access_code}">{key}: </span><span class="field-value">{value}</span></div>'

if hasattr(value, "generate_html_repr"):
if isinstance(value, (np.ndarray, h5py.Dataset)):
html_content = self._generate_array_html(value, level + 1)
elif hasattr(value, "store") and hasattr(value, "shape"): # Duck typing for zarr array
html_content = self._generate_array_html(value, level + 1)
elif hasattr(value, "generate_html_repr"):
html_content = value.generate_html_repr(level + 1, access_code)

elif hasattr(value, '__repr_html__'):
stephprince marked this conversation as resolved.
Show resolved Hide resolved
html_content = value.__repr_html__()

elif hasattr(value, "fields"):
elif hasattr(value, "fields"): # Note that h5py.Dataset has a fields attribute so there is an implicit order
html_content = self._generate_html_repr(value.fields, level + 1, access_code, is_field=True)
elif isinstance(value, (list, dict, np.ndarray)):
html_content = self._generate_html_repr(value, level + 1, access_code, is_field=False)
else:
html_content = f'<span class="field-key">{value}</span>'


html_repr = (
f'<details><summary style="display: list-item; margin-left: {level * 20}px;" '
f'class="container-fields field-key" title="{access_code}"><b>{key}</b></summary>'
Expand All @@ -749,10 +755,71 @@ def _generate_field_html(self, key, value, level, access_code):

return html_repr


def _generate_array_html(self, array, level):
"""Generates HTML for a NumPy array."""
str_ = str(array).replace("\n", "</br>")
return f'<div style="margin-left: {level * 20}px;" class="container-fields">{str_}</div>'
"""Generates HTML for a NumPy array, h5py Dataset, or Zarr array."""

def convert_bytes_to_str(bytes_size):
    """Format a byte count as a string with two decimals and a binary-unit suffix.

    The value is divided by 1024 until it drops below 1024 or the largest
    available unit (PiB) is reached, e.g. ``32 -> "32.00 bytes"`` and
    ``1536 -> "1.50 KiB"``.
    """
    units = ('bytes', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB')
    last_unit = len(units) - 1
    size = bytes_size
    unit_index = 0
    # Stop at the last unit so very large values are still reported (in PiB).
    while unit_index < last_unit and size >= 1024:
        size = size / 1024.
        unit_index += 1
    return "{:.2f} {}".format(size, units[unit_index])

# Generates an HTML report for the backend info, inspired by zarr's info HTML representation
def html_table(item_dicts) -> str:
    """Render a mapping as a two-column HTML table (key in ``<th>``, value in ``<td>``).

    Keys and values are formatted with their default string form and then
    HTML-escaped, so text such as the NumPy dtype ``<U5`` is displayed literally
    instead of being parsed as a tag by the browser. Quotes are left untouched
    because the text is only ever placed in element content, never in attributes.

    :param item_dicts: mapping of row label to row value; iterated in insertion order.
    :return: the ``<table class="data-info">`` markup; an empty mapping yields an empty body.
    """
    from html import escape  # local import keeps this helper self-contained

    rows = "".join(
        f"<tr>"
        f'<th style="text-align: left">{escape(f"{k}", quote=False)}</th>'
        f'<td style="text-align: left">{escape(f"{v}", quote=False)}</td>'
        f"</tr>"
        for k, v in item_dicts.items()
    )
    return f'<table class="data-info"><tbody>{rows}</tbody></table>'

array_size_in_bytes = array.nbytes
array_size_repr = convert_bytes_to_str(array_size_in_bytes)
basic_array_info_dict = {"shape": array.shape, "dtype": array.dtype, "Array size": array_size_repr}

if isinstance(array, np.ndarray):
head = "NumPy Array"
backend_info_dict = basic_array_info_dict

if isinstance(array, h5py.Dataset):
hdf5_dataset = array
chunks = hdf5_dataset.chunks
compression = hdf5_dataset.compression
uncompressed_size = hdf5_dataset.nbytes
compression_opts = hdf5_dataset.compression_opts
compressed_size = hdf5_dataset.id.get_storage_size()
compression_ratio = uncompressed_size / compressed_size if compressed_size != 0 else "undefined"

head = "HDF5 Dataset"
hdf5_info_dict = {"chunks": chunks, "compression": compression, "compression_opts": compression_opts,
"compression_ratio": compression_ratio}
backend_info_dict = {**basic_array_info_dict, **hdf5_info_dict}

if hasattr(array, "store") and hasattr(array, "shape"): # Duck typing for zarr array
head = "Zarr Array"
zarr_info_dict = {k:v for k, v in array.info_items()}
backend_info_dict = zarr_info_dict
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be nice to avoid having logic that is specific for a particular io backend in the Container. The reason is that this inhibits implementing backends in a self-contained manner and stand-alone packages and requires updating many places throughout HDMF.

The checks for h5py.Dataset and Zarr.array are really only relevant when a file was read from file. To help disentangle the dependencies, I'm wondering whether we could do the following:

  1. Add a static method generate_dataset_html to HDMFIO that would then need to be implemented by HDF5IO and ZarrIO
  2. In the Container you could then do something like:
```python
read_io = self.get_read_io()  # if the Container was read from file, this will give you the IO object that read it
if read_io is not None:
    html_repr = read_io.generate_dataset_html(my_dataset)
else:
    # The file was not read from disk so the dataset should be numpy array or a list
```

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be nice to avoid having logic that is specific for a particular io backend in the Container. The reason is that this inhibits implementing backends in a self-contained manner and stand-alone packages and requires updating many places throughout HDMF.

I see, yes, it would be nice if we could decouple this. On the other hand, right now, if they do implement their own backend they will just lose the representation for datasets which is not critical.

The checks for h5py.Dataset and Zarr.array are really only relevant when a file was read from file.

Is it? Right now in the test we are passing an hdf5 dataset as data without writing the nwbfile for testing the display. Is this not intended?

This proposal seems very good:

read_io = self.get_read_io()  # if the Container was read from file, this will give you the IO object that read it
if read_io is not None:
    html_repr = read_io.generate_dataset_html(my_dataset)
else:
    # The file was not read from disk so the dataset should be numpy array or a list

I see two downsides:

  1. Missing extensive representation for in-memory files (it is nice to know what you will write!).
  2. Fragmenting the code base for html representations.

Is there any other backend in the works right now? If not, maybe we can do this simpler way and add the complexity once we are closer to need it?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it would be worth giving this a try here. If it works for HDF5, we can then easily move the logic for Zarr to hdmf-zarr. I don't think it should be too hard to make this work right now, but these things tend to get hard to change later on.

Copy link
Author

@h-mayorquin h-mayorquin May 2, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about handling in-memory objects as well?

Copy link
Contributor

@oruebel oruebel May 2, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In-memory-only objects (i.e., numpy arrays and lists) can be handled here in the Container class.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can then follow your approach in another PR to add backend related information extracted through the DataIO objects.

Could you point me to the PR you are referring to. I'm not sure what role DataIO plays for this PR.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean that we can implement the following strategy in another PR to add backend specific information:

read_io = self.get_read_io()  # if the Container was read from file, this will give you the IO object that read it
if read_io is not None:
    html_repr = read_io.generate_dataset_html(my_dataset)
else:
    # The file was not read from disk so the dataset should be numpy array or a list

Copy link
Contributor

@oruebel oruebel May 3, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes that looks good 👍 . Just to avoid confusion, read_io is an instance of HDMFIO (i.e., HDF5IO or ZarrIO) and not DataIO. To implement the logic we would then need to:

  1. Add HDMFIO.generate_dataset_html(dataset) which would implement just a minimalist representation
  2. Implement HDF5IO.generate_dataset_html(h5py.Dataset) to represent h5py.Dataset
  3. In a separate PR on hdmf_zarr implement ZarrIO.generate_dataset_html(Zarr.array)

To simplify this implementation and generate consistent representations, we could make a utility function that would take information about a dataset (e.g., name, shape, data type, etc.) as input and generate the html representation such that the individual generate_dataset_html on the I/O backends would just collect the information from the dataset and use the utility function to generate the actual html.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes that looks good 👍 . Just to avoid confusion, read_io is an instance of HDMFIO (i.e., HDF5IO or ZarrIO) and not DataIO. To implement the logic we would then need to.

Yes, I realized after that I was confusing these two objects.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good. Thanks for your hard work on this PR and the fruitful discussion.


# Add <br> tags and concatenate the components
head_html = head
backend_info_html = html_table(backend_info_dict)
repr_html = head_html + "<br>" + backend_info_html

# Display data for small datasets
array_is_small = array_size_in_bytes < 1024 * 0.1 # 10 % a kilobyte to display the array
if array_is_small or isinstance(array, np.ndarray):
repr_html += "<br>" + str(np.asarray(array))

return f'<div style="margin-left: {level * 20}px;" class="container-fields">{repr_html}</div>'

@staticmethod
def __smart_str(v, num_indent):
Expand Down
51 changes: 51 additions & 0 deletions tests/unit/test_container.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import numpy as np
from uuid import uuid4, UUID
import os
import h5py

from hdmf.container import AbstractContainer, Container, Data, HERDManager
from hdmf.common.resources import HERD
Expand Down Expand Up @@ -423,6 +424,23 @@ def __init__(self, **kwargs):
self.data = kwargs['data']
self.str = kwargs['str']

class ContainerWithData(Container):
    """Test fixture: a Container that exposes only a ``data`` field and a ``str`` field."""

    __fields__ = ("data", "str")

    @docval(
        {"name": "data", "doc": "data", "type": "array_data", "default": None},
        {"name": "str", "doc": "str", "type": str, "default": None},
    )
    def __init__(self, **kwargs):
        # The container name is fixed; only the two declared fields come from the caller.
        super().__init__("test name")
        self.data, self.str = kwargs["data"], kwargs["str"]

def test_repr_html_(self):
child_obj1 = Container('test child 1')
obj1 = self.ContainerWithChildAndData(child=child_obj1, data=[1, 2, 3], str="hello")
Expand Down Expand Up @@ -455,6 +473,39 @@ def test_repr_html_(self):
'class="field-value">hello</span></div></div>'
)

def test_repr_html_array(self):
    """Check the info table and inline values rendered in the HTML repr of a NumPy array field."""
    # Pin the dtype: NumPy's default integer is platform dependent (int32 on Windows with
    # NumPy < 2), which would change the 'dtype' and 'Array size' cells asserted below.
    obj = self.ContainerWithData(data=np.array([1, 2, 3, 4], dtype=np.int64), str="hello")
    expected_html_table = (
        'class="container-fields">NumPy Array<br><table class="data-info"><tbody><tr><th style="text-align: '
        'left">shape</th><td style="text-align: left">(4,)</td></tr><tr><th style="text-align: left">dtype</'
        'th><td style="text-align: left">int64</td></tr><tr><th style="text-align: left">Array size</th><td '
        'style="text-align: left">32.00 bytes</td></tr></tbody></table><br>[1 2 3 4]</div></details><div '
        'style="margin-left: 0px;" class="container-fields"><span class="field-key" title=".str">str: </'
        'span><span class="field-value">hello</span></div></div>'
    )
    self.assertIn(expected_html_table, obj._repr_html_())

def test_repr_html_hdf5_dataset(self):
    """Check the info table and inline values rendered in the HTML repr of an h5py.Dataset field."""
    import tempfile  # local import; only this test needs a scratch directory

    expected_html_table = (
        'class="container-fields">HDF5 Dataset<br><table class="data-info"><tbody><tr><th style="text-align: '
        'left">shape</th><td style="text-align: left">(4,)</td></tr><tr><th style="text-align: left">dtype</'
        'th><td style="text-align: left">int64</td></tr><tr><th style="text-align: left">Array size</th><td '
        'style="text-align: left">32.00 bytes</td></tr><tr><th style="text-align: left">chunks</th><td '
        'style="text-align: left">None</td></tr><tr><th style="text-align: left">compression</th><td '
        'style="text-align: left">None</td></tr><tr><th style="text-align: left">compression_opts</th><td '
        'style="text-align: left">None</td></tr><tr><th style="text-align: left">compression_ratio</th><td '
        'style="text-align: left">1.0</td></tr></tbody></table><br>[1 2 3 4]</div></details><div '
        'style="margin-left: 0px;" class="container-fields"><span class="field-key" title=".str">str: </'
        'span><span class="field-value">hello</span></div></div>'
    )

    # Write the scratch file into a temporary directory so it is cleaned up even when the
    # assertion fails and never clobbers an existing 'data.h5' in the working directory.
    with tempfile.TemporaryDirectory() as tmp_dir:
        # Open an HDF5 file in write mode
        with h5py.File(os.path.join(tmp_dir, 'data.h5'), 'w') as file:
            # Explicit int64 keeps the expected 'dtype' and 'Array size' cells platform independent.
            dataset = file.create_dataset(name='my_dataset', data=np.array([1, 2, 3, 4], dtype=np.int64))
            obj = self.ContainerWithData(data=dataset, str="hello")
            # The dataset can only be read while its file is open, so render the repr in here.
            self.assertIn(expected_html_table, obj._repr_html_())

class TestData(TestCase):

Expand Down
Loading