hdmf-dev · h-mayorquin · Apr 19, 2024 · Apr 19, 2024 · Apr 19, 2024 · Apr 23, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,6 +13,7 @@
 - Unwrap `TermSetWrapper` within the builder to support different backends more efficiently. @mavaylon1 [#1070](https://github.com/hdmf-dev/hdmf/pull/1070)
 - Added docs page that lists limitations of support for the HDMF specification language. @rly [#1069](https://github.com/hdmf-dev/hdmf/pull/1069)
 - Added warning when using `add_row` or `add_column` to add a ragged array to `DynamicTable` without an index parameter. @stephprince [#1066](https://github.com/hdmf-dev/hdmf/pull/1066)
+- Improve HTML representation of data in `Containers`. @h-mayorquin [#1100](https://github.com/hdmf-dev/hdmf/pull/1100)
 
 ## HDMF 3.12.2 (February 9, 2024)
 

diff --git a/src/hdmf/container.py b/src/hdmf/container.py
@@ -711,7 +711,9 @@ def _generate_html_repr(self, fields, level=0, access_code="", is_field=False):
             for index, item in enumerate(fields):
                 access_code += f'[{index}]'
                 html_repr += self._generate_field_html(index, item, level, access_code)
-        elif isinstance(fields, np.ndarray):
+        elif isinstance(fields, (np.ndarray, h5py.Dataset)):
+            html_repr += self._generate_array_html(fields, level)
+        elif hasattr(fields, "store") and hasattr(fields, "shape"):  # Duck typing for zarr array
             html_repr += self._generate_array_html(fields, level)
         else:
             pass
@@ -728,18 +730,22 @@ def _generate_field_html(self, key, value, level, access_code):
             return f'<div style="margin-left: {level * 20}px;" class="container-fields"><span class="field-key"' \
                    f' title="{access_code}">{key}: </span><span class="field-value">{value}</span></div>'
 
-        if hasattr(value, "generate_html_repr"):
+        if isinstance(value, (np.ndarray, h5py.Dataset)):
+            html_content = self._generate_array_html(value, level + 1)
+        elif hasattr(value, "store") and hasattr(value, "shape"):  # Duck typing for zarr array
+            html_content = self._generate_array_html(value, level + 1)
+        elif hasattr(value, "generate_html_repr"):
             html_content = value.generate_html_repr(level + 1, access_code)
-
         elif hasattr(value, '__repr_html__'):
             html_content = value.__repr_html__()
-
-        elif hasattr(value, "fields"):
+        elif hasattr(value, "fields"):  # h5py.Dataset also has `fields`, so this must come after the array checks
             html_content = self._generate_html_repr(value.fields, level + 1, access_code, is_field=True)
         elif isinstance(value, (list, dict, np.ndarray)):
             html_content = self._generate_html_repr(value, level + 1, access_code, is_field=False)
         else:
             html_content = f'<span class="field-key">{value}</span>'
+
+
         html_repr = (
             f'<details><summary style="display: list-item; margin-left: {level * 20}px;" '
             f'class="container-fields field-key" title="{access_code}"><b>{key}</b></summary>'
@@ -749,10 +755,71 @@ def _generate_field_html(self, key, value, level, access_code):
 
         return html_repr
 
+
     def _generate_array_html(self, array, level):
-        """Generates HTML for a NumPy array."""
-        str_ = str(array).replace("\n", "</br>")
-        return f'<div style="margin-left: {level * 20}px;" class="container-fields">{str_}</div>'
+        """Generates HTML for a NumPy array, h5py Dataset, or Zarr array."""
+
+        def convert_bytes_to_str(bytes_size):
+            suffixes = ['bytes', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB']
+            i = 0
+            while bytes_size >= 1024 and i < len(suffixes)-1:
+                bytes_size /= 1024.
+                i += 1
+            return f"{bytes_size:.2f} {suffixes[i]}"
+
+        # Generates an HTML table for the backend info, inspired by zarr's info HTML representation
+        def html_table(item_dicts) -> str:
+            report = '<table class="data-info">'
+            report += "<tbody>"
+            for k, v in item_dicts.items():
+                report += (
+                    f"<tr>"
+                    f'<th style="text-align: left">{k}</th>'
+                    f'<td style="text-align: left">{v}</td>'
+                    f"</tr>"
+                )
+            report += "</tbody>"
+            report += "</table>"
+            return report
+
+        array_size_in_bytes = array.nbytes
+        array_size_repr = convert_bytes_to_str(array_size_in_bytes)
+        basic_array_info_dict = {"shape": array.shape, "dtype": array.dtype,  "Array size": array_size_repr}
+
+        if isinstance(array, np.ndarray):
+            head = "NumPy Array"
+            backend_info_dict = basic_array_info_dict
+
+        if isinstance(array, h5py.Dataset):
+            hdf5_dataset = array
+            chunks = hdf5_dataset.chunks
+            compression = hdf5_dataset.compression
+            uncompressed_size = hdf5_dataset.nbytes
+            compression_opts = hdf5_dataset.compression_opts
+            compressed_size = hdf5_dataset.id.get_storage_size()
+            compression_ratio = uncompressed_size / compressed_size if compressed_size != 0 else "undefined"
+
+            head = "HDF5 Dataset"
+            hdf5_info_dict = {"chunks": chunks, "compression": compression, "compression_opts": compression_opts,
+                              "compression_ratio": compression_ratio}
+            backend_info_dict = {**basic_array_info_dict, **hdf5_info_dict}
+
+        if hasattr(array, "store") and hasattr(array, "shape"):  # Duck typing for zarr array
+            head = "Zarr Array"
+            zarr_info_dict = {k:v for k, v in array.info_items()}
+            backend_info_dict = zarr_info_dict
+
+        # Add <br> tags and concatenate the components
+        head_html = head
+        backend_info_html = html_table(backend_info_dict)
+        repr_html = head_html + "<br>" + backend_info_html
+
+        # Display data for small datasets
+        array_is_small = array_size_in_bytes < 1024 * 0.1  # small means under 10% of a KiB (102.4 bytes)
+        if array_is_small or isinstance(array, np.ndarray):
+            repr_html += "<br>" + str(np.asarray(array))
+
+        return f'<div style="margin-left: {level * 20}px;" class="container-fields">{repr_html}</div>'
 
     @staticmethod
     def __smart_str(v, num_indent):

diff --git a/tests/unit/test_container.py b/tests/unit/test_container.py
@@ -1,6 +1,7 @@
 import numpy as np
 from uuid import uuid4, UUID
 import os
+import h5py
 
 from hdmf.container import AbstractContainer, Container, Data, HERDManager
 from hdmf.common.resources import HERD
@@ -423,6 +424,23 @@ def __init__(self, **kwargs):
             self.data = kwargs['data']
             self.str = kwargs['str']
 
+    class ContainerWithData(Container):
+
+        __fields__ = (
+            "data",
+            "str"
+        )
+
+        @docval(
+            {'name': "data", "doc": 'data', 'type': 'array_data', "default": None},
+            {'name': "str", "doc": 'str', 'type': str, "default": None},
+
+        )
+        def __init__(self, **kwargs):
+            super().__init__('test name')
+            self.data = kwargs['data']
+            self.str = kwargs['str']
+
     def test_repr_html_(self):
         child_obj1 = Container('test child 1')
         obj1 = self.ContainerWithChildAndData(child=child_obj1, data=[1, 2, 3], str="hello")
@@ -455,6 +473,39 @@ def test_repr_html_(self):
             'class="field-value">hello</span></div></div>'
         )
 
+    def test_repr_html_array(self):
+        obj = self.ContainerWithData(data=np.array([1, 2, 3, 4]), str="hello")
+        expected_html_table = (
+            'class="container-fields">NumPy Array<br><table class="data-info"><tbody><tr><th style="text-align: '
+            'left">shape</th><td style="text-align: left">(4,)</td></tr><tr><th style="text-align: left">dtype</'
+            'th><td style="text-align: left">int64</td></tr><tr><th style="text-align: left">Array size</th><td '
+            'style="text-align: left">32.00 bytes</td></tr></tbody></table><br>[1 2 3 4]</div></details><div '
+            'style="margin-left: 0px;" class="container-fields"><span class="field-key" title=".str">str: </'
+            'span><span class="field-value">hello</span></div></div>'
+        )
+        self.assertIn(expected_html_table, obj._repr_html_())
+
+    def test_repr_html_hdf5_dataset(self):
+
+        # Open an HDF5 file in write mode
+        with h5py.File('data.h5', 'w') as file:
+            dataset = file.create_dataset(name='my_dataset', data=[1, 2, 3, 4])
+            obj = self.ContainerWithData(data=dataset, str="hello")
+            expected_html_table = (
+                'class="container-fields">HDF5 Dataset<br><table class="data-info"><tbody><tr><th style="text-align: '
+                'left">shape</th><td style="text-align: left">(4,)</td></tr><tr><th style="text-align: left">dtype</'
+                'th><td style="text-align: left">int64</td></tr><tr><th style="text-align: left">Array size</th><td '
+                'style="text-align: left">32.00 bytes</td></tr><tr><th style="text-align: left">chunks</th><td '
+                'style="text-align: left">None</td></tr><tr><th style="text-align: left">compression</th><td '
+                'style="text-align: left">None</td></tr><tr><th style="text-align: left">compression_opts</th><td '
+                'style="text-align: left">None</td></tr><tr><th style="text-align: left">compression_ratio</th><td '
+                'style="text-align: left">1.0</td></tr></tbody></table><br>[1 2 3 4]</div></details><div '
+                'style="margin-left: 0px;" class="container-fields"><span class="field-key" title=".str">str: </'
+                'span><span class="field-value">hello</span></div></div>'
+            )
+
+            self.assertIn(expected_html_table, obj._repr_html_())
+        os.remove('data.h5')
 
 class TestData(TestCase):