From 201b8c4f06a8c520cb16692d6789695a05a18d74 Mon Sep 17 00:00:00 2001
From: Matthew Avaylon
Date: Tue, 7 May 2024 04:44:20 -0700
Subject: [PATCH] VectorData Expand by Default via write_dataset (#1093)

---
 CHANGELOG.md                       |  1 +
 src/hdmf/backends/hdf5/h5tools.py  | 34 +++++++++++++++++++++++------
 tests/unit/test_io_hdf5.py         |  2 +-
 tests/unit/test_io_hdf5_h5tools.py | 35 +++++++++++++++++++++++++-----
 4 files changed, 59 insertions(+), 13 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5d5a2cc62..d44a1db06 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@
 - Added `TypeConfigurator` to automatically wrap fields with `TermSetWrapper` according to a configuration file. @mavaylon1 [#1016](https://github.com/hdmf-dev/hdmf/pull/1016)
 - Updated `TermSetWrapper` to support validating a single field within a compound array. @mavaylon1 [#1061](https://github.com/hdmf-dev/hdmf/pull/1061)
 - Updated testing to not install in editable mode and not run `coverage` by default. @rly [#1107](https://github.com/hdmf-dev/hdmf/pull/1107)
+- Updated the default behavior for writing HDF5 datasets to create expandable datasets with chunking enabled by default. This does not override user-set chunking parameters. @mavaylon1 [#1093](https://github.com/hdmf-dev/hdmf/pull/1093)
 
 ## HDMF 3.13.0 (March 20, 2024)
 
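For context on the changelog entry: "expandable with chunking enabled" corresponds to h5py's maxshape and chunks dataset options. The following snippet is an illustration only (plain h5py, no HDMF, and not part of the patch):

    import h5py
    import numpy as np

    # Illustration only: an "expandable" dataset in plain h5py.
    # maxshape=(None,) allows the single dimension to grow later; h5py requires
    # chunked storage for resizable datasets, and chunks=True enables auto-chunking.
    with h5py.File("example.h5", "w") as f:
        dset = f.create_dataset("data", data=np.arange(5), maxshape=(None,), chunks=True)
        dset.resize((10,))           # grow in place after the initial write
        dset[5:] = np.arange(5, 10)
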
diff --git a/src/hdmf/backends/hdf5/h5tools.py b/src/hdmf/backends/hdf5/h5tools.py
index 05ce36e13..4444ec486 100644
--- a/src/hdmf/backends/hdf5/h5tools.py
+++ b/src/hdmf/backends/hdf5/h5tools.py
@@ -364,7 +364,10 @@ def copy_file(self, **kwargs):
              'default': True},
             {'name': 'herd', 'type': 'hdmf.common.resources.HERD',
              'doc': 'A HERD object to populate with references.',
-             'default': None})
+             'default': None},
+            {'name': 'expandable', 'type': bool, 'default': True,
+             'doc': ('Bool to set whether datasets are expandable by setting the max shape for all dimensions '
+                     'of a dataset to None and enabling auto-chunking by default.')})
     def write(self, **kwargs):
         """Write the container to an HDF5 file."""
         if self.__mode == 'r':
@@ -804,10 +807,16 @@ def close_linked_files(self):
              'doc': 'exhaust DataChunkIterators one at a time. If False, exhaust them concurrently',
              'default': True},
             {'name': 'export_source', 'type': str,
-             'doc': 'The source of the builders when exporting', 'default': None})
+             'doc': 'The source of the builders when exporting', 'default': None},
+            {'name': 'expandable', 'type': bool, 'default': True,
+             'doc': ('Bool to set whether datasets are expandable by setting the max shape for all dimensions '
+                     'of a dataset to None and enabling auto-chunking by default.')})
     def write_builder(self, **kwargs):
         f_builder = popargs('builder', kwargs)
-        link_data, exhaust_dci, export_source = getargs('link_data', 'exhaust_dci', 'export_source', kwargs)
+        link_data, exhaust_dci, export_source = getargs('link_data',
+                                                        'exhaust_dci',
+                                                        'export_source',
+                                                        kwargs)
         self.logger.debug("Writing GroupBuilder '%s' to path '%s' with kwargs=%s"
                           % (f_builder.name, self.source, kwargs))
         for name, gbldr in f_builder.groups.items():
@@ -978,6 +987,9 @@ def _filler():
              'default': True},
             {'name': 'export_source', 'type': str,
              'doc': 'The source of the builders when exporting', 'default': None},
+            {'name': 'expandable', 'type': bool, 'default': True,
+             'doc': ('Bool to set whether datasets are expandable by setting the max shape for all dimensions '
+                     'of a dataset to None and enabling auto-chunking by default.')},
             returns='the Group that was created', rtype=Group)
     def write_group(self, **kwargs):
         parent, builder = popargs('parent', 'builder', kwargs)
@@ -1078,6 +1090,9 @@ def write_link(self, **kwargs):
              'default': True},
             {'name': 'export_source', 'type': str,
              'doc': 'The source of the builders when exporting', 'default': None},
+            {'name': 'expandable', 'type': bool, 'default': True,
+             'doc': ('Bool to set whether datasets are expandable by setting the max shape for all dimensions '
+                     'of a dataset to None and enabling auto-chunking by default.')},
             returns='the Dataset that was created', rtype=Dataset)
     def write_dataset(self, **kwargs):  # noqa: C901
         """ Write a dataset to HDF5
@@ -1085,7 +1100,7 @@ def write_dataset(self, **kwargs):  # noqa: C901
         The function uses other dataset-dependent write functions, e.g,
         ``__scalar_fill__``, ``__list_fill__``, and ``__setup_chunked_dset__`` to write the data.
         """
-        parent, builder = popargs('parent', 'builder', kwargs)
+        parent, builder, expandable = popargs('parent', 'builder', 'expandable', kwargs)
         link_data, exhaust_dci, export_source = getargs('link_data', 'exhaust_dci', 'export_source', kwargs)
         self.logger.debug("Writing DatasetBuilder '%s' to parent group '%s'" % (builder.name, parent.name))
         if self.get_written(builder):
@@ -1102,6 +1117,7 @@
             data = data.data
         else:
             options['io_settings'] = {}
+
         attributes = builder.attributes
         options['dtype'] = builder.dtype
         dset = None
@@ -1206,7 +1222,7 @@ def _filler():
                     return
             # If the compound data type contains only regular data (i.e., no references) then we can write it as usual
             else:
-                dset = self.__list_fill__(parent, name, data, options)
+                dset = self.__list_fill__(parent, name, data, expandable, options)
         # Write a dataset containing references, i.e., a region or object reference.
         # NOTE: we can ignore options['io_settings'] for scalar data
         elif self.__is_ref(options['dtype']):
@@ -1301,7 +1317,7 @@ def _filler():
             self.__dci_queue.append(dataset=dset, data=data)
         # Write a regular in memory array (e.g., numpy array, list etc.)
         elif hasattr(data, '__len__'):
-            dset = self.__list_fill__(parent, name, data, options)
+            dset = self.__list_fill__(parent, name, data, expandable, options)
         # Write a regular scalar dataset
         else:
             dset = self.__scalar_fill__(parent, name, data, options)
@@ -1429,7 +1445,7 @@ def __chunked_iter_fill__(cls, parent, name, data, options=None):
         return dset
 
     @classmethod
-    def __list_fill__(cls, parent, name, data, options=None):
+    def __list_fill__(cls, parent, name, data, expandable, options=None):
         # define the io settings and data type if necessary
         io_settings = {}
         dtype = None
@@ -1451,6 +1467,10 @@ def __list_fill__(cls, parent, name, data, options=None):
             data_shape = (len(data),)
         else:
             data_shape = get_data_shape(data)
+        if expandable:
+            # Don't override existing settings
+            if 'maxshape' not in io_settings:
+                io_settings['maxshape'] = tuple([None]*len(data_shape))
 
         # Create the dataset
         try:
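The core of the change is the maxshape defaulting added to __list_fill__ above. A simplified, standalone sketch of that decision follows; the helper name default_io_settings is illustrative and not part of HDMF:

    from hdmf.utils import get_data_shape

    def default_io_settings(data, expandable=True, io_settings=None):
        """Sketch of the defaulting logic: add maxshape only when the caller has not set one."""
        io_settings = dict(io_settings or {})
        data_shape = get_data_shape(data)
        if expandable and 'maxshape' not in io_settings:
            # None for every dimension lets the dataset grow in any direction later;
            # a user-supplied maxshape (e.g. via H5DataIO) is left untouched.
            io_settings['maxshape'] = tuple([None] * len(data_shape))
        return io_settings

    print(default_io_settings([[1, 2, 3], [4, 5, 6]]))       # {'maxshape': (None, None)}
    print(default_io_settings([1, 2, 3], expandable=False))  # {}
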
diff --git a/tests/unit/test_io_hdf5.py b/tests/unit/test_io_hdf5.py
index 0dae1fbbe..b4bc72d8d 100644
--- a/tests/unit/test_io_hdf5.py
+++ b/tests/unit/test_io_hdf5.py
@@ -225,5 +225,5 @@ def test_dataset_shape(self):
         io.write_builder(self.builder)
         builder = io.read_builder()
         dset = builder['test_bucket']['foo_holder']['foo1']['my_data'].data
-        self.assertEqual(get_data_shape(dset), (10,))
+        self.assertEqual(get_data_shape(dset), (None,))
         io.close()
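The expected shape above changes from (10,) to (None,) because get_data_shape reports an h5py dataset's maxshape, which is (None,) for datasets written with the new expandable default. A small illustrative check (file name is arbitrary; not part of the patch):

    import h5py
    import numpy as np
    from hdmf.utils import get_data_shape

    with h5py.File("shape_check.h5", "w") as f:
        fixed = f.create_dataset("fixed", data=np.arange(10))
        grow = f.create_dataset("grow", data=np.arange(10), maxshape=(None,), chunks=True)
        print(get_data_shape(fixed))  # (10,)  -- maxshape equals the current shape
        print(get_data_shape(grow))   # (None,) -- the dimension is unlimited
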
diff --git a/tests/unit/test_io_hdf5_h5tools.py b/tests/unit/test_io_hdf5_h5tools.py
index 5a4fd5a32..2efea40d6 100644
--- a/tests/unit/test_io_hdf5_h5tools.py
+++ b/tests/unit/test_io_hdf5_h5tools.py
@@ -28,6 +28,7 @@
 from hdmf.testing import TestCase, remove_test_file
 from hdmf.common.resources import HERD
 from hdmf.term_set import TermSet, TermSetWrapper
+from hdmf.utils import get_data_shape
 
 from tests.unit.helpers.utils import (Foo, FooBucket, FooFile, get_foo_buildmanager,
@@ -163,6 +164,7 @@ def test_write_dataset_list(self):
         self.io.write_dataset(self.f, DatasetBuilder('test_dataset', a.tolist(), attributes={}))
         dset = self.f['test_dataset']
         self.assertTrue(np.all(dset[:] == a))
+        self.assertEqual(get_data_shape(dset), (None, None, None))
 
     def test_write_dataset_list_compress_gzip(self):
         a = H5DataIO(np.arange(30).reshape(5, 2, 3),
@@ -266,9 +268,9 @@ def test_write_dataset_list_fillvalue(self):
         self.assertEqual(dset.fillvalue, -1)
 
     ##########################################
-    # write_dataset tests: tables
+    # write_dataset tests: cmpd_dt
     ##########################################
-    def test_write_table(self):
+    def test_write_cmpd_dt(self):
         cmpd_dt = np.dtype([('a', np.int32), ('b', np.float64)])
         data = np.zeros(10, dtype=cmpd_dt)
         data['a'][1] = 101
@@ -279,8 +281,9 @@ def test_write_table(self):
         dset = self.f['test_dataset']
         self.assertEqual(dset['a'].tolist(), data['a'].tolist())
         self.assertEqual(dset['b'].tolist(), data['b'].tolist())
+        self.assertEqual(get_data_shape(dset), (None,))
 
-    def test_write_table_nested(self):
+    def test_write_cmpd_dt_nested(self):
         b_cmpd_dt = np.dtype([('c', np.int32), ('d', np.float64)])
         cmpd_dt = np.dtype([('a', np.int32), ('b', b_cmpd_dt)])
         data = np.zeros(10, dtype=cmpd_dt)
@@ -739,12 +742,12 @@ def test_copy_h5py_dataset_h5dataio_input(self):
                              self.f['test_copy'][:].tolist())
 
     def test_list_fill_empty(self):
-        dset = self.io.__list_fill__(self.f, 'empty_dataset', [], options={'dtype': int, 'io_settings': {}})
+        dset = self.io.__list_fill__(self.f, 'empty_dataset', [], True, options={'dtype': int, 'io_settings': {}})
         self.assertTupleEqual(dset.shape, (0,))
 
     def test_list_fill_empty_no_dtype(self):
         with self.assertRaisesRegex(Exception, r"cannot add \S+ to [/\S]+ - could not determine type"):
-            self.io.__list_fill__(self.f, 'empty_dataset', [])
+            self.io.__list_fill__(self.f, 'empty_dataset', [], True)
 
     def test_read_str(self):
         a = ['a', 'bb', 'ccc', 'dddd', 'e']
@@ -763,6 +766,28 @@ def test_read_str(self):
                          '')
 
 
+class TestExpand(TestCase):
+    def setUp(self):
+        self.manager = get_foo_buildmanager()
+        self.path = get_temp_filepath()
+
+    def test_expand_false(self):
+        # Setup all the data we need
+        foo1 = Foo('foo1', [1, 2, 3, 4, 5], "I am foo1", 17, 3.14)
+        foobucket = FooBucket('bucket1', [foo1])
+        foofile = FooFile(buckets=[foobucket])
+
+        with HDF5IO(self.path, manager=self.manager, mode='w') as io:
+            io.write(foofile, expandable=False)
+
+        io = HDF5IO(self.path, manager=self.manager, mode='r')
+        read_foofile = io.read()
+        self.assertListEqual(foofile.buckets['bucket1'].foos['foo1'].my_data,
+                             read_foofile.buckets['bucket1'].foos['foo1'].my_data[:].tolist())
+        self.assertEqual(get_data_shape(read_foofile.buckets['bucket1'].foos['foo1'].my_data),
+                         (5,))
+
+
 class TestRoundTrip(TestCase):
 
     def setUp(self):
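End to end, the new default means HDF5IO.write produces expandable, chunked datasets unless expandable=False is passed, mirroring the TestExpand test above. A usage sketch with hdmf.common containers (file names are arbitrary; not part of the patch):

    from hdmf.common import DynamicTable, VectorData, get_manager
    from hdmf.backends.hdf5 import HDF5IO

    table = DynamicTable(name='trials', description='example table',
                         columns=[VectorData(name='col1', description='a column', data=[1, 2, 3])])

    # New default: column datasets are written with maxshape=(None,) and chunking enabled.
    with HDF5IO('expandable_example.h5', manager=get_manager(), mode='w') as io:
        io.write(table)

    # Opt out: pass expandable=False to get fixed-shape datasets, as before this change.
    with HDF5IO('fixed_example.h5', manager=get_manager(), mode='w') as io:
        io.write(table, expandable=False)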