From db51abdaf838c3d502c4efb502268af6fb758d54 Mon Sep 17 00:00:00 2001 From: Oliver Ruebel Date: Thu, 15 Aug 2024 00:57:16 -0700 Subject: [PATCH 1/3] Fix conversion of Zarr string dataset to HDF5 --- src/hdmf/backends/hdf5/h5tools.py | 6 ++++++ src/hdmf/build/objectmapper.py | 12 ++++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/hdmf/backends/hdf5/h5tools.py b/src/hdmf/backends/hdf5/h5tools.py index 8135d75e7..23107e282 100644 --- a/src/hdmf/backends/hdf5/h5tools.py +++ b/src/hdmf/backends/hdf5/h5tools.py @@ -923,6 +923,12 @@ def __resolve_dtype__(cls, dtype, data): # TODO: These values exist, but I haven't solved them yet # binary # number + + # Use text dtype for Zarr datasets of strings. Zarr stores variable lenght strings + # as objects so we need to detect this special case here + if hasattr(data, 'attrs') and 'zarr_dtype' in data.attrs and data.attrs['zarr_dtype'] == 'str': + return cls.__dtypes['text'] + dtype = cls.__resolve_dtype_helper__(dtype) if dtype is None: dtype = cls.get_type(data) diff --git a/src/hdmf/build/objectmapper.py b/src/hdmf/build/objectmapper.py index b5815ee2c..5d96eeddf 100644 --- a/src/hdmf/build/objectmapper.py +++ b/src/hdmf/build/objectmapper.py @@ -209,10 +209,18 @@ def convert_dtype(cls, spec, value, spec_dtype=None): # noqa: C901 if (isinstance(value, np.ndarray) or (hasattr(value, 'astype') and hasattr(value, 'dtype'))): if spec_dtype_type is _unicode: - ret = value.astype('U') + if hasattr(value, 'attrs') and 'zarr_dtype' in value.attrs: + # Zarr stores strings as objects so we cannot convert to unicode dtype + ret = value + else: + ret = value.astype('U') ret_dtype = "utf8" elif spec_dtype_type is _ascii: - ret = value.astype('S') + if hasattr(value, 'attrs') and 'zarr_dtype' in value.attrs: + # Zarr stores strings as objects so we cannot convert to unicode dtype + ret = value + else: + ret = value.astype('S') ret_dtype = "ascii" else: dtype_func, warning_msg = 
cls.__resolve_numeric_dtype(value.dtype, spec_dtype_type) From b39d20f0e10eaf255c280629b03523fd1af256b4 Mon Sep 17 00:00:00 2001 From: Oliver Ruebel Date: Thu, 15 Aug 2024 01:02:47 -0700 Subject: [PATCH 2/3] Updated changelog --- CHANGELOG.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4a6369094..58be0d327 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,9 @@ ### Enhancements - Added support to append to a dataset of references for HDMF-Zarr. @mavaylon1 [#1157](https://github.com/hdmf-dev/hdmf/pull/1157) +### Bug fixes +- Fixed bug when converting string datasets from Zarr to HDF5. @oruebel [#1171](https://github.com/hdmf-dev/hdmf/pull/1171) + ## HDMF 3.14.3 (July 29, 2024) ### Enhancements @@ -18,10 +21,10 @@ is available on build (during the write process), but not on read of a dataset f - Warn when unexpected keys are present in specs. @rly [#1134](https://github.com/hdmf-dev/hdmf/pull/1134) - Support appending to zarr arrays. @mavaylon1 [#1136](https://github.com/hdmf-dev/hdmf/pull/1136) - Support specifying "value" key in DatasetSpec. @rly [#1143](https://github.com/hdmf-dev/hdmf/pull/1143) -- Add support for numpy 2. @rly [#1139](https://github.com/hdmf-dev/hdmf/pull/1139) +- Added support for numpy 2. @rly [#1139](https://github.com/hdmf-dev/hdmf/pull/1139) ### Bug fixes -- Fix iterator increment causing an extra +1 added after the end of completion. @CodyCBakerPhD [#1128](https://github.com/hdmf-dev/hdmf/pull/1128) +- Fixed iterator increment causing an extra +1 added after the end of completion. 
@CodyCBakerPhD [#1128](https://github.com/hdmf-dev/hdmf/pull/1128) ## HDMF 3.14.1 (June 6, 2024) From 6f759a64e1c02f5cacfaf88abe5659918ad66e73 Mon Sep 17 00:00:00 2001 From: Oliver Ruebel Date: Thu, 15 Aug 2024 01:05:32 -0700 Subject: [PATCH 3/3] Fix spelling --- src/hdmf/backends/hdf5/h5tools.py | 4 ++-- src/hdmf/build/objectmapper.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/hdmf/backends/hdf5/h5tools.py b/src/hdmf/backends/hdf5/h5tools.py index 23107e282..b0b18b952 100644 --- a/src/hdmf/backends/hdf5/h5tools.py +++ b/src/hdmf/backends/hdf5/h5tools.py @@ -924,8 +924,8 @@ def __resolve_dtype__(cls, dtype, data): # binary # number - # Use text dtype for Zarr datasets of strings. Zarr stores variable lenght strings - # as objects so we need to detect this special case here + # Use text dtype for Zarr datasets of strings. Zarr stores variable length strings + # as objects, so we need to detect this special case here if hasattr(data, 'attrs') and 'zarr_dtype' in data.attrs and data.attrs['zarr_dtype'] == 'str': return cls.__dtypes['text'] diff --git a/src/hdmf/build/objectmapper.py b/src/hdmf/build/objectmapper.py index 5d96eeddf..58dcf19f6 100644 --- a/src/hdmf/build/objectmapper.py +++ b/src/hdmf/build/objectmapper.py @@ -210,14 +210,14 @@ def convert_dtype(cls, spec, value, spec_dtype=None): # noqa: C901 (hasattr(value, 'astype') and hasattr(value, 'dtype'))): if spec_dtype_type is _unicode: if hasattr(value, 'attrs') and 'zarr_dtype' in value.attrs: - # Zarr stores strings as objects so we cannot convert to unicode dtype + # Zarr stores strings as objects, so we cannot convert to unicode dtype ret = value else: ret = value.astype('U') ret_dtype = "utf8" elif spec_dtype_type is _ascii: if hasattr(value, 'attrs') and 'zarr_dtype' in value.attrs: - # Zarr stores strings as objects so we cannot convert to unicode dtype + # Zarr stores strings as objects, so we cannot convert to ascii dtype ret = value else: ret = value.astype('S')