diff --git a/sgkit/io/vcf/vcf_reader.py b/sgkit/io/vcf/vcf_reader.py index e9dfc88e9..42fe63ea2 100644 --- a/sgkit/io/vcf/vcf_reader.py +++ b/sgkit/io/vcf/vcf_reader.py @@ -178,7 +178,7 @@ def _vcf_type_to_numpy( elif vcf_type == "Float": return "f4", FLOAT32_MISSING, FLOAT32_FILL elif vcf_type == "Character": - return "S1", CHAR_MISSING, CHAR_FILL + return "U1", CHAR_MISSING, CHAR_FILL elif vcf_type == "String": return "O", STR_MISSING, STR_FILL raise ValueError( @@ -188,7 +188,7 @@ def _vcf_type_to_numpy( def _is_str_or_char(array: ArrayLike) -> bool: """Return True if the array is of string or character type""" - return array.dtype.kind in ("O", "S", "U") + return array.dtype.kind in ("O", "U") class VcfFieldHandler: diff --git a/sgkit/io/vcf/vcf_writer_utils.py b/sgkit/io/vcf/vcf_writer_utils.py index b7dff029e..3ec165d6c 100644 --- a/sgkit/io/vcf/vcf_writer_utils.py +++ b/sgkit/io/vcf/vcf_writer_utils.py @@ -11,6 +11,7 @@ FLOAT32_MISSING_AS_INT32, INT_FILL, INT_MISSING, + STR_MISSING, ) COLON = ord(":") @@ -316,7 +317,8 @@ def vcf_values_to_byte_buf_size(a): elif a.dtype == np.float32: # values + separators return a.size * FLOAT32_BUF_SIZE + a.size - elif a.dtype.kind == "S": + elif a.dtype.kind == "U": + # NOTE: a "U" dtype stores 4 bytes per character, so itemsize is an upper bound on the UTF-8 encoded length (assumes the writer emits UTF-8; TODO confirm) # values + separators return a.size * a.dtype.itemsize + a.size else: @@ -502,8 +504,8 @@ def create_mask(arr): return np.all(arr == INT_MISSING, axis=axis) elif arr.dtype == np.float32: return np.all(arr.view("i4") == FLOAT32_MISSING_AS_INT32, axis=axis) - elif arr.dtype.kind == "S": - return np.all(arr == STR_MISSING_BYTE, axis=axis) + elif arr.dtype.kind == "U": + return np.all(arr == STR_MISSING, axis=axis) else: raise ValueError(f"Unsupported dtype: {arr.dtype}")