diff --git a/python/python/tests/test_file.py b/python/python/tests/test_file.py index 6206d34fa2..74281aa84c 100644 --- a/python/python/tests/test_file.py +++ b/python/python/tests/test_file.py @@ -436,3 +436,106 @@ def test_blob(tmp_path): reader = LanceFileReader(str(path)) assert len(reader.metadata().columns[0].pages) == 1 assert reader.read_all().to_table() == pa.table({"val": vals}) + + +def test_enum_vs_categorical(tmp_path): + # Helper method to make two dict arrays, with same dictionary values + # but different indices + def make_tbls(values, indices1, indices2): + # Need to make two separate dictionaries here or else arrow-rs won't concat + d1 = pa.array(values, pa.string()) + d2 = pa.array(values, pa.string()) + i1 = pa.array(indices1, pa.int16()) + i2 = pa.array(indices2, pa.int16()) + + dict1 = pa.DictionaryArray.from_arrays(i1, d1) + dict2 = pa.DictionaryArray.from_arrays(i2, d2) + tab1 = pa.table({"dictionary": dict1}) + tab2 = pa.table({"dictionary": dict2}) + return tab1, tab2 + + # Helper method to round trip two tables through lance and return the decoded + # dictionary array + def round_trip_dict(tab1: pa.Table, tab2: pa.Table) -> pa.DictionaryArray: + with LanceFileWriter(tmp_path / "categorical.lance") as writer: + writer.write_batch(tab1) + writer.write_batch(tab2) + + reader = LanceFileReader(tmp_path / "categorical.lance") + round_tripped = reader.read_all().to_table() + + arr2 = round_tripped.column("dictionary").chunk(0).dictionary + return arr2 + + # Helper method to convert a table with dictionary array into a table with + # enum array + def enumify(tbl) -> pa.Table: + categories = ",".join(tbl.column(0).chunk(0).dictionary.to_pylist()) + enum_schema = pa.schema( + [ + pa.field( + "dictionary", + pa.dictionary(pa.int16(), pa.string()), + metadata={ + "ARROW:extension:name": "polars.enum", + "ARROW:extension:metadata": '{"categories": [' + + categories + + "]}", + }, + ) + ] + ) + return pa.table([tbl.column(0)], schema=enum_schema) + + tab1, tab2 = make_tbls( + ["blue", "red", "green", "yellow"], + [0, 1, 0, 1, 0, 1], + [1, 2, 1, 2, 1, 2], + ) + + round_trip = round_trip_dict(tab1, tab2) + + # Sometimes array concatenation will just concatenate the dictionaries + assert round_trip.to_pylist() == [ + "blue", + "red", + "green", + "yellow", + "blue", + "red", + "green", + "yellow", + ] + + tab1 = enumify(tab1) + tab2 = enumify(tab2) + + round_trip = round_trip_dict(tab1, tab2) + + # However, there should be no concatenation with the enum type + assert round_trip.to_pylist() == [ + "blue", + "red", + "green", + "yellow", + ] + + tab1, tab2 = make_tbls( + [str(i) for i in range(1000)], + list(range(500)), + list(range(500, 900)), + ) + + round_trip = round_trip_dict(tab1, tab2) + + # Other times array concatenation will combine the + # dictionaries and remove unused items + assert round_trip.to_pylist() == [str(i) for i in range(900)] + + # Again, no concatenation with enum type + tab1 = enumify(tab1) + tab2 = enumify(tab2) + + round_trip = round_trip_dict(tab1, tab2) + + assert round_trip.to_pylist() == [str(i) for i in range(1000)] diff --git a/rust/lance-arrow/Cargo.toml b/rust/lance-arrow/Cargo.toml index 8a7850f1a3..3685b55e0b 100644 --- a/rust/lance-arrow/Cargo.toml +++ b/rust/lance-arrow/Cargo.toml @@ -23,6 +23,8 @@ arrow-select = { workspace = true } half = { workspace = true } num-traits = { workspace = true } rand.workspace = true +serde = { workspace = true } +serde_json = { workspace = true } [target.'cfg(target_arch = "wasm32")'.dependencies] 
 getrandom = { version = "0.2", features = ["js"] }
diff --git a/rust/lance-arrow/src/dict_enum.rs b/rust/lance-arrow/src/dict_enum.rs
new file mode 100644
index 0000000000..c39503a7ab
--- /dev/null
+++ b/rust/lance-arrow/src/dict_enum.rs
@@ -0,0 +1,146 @@
+use std::sync::Arc;
+
+use arrow_array::{cast::AsArray, Array, StringArray};
+use arrow_schema::{ArrowError, Field as ArrowField};
+use serde::{Deserialize, Serialize};
+
+use crate::{
+    bfloat16::{ARROW_EXT_META_KEY, ARROW_EXT_NAME_KEY},
+    DataTypeExt, Result,
+};
+
+const ENUM_TYPE: &str = "polars.enum";
+
+// TODO: Could be slightly more efficient to use custom JSON serialization
+// to go straight from JSON to StringArray without the Vec intermediate
+// but this is fine for now
+#[derive(Deserialize, Serialize)]
+struct DictionaryEnumMetadata {
+    categories: Vec<String>,
+}
+
+pub struct DictionaryEnumType {
+    pub categories: Arc<dyn Array>,
+}
+
+impl DictionaryEnumType {
+    /// Adds extension type metadata to the given field
+    ///
+    /// Fails if the field is already an extension type of some kind
+    pub fn wrap_field(&self, field: &ArrowField) -> Result<ArrowField> {
+        let mut metadata = field.metadata().clone();
+        if metadata.contains_key(ARROW_EXT_NAME_KEY) {
+            return Err(ArrowError::InvalidArgumentError(
+                "Field already has extension metadata".to_string(),
+            ));
+        }
+        metadata.insert(ARROW_EXT_NAME_KEY.to_string(), ENUM_TYPE.to_string());
+        metadata.insert(
+            ARROW_EXT_META_KEY.to_string(),
+            serde_json::to_string(&DictionaryEnumMetadata {
+                categories: self
+                    .categories
+                    .as_any()
+                    .downcast_ref::<StringArray>()
+                    .unwrap()
+                    .values()
+                    .iter()
+                    .map(|x| x.to_string())
+                    .collect(),
+            })
+            .unwrap(),
+        );
+        Ok(field.clone().with_metadata(metadata))
+    }
+
+    /// Creates a new enum type from the given dictionary array
+    ///
+    /// # Arguments
+    ///
+    /// * `arr` - The dictionary array to create the enum type from
+    ///
+    /// # Errors
+    ///
+    /// An error is returned if the array is not a dictionary array or if the dictionary
+    /// array does not have string values
+    pub fn from_dict_array(arr: &dyn Array) -> Result<Self> {
+        let arr = arr.as_any_dictionary_opt().ok_or_else(|| {
+            ArrowError::InvalidArgumentError(
+                "Expected a dictionary array for enum type".to_string(),
+            )
+        })?;
+        if !arr.values().data_type().is_binary_like() {
+            Err(ArrowError::InvalidArgumentError(
+                "Expected a dictionary array with string values for enum type".to_string(),
+            ))
+        } else {
+            Ok(Self {
+                categories: Arc::new(arr.values().clone()),
+            })
+        }
+    }
+
+    /// Attempts to parse the type from the given field
+    ///
+    /// If the field is not an enum type then None is returned
+    ///
+    /// Errors can occur if the field is an enum type but the metadata
+    /// is not correctly formatted
+    ///
+    /// # Arguments
+    ///
+    /// * `field` - The field to parse
+    /// * `sample_arr` - An optional sample array. If provided then categories will be extracted
+    /// from this array, avoiding the need to parse the metadata. This array should be a dictionary
+    /// array where the dictionary items are the categories.
+    ///
+    /// The sample_arr is only used if the field is an enum type. E.g. it is safe to do something
+    /// like:
+    ///
+    /// ```ignore
+    /// let arr = batch.column(0);
+    /// let field = batch.schema().field(0);
+    /// let enum_type = DictionaryEnumType::from_field(field, Some(arr));
+    /// ```
+    pub fn from_field(
+        field: &ArrowField,
+        sample_arr: Option<&Arc<dyn Array>>,
+    ) -> Result<Option<Self>> {
+        if field
+            .metadata()
+            .get(ARROW_EXT_NAME_KEY)
+            .map(|k| k.eq_ignore_ascii_case(ENUM_TYPE))
+            .unwrap_or(false)
+        {
+            // Prefer extracting values from the first array if possible as it's cheaper
+            if let Some(arr) = sample_arr {
+                let dict_arr = arr.as_any_dictionary_opt().ok_or_else(|| {
+                    ArrowError::InvalidArgumentError(
+                        "Expected a dictionary array for enum type".to_string(),
+                    )
+                })?;
+                Ok(Some(Self {
+                    categories: dict_arr.values().clone(),
+                }))
+            } else {
+                // No arrays, need to use the field metadata
+                let meta = field.metadata().get(ARROW_EXT_META_KEY).ok_or_else(|| {
+                    ArrowError::InvalidArgumentError(format!(
+                        "Field {} is missing extension metadata",
+                        field.name()
+                    ))
+                })?;
+                let meta: DictionaryEnumMetadata = serde_json::from_str(meta).map_err(|e| {
+                    ArrowError::InvalidArgumentError(format!(
+                        "Arrow extension metadata for enum was not correctly formed: {}",
+                        e
+                    ))
+                })?;
+                let categories = Arc::new(StringArray::from_iter_values(meta.categories));
+                Ok(Some(Self { categories }))
+            }
+        } else {
+            Ok(None)
+        }
+    }
+}
diff --git a/rust/lance-arrow/src/lib.rs b/rust/lance-arrow/src/lib.rs
index 903a662178..01e2f446ee 100644
--- a/rust/lance-arrow/src/lib.rs
+++ b/rust/lance-arrow/src/lib.rs
@@ -19,6 +19,7 @@ use arrow_select::{interleave::interleave, take::take};
 use rand::prelude::*;
 
 pub mod deepcopy;
+pub mod dict_enum;
 pub mod schema;
 pub use schema::*;
 pub mod bfloat16;
diff --git a/rust/lance-encoding/src/data.rs b/rust/lance-encoding/src/data.rs
index 4291bd48b9..9825ad1556 100644
--- a/rust/lance-encoding/src/data.rs
+++ b/rust/lance-encoding/src/data.rs
@@ -21,10 +21,10 @@ use std::{
 };
 
 use arrow::array::{ArrayData, ArrayDataBuilder, AsArray};
-use arrow_array::{new_empty_array, new_null_array, Array, ArrayRef, UInt64Array};
+use arrow_array::{make_array, new_empty_array, new_null_array, Array, ArrayRef, UInt64Array};
 use arrow_buffer::{ArrowNativeType, BooleanBuffer, BooleanBufferBuilder, NullBuffer};
-use arrow_schema::DataType;
-use lance_arrow::DataTypeExt;
+use arrow_schema::{DataType, Field};
+use lance_arrow::{dict_enum::DictionaryEnumType, DataTypeExt};
 use snafu::{location, Location};
 
 use lance_core::{Error, Result};
@@ -960,7 +960,7 @@ fn encode_bitmap_data(arrays: &[ArrayRef], num_values: u64) -> LanceBuffer {
 
 // Concatenate dictionary arrays. This is a bit tricky because we might overflow the
 // index type. If we do, we need to upscale the indices to a larger type.
-fn concat_dict_arrays(arrays: &[ArrayRef]) -> ArrayRef {
+fn concat_dictionary_categorical(arrays: &[ArrayRef]) -> ArrayRef {
     let value_type = arrays[0].as_any_dictionary().values().data_type();
     let array_refs = arrays.iter().map(|arr| arr.as_ref()).collect::<Vec<_>>();
     match arrow_select::concat::concat(&array_refs) {
@@ -1015,6 +1015,34 @@ fn max_index_val(index_type: &DataType) -> u64 {
     }
 }
 
+fn concat_dictionary_enum(arrays: &[ArrayRef], enum_values: &dyn Array) -> Arc<dyn Array> {
+    // If this is an enum type then all arrays should have the same dictionary
+    //
+    // This is a bit of an expensive check but it seems worthwhile to ensure correctness. Could maybe
+    // add some kind of "unsafe" path at some point.
+    //
+    // If the values are not equivalent then we'd be silently writing corrupt data.
+ assert!(arrays + .iter() + .all(|arr| arr.as_any_dictionary().values().as_ref() == enum_values)); + let indices_array_refs = arrays + .iter() + .map(|arr| arr.as_any_dictionary().keys()) + .collect::>(); + let combined_indices = arrow_select::concat::concat(&indices_array_refs).unwrap(); + + let dict_type = arrays[0].data_type().clone(); + let dict_arr_data = combined_indices + .to_data() + .into_builder() + .data_type(dict_type) + .add_child_data(enum_values.to_data()) + .build() + .unwrap(); + + make_array(dict_arr_data) +} + // If we get multiple dictionary arrays and they don't all have the same dictionary // then we need to normalize the indices. Otherwise we might have something like: // @@ -1033,8 +1061,17 @@ fn max_index_val(index_type: &DataType) -> u64 { // // In addition, we want to normalize the representation of nulls. The cheapest thing to // do (space-wise) is to put the nulls in the dictionary. -fn arrow_dictionary_to_data_block(arrays: &[ArrayRef], validity: Option) -> DataBlock { - let array = concat_dict_arrays(arrays); +fn arrow_dictionary_to_data_block( + arrays: &[ArrayRef], + validity: Option, + field: &Field, +) -> DataBlock { + let array = + if let Some(enum_type) = DictionaryEnumType::from_field(field, arrays.first()).unwrap() { + concat_dictionary_enum(arrays, &enum_type.categories) + } else { + concat_dictionary_categorical(arrays) + }; let array_dict = array.as_any_dictionary(); let mut indices = array_dict.keys(); let num_values = indices.len() as u64; @@ -1103,7 +1140,12 @@ fn arrow_dictionary_to_data_block(arrays: &[ArrayRef], validity: Option Nullability { } impl DataBlock { - pub fn from_arrays(arrays: &[ArrayRef], num_values: u64) -> Self { + /// Convert a slice of Arrow arrays into a DataBlock + /// + /// This process should not fail but will panic if any of the arrays are not internally + /// consistent with the field. 
+ pub fn from_arrays(arrays: &[ArrayRef], num_values: u64, field: &Field) -> Self { + let data_type = field.data_type(); + + debug_assert!(arrays + .iter() + .all(|arr| arr.data_type() == field.data_type())); + if arrays.is_empty() || num_values == 0 { return Self::AllNull(AllNullDataBlock { num_values: 0 }); } - let data_type = arrays[0].data_type(); let nulls = extract_nulls(arrays, num_values); if let Nullability::All = nulls { @@ -1217,25 +1268,28 @@ impl DataBlock { }) } DataType::Null => Self::AllNull(AllNullDataBlock { num_values }), - DataType::Dictionary(_, _) => arrow_dictionary_to_data_block(arrays, nulls.to_option()), + DataType::Dictionary(_, _) => { + arrow_dictionary_to_data_block(arrays, nulls.to_option(), field) + } DataType::Struct(fields) => { let structs = arrays.iter().map(|arr| arr.as_struct()).collect::>(); let mut children = Vec::with_capacity(fields.len()); - for child_idx in 0..fields.len() { + for (child_idx, child_field) in fields.iter().enumerate() { let child_vec = structs .iter() .map(|s| s.column(child_idx).clone()) .collect::>(); - children.push(Self::from_arrays(&child_vec, num_values)); + children.push(Self::from_arrays(&child_vec, num_values, child_field)); } Self::Struct(StructDataBlock { children }) } - DataType::FixedSizeList(_, dim) => { + DataType::FixedSizeList(child_field, dim) => { let children = arrays .iter() .map(|arr| arr.as_fixed_size_list().values().clone()) .collect::>(); - let child_block = Self::from_arrays(&children, num_values * *dim as u64); + let child_block = + Self::from_arrays(&children, num_values * *dim as u64, &child_field); Self::FixedSizeList(FixedSizeListBlock { child: Box::new(child_block), dimension: *dim as u64, @@ -1271,16 +1325,9 @@ impl DataBlock { } } - pub fn from_array(array: T) -> Self { + pub fn from_array(array: T, field: &Field) -> Self { let num_values = array.len(); - Self::from_arrays(&[Arc::new(array)], num_values as u64) - } -} - -impl From for DataBlock { - fn from(array: ArrayRef) -> Self { - let num_values = array.len() as u64; - Self::from_arrays(&[array], num_values) + Self::from_arrays(&[Arc::new(array)], num_values as u64, field) } } @@ -1290,10 +1337,13 @@ mod tests { use arrow::datatypes::{Int32Type, Int8Type}; use arrow_array::{ - ArrayRef, DictionaryArray, Int8Array, LargeBinaryArray, StringArray, UInt8Array, + ArrayRef, DictionaryArray, Int8Array, LargeBinaryArray, StringArray, UInt16Array, + UInt8Array, }; use arrow_buffer::{BooleanBuffer, NullBuffer}; + use arrow_schema::{DataType, Field}; + use lance_arrow::dict_enum::DictionaryEnumType; use lance_datagen::{array, ArrayGeneratorExt, RowCount, DEFAULT_SEED}; use rand::SeedableRng; @@ -1309,13 +1359,14 @@ mod tests { let strings1 = StringArray::from(vec![Some("hello"), None, Some("world")]); let strings2 = StringArray::from(vec![Some("a"), Some("b")]); let strings3 = StringArray::from(vec![Option::<&'static str>::None, None]); + let field = Field::new("", DataType::Utf8, true); let arrays = &[strings1, strings2, strings3] .iter() .map(|arr| Arc::new(arr.clone()) as ArrayRef) .collect::>(); - let block = DataBlock::from_arrays(arrays, 7); + let block = DataBlock::from_arrays(arrays, 7, &field); assert_eq!(block.num_values(), 7); let block = block.as_nullable().unwrap(); @@ -1339,7 +1390,7 @@ mod tests { .map(|arr| Arc::new(arr.clone()) as ArrayRef) .collect::>(); - let block = DataBlock::from_arrays(arrays, 3); + let block = DataBlock::from_arrays(arrays, 3, &field); assert_eq!(block.num_values(), 3); // Should be no nullable wrapper @@ -1350,13 
+1401,15 @@ mod tests { #[test] fn test_string_sliced() { + let field = Field::new("", DataType::Utf8, true); + let check = |arr: Vec, expected_off: Vec, expected_data: &[u8]| { let arrs = arr .into_iter() .map(|a| Arc::new(a) as ArrayRef) .collect::>(); let num_rows = arrs.iter().map(|a| a.len()).sum::() as u64; - let data = DataBlock::from_arrays(&arrs, num_rows); + let data = DataBlock::from_arrays(&arrs, num_rows, &field); assert_eq!(data.num_values(), num_rows); @@ -1384,8 +1437,9 @@ mod tests { #[test] fn test_large() { + let field = Field::new("", DataType::LargeBinary, true); let arr = LargeBinaryArray::from_vec(vec![b"hello", b"world"]); - let data = DataBlock::from_array(arr); + let data = DataBlock::from_array(arr, &field); assert_eq!(data.num_values(), 2); let data = data.as_variable_width().unwrap(); @@ -1400,10 +1454,15 @@ mod tests { #[test] fn test_dictionary_indices_normalized() { + let field = Field::new( + "", + DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)), + true, + ); let arr1 = DictionaryArray::::from_iter([Some("a"), Some("a"), Some("b")]); let arr2 = DictionaryArray::::from_iter([Some("b"), Some("c")]); - let data = DataBlock::from_arrays(&[Arc::new(arr1), Arc::new(arr2)], 5); + let data = DataBlock::from_arrays(&[Arc::new(arr1), Arc::new(arr2)], 5, &field); assert_eq!(data.num_values(), 5); let data = data.as_dictionary().unwrap(); @@ -1430,13 +1489,19 @@ mod tests { #[test] fn test_dictionary_nulls() { + let field = Field::new( + "", + DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)), + true, + ); + // Test both ways of encoding nulls // By default, nulls get encoded into the indices let arr1 = DictionaryArray::::from_iter([None, Some("a"), Some("b")]); let arr2 = DictionaryArray::::from_iter([Some("c"), None]); - let data = DataBlock::from_arrays(&[Arc::new(arr1), Arc::new(arr2)], 5); + let data = DataBlock::from_arrays(&[Arc::new(arr1), Arc::new(arr2)], 5, &field); let check_common = |data: DataBlock| { assert_eq!(data.num_values(), 5); @@ -1471,7 +1536,7 @@ mod tests { let indices = Int8Array::from(vec![Some(3), Some(0), Some(1), Some(2), Some(3)]); let dict = DictionaryArray::new(indices, Arc::new(items)); - let data = DataBlock::from_array(dict); + let data = DataBlock::from_array(dict, &field); println!("Check two"); check_common(data); @@ -1479,6 +1544,11 @@ mod tests { #[test] fn test_dictionary_cannot_add_null() { + let field = Field::new( + "", + DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Utf8)), + true, + ); // 256 unique strings let items = StringArray::from( (0..256) @@ -1494,7 +1564,7 @@ mod tests { // We want to normalize this by pushing nulls into the dictionary, but we cannot because // the dictionary is too large for the index type let dict = DictionaryArray::new(indices, Arc::new(items)); - let data = DataBlock::from_array(dict); + let data = DataBlock::from_array(dict, &field); assert_eq!(data.num_values(), 257); @@ -1525,6 +1595,12 @@ mod tests { #[test] fn test_dictionary_cannot_concatenate() { + let field = Field::new( + "", + DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Utf8)), + true, + ); + // 256 unique strings let items = StringArray::from( (0..256) @@ -1540,7 +1616,7 @@ mod tests { let indices = UInt8Array::from_iter_values(0..=255); let dict1 = DictionaryArray::new(indices.clone(), Arc::new(items)); let dict2 = DictionaryArray::new(indices, Arc::new(other_items)); - let data = DataBlock::from_arrays(&[Arc::new(dict1), Arc::new(dict2)], 
512); + let data = DataBlock::from_arrays(&[Arc::new(dict1), Arc::new(dict2)], 512, &field); assert_eq!(data.num_values(), 512); let dict = data.as_dictionary().unwrap(); @@ -1557,24 +1633,60 @@ mod tests { ); } + #[test] + fn test_enum_dictionary() { + let field = Field::new( + "", + DataType::Dictionary(Box::new(DataType::UInt16), Box::new(DataType::Utf8)), + true, + ); + let items = Arc::new(StringArray::from_iter_values( + (0..1000).map(|s| s.to_string()), + )); + let items2 = Arc::new(StringArray::from_iter_values( + (0..1000).map(|s| s.to_string()), + )); + + let indices1 = UInt16Array::from_iter_values(0..500); + let indices2 = UInt16Array::from_iter_values(500..999); + let dict1 = Arc::new(DictionaryArray::new(indices1, items)); + let dict2 = Arc::new(DictionaryArray::new(indices2, items2)); + let data = DataBlock::from_arrays(&[dict1.clone(), dict2.clone()], 1000, &field); + let data = data.as_dictionary().unwrap(); + + // Concatenating the dictionaries will drop the unused value ("999") + assert_eq!(data.dictionary.num_values(), 999); + + // However, if the input arrays are "enum" arrays, then we should not lose any + // values + let enum_type = DictionaryEnumType::from_dict_array(dict1.as_ref()).unwrap(); + let enum_field = enum_type.wrap_field(&field).unwrap(); + + let data = DataBlock::from_arrays(&[dict1.clone(), dict2.clone()], 1000, &enum_field); + let data = data.as_dictionary().unwrap(); + assert_eq!(data.dictionary.num_values(), 1000); + } + #[test] fn test_data_size() { + let field = Field::new("", DataType::Int32, true); + let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0); // test data_size() when input has no nulls let mut gen = array::rand::().with_nulls(&[false, false, false]); let arr = gen.generate(RowCount::from(3), &mut rng).unwrap(); - let block = DataBlock::from_array(arr.clone()); + let block = DataBlock::from_array(arr.clone(), &field); assert!(block.data_size() == arr.get_buffer_memory_size() as u64); let arr = gen.generate(RowCount::from(400), &mut rng).unwrap(); - let block = DataBlock::from_array(arr.clone()); + let block = DataBlock::from_array(arr.clone(), &field); assert!(block.data_size() == arr.get_buffer_memory_size() as u64); // test data_size() when input has nulls let mut gen = array::rand::().with_nulls(&[false, true, false]); let arr = gen.generate(RowCount::from(3), &mut rng).unwrap(); - let block = DataBlock::from_array(arr.clone()); + let block = DataBlock::from_array(arr.clone(), &field); let array_data = arr.to_data(); let total_buffer_size: usize = array_data.buffers().iter().map(|buffer| buffer.len()).sum(); @@ -1583,7 +1695,7 @@ mod tests { assert!(block.data_size() == (total_buffer_size + array_nulls_size_in_bytes) as u64); let arr = gen.generate(RowCount::from(400), &mut rng).unwrap(); - let block = DataBlock::from_array(arr.clone()); + let block = DataBlock::from_array(arr.clone(), &field); let array_data = arr.to_data(); let total_buffer_size: usize = array_data.buffers().iter().map(|buffer| buffer.len()).sum(); @@ -1592,7 +1704,7 @@ mod tests { let mut gen = array::rand::().with_nulls(&[true, true, false]); let arr = gen.generate(RowCount::from(3), &mut rng).unwrap(); - let block = DataBlock::from_array(arr.clone()); + let block = DataBlock::from_array(arr.clone(), &field); let array_data = arr.to_data(); let total_buffer_size: usize = array_data.buffers().iter().map(|buffer| buffer.len()).sum(); @@ -1600,7 +1712,7 @@ mod tests { assert!(block.data_size() == (total_buffer_size + 
array_nulls_size_in_bytes) as u64); let arr = gen.generate(RowCount::from(400), &mut rng).unwrap(); - let block = DataBlock::from_array(arr.clone()); + let block = DataBlock::from_array(arr.clone(), &field); let array_data = arr.to_data(); let total_buffer_size: usize = array_data.buffers().iter().map(|buffer| buffer.len()).sum(); @@ -1611,7 +1723,7 @@ mod tests { let arr1 = gen.generate(RowCount::from(3), &mut rng).unwrap(); let arr2 = gen.generate(RowCount::from(3), &mut rng).unwrap(); let arr3 = gen.generate(RowCount::from(3), &mut rng).unwrap(); - let block = DataBlock::from_arrays(&[arr1.clone(), arr2.clone(), arr3.clone()], 9); + let block = DataBlock::from_arrays(&[arr1.clone(), arr2.clone(), arr3.clone()], 9, &field); let concatenated_array = concat(&[ &*Arc::new(arr1.clone()) as &dyn Array, diff --git a/rust/lance-encoding/src/encoder.rs b/rust/lance-encoding/src/encoder.rs index 4d72407d02..94ff43c742 100644 --- a/rust/lance-encoding/src/encoder.rs +++ b/rust/lance-encoding/src/encoder.rs @@ -4,7 +4,7 @@ use std::{collections::HashMap, env, sync::Arc}; use arrow::array::AsArray; use arrow_array::{Array, ArrayRef, RecordBatch, UInt8Array}; -use arrow_schema::DataType; +use arrow_schema::{DataType, Field as ArrowField}; use bytes::{Bytes, BytesMut}; use futures::future::BoxFuture; use lance_arrow::DataTypeExt; @@ -461,18 +461,18 @@ impl CoreArrayEncodingStrategy { fn default_binary_encoder( arrays: &[ArrayRef], - data_type: &DataType, - field_meta: Option<&HashMap>, + field: &ArrowField, data_size: u64, version: LanceFileVersion, ) -> Result> { + let indices_field = ArrowField::new("", DataType::UInt64, false); let bin_indices_encoder = - Self::choose_array_encoder(arrays, &DataType::UInt64, data_size, false, version, None)?; + Self::choose_array_encoder(arrays, &indices_field, data_size, false, version)?; - let compression = field_meta.and_then(Self::get_field_compression); + let compression = Self::get_field_compression(field.metadata()); let bin_encoder = Box::new(BinaryEncoder::new(bin_indices_encoder, compression)); - if compression.is_none() && Self::can_use_fsst(data_type, data_size, version) { + if compression.is_none() && Self::can_use_fsst(field.data_type(), data_size, version) { Ok(Box::new(FsstArrayEncoder::new(bin_encoder))) } else { Ok(bin_encoder) @@ -481,60 +481,56 @@ impl CoreArrayEncodingStrategy { fn choose_array_encoder( arrays: &[ArrayRef], - data_type: &DataType, + field: &ArrowField, data_size: u64, use_dict_encoding: bool, version: LanceFileVersion, - field_meta: Option<&HashMap>, ) -> Result> { - match data_type { + match field.data_type() { DataType::FixedSizeList(inner, dimension) => { Ok(Box::new(BasicEncoder::new(Box::new(FslEncoder::new( Self::choose_array_encoder( arrays, - inner.data_type(), + inner.as_ref(), data_size, use_dict_encoding, version, - None, )?, *dimension as u32, ))))) } DataType::Dictionary(key_type, value_type) => { + let key_field = ArrowField::new("", key_type.as_ref().clone(), true); + // TODO: arrow-rs doesn't keep metadata for value type in dictionary + // We assume the metadata of the field applies to the values and not the dictionary itself + let value_field = ArrowField::new("", value_type.as_ref().clone(), true); let key_encoder = - Self::choose_array_encoder(arrays, key_type, data_size, false, version, None)?; - let value_encoder = Self::choose_array_encoder( - arrays, value_type, data_size, false, version, None, - )?; + Self::choose_array_encoder(arrays, &key_field, data_size, false, version)?; + let value_encoder = + 
Self::choose_array_encoder(arrays, &value_field, data_size, false, version)?; Ok(Box::new(AlreadyDictionaryEncoder::new( key_encoder, value_encoder, + field.clone(), ))) } DataType::Utf8 | DataType::LargeUtf8 | DataType::Binary | DataType::LargeBinary => { if use_dict_encoding { + let dict_indices_field = ArrowField::new("", DataType::UInt8, false); let dict_indices_encoder = Self::choose_array_encoder( // We need to pass arrays to this method to figure out what kind of compression to // use but we haven't actually calculated the indices yet. For now, we just assume // worst case and use the full range. In the future maybe we can pass in statistics // instead of the actual data &[Arc::new(UInt8Array::from_iter_values(0_u8..255_u8))], - &DataType::UInt8, + &dict_indices_field, data_size, false, version, - None, - )?; - let dict_items_encoder = Self::choose_array_encoder( - arrays, - &DataType::Utf8, - data_size, - false, - version, - None, )?; + let dict_items_encoder = + Self::choose_array_encoder(arrays, &field, data_size, false, version)?; Ok(Box::new(DictionaryEncoder::new( dict_indices_encoder, @@ -545,41 +541,36 @@ impl CoreArrayEncodingStrategy { // The variable 'data_type' is passed through recursion so comparing with it would be incorrect else if BINARY_DATATYPES.contains(arrays[0].data_type()) { if let Some(byte_width) = check_fixed_size_encoding(arrays, version) { + let bytes_field = ArrowField::new("", DataType::UInt8, false); // use FixedSizeBinaryEncoder let bytes_encoder = Self::choose_array_encoder( arrays, - &DataType::UInt8, + &bytes_field, data_size, false, version, - None, )?; Ok(Box::new(BasicEncoder::new(Box::new( FixedSizeBinaryEncoder::new(bytes_encoder, byte_width as usize), )))) } else { - Self::default_binary_encoder( - arrays, data_type, field_meta, data_size, version, - ) + Self::default_binary_encoder(arrays, field, data_size, version) } } else { - Self::default_binary_encoder(arrays, data_type, field_meta, data_size, version) + Self::default_binary_encoder(arrays, field, data_size, version) } } DataType::Struct(fields) => { - let num_fields = fields.len(); let mut inner_encoders = Vec::new(); - for i in 0..num_fields { - let inner_datatype = fields[i].data_type(); + for field in fields { let inner_encoder = Self::choose_array_encoder( arrays, - inner_datatype, + field, data_size, use_dict_encoding, version, - None, )?; inner_encoders.push(inner_encoder); } @@ -587,11 +578,11 @@ impl CoreArrayEncodingStrategy { Ok(Box::new(PackedStructEncoder::new(inner_encoders))) } DataType::UInt8 | DataType::UInt16 | DataType::UInt32 | DataType::UInt64 => { - if version >= LanceFileVersion::V2_1 && arrays[0].data_type() == data_type { + if version >= LanceFileVersion::V2_1 && arrays[0].data_type() == field.data_type() { let compressed_bit_width = compute_compressed_bit_width_for_non_neg(arrays); Ok(Box::new(BitpackedForNonNegArrayEncoder::new( compressed_bit_width as usize, - data_type.clone(), + field.data_type().clone(), ))) } else { Ok(Box::new(BasicEncoder::new(Box::new( @@ -604,11 +595,11 @@ impl CoreArrayEncodingStrategy { // then a bitpacked array for the narrow(bit-width) values, I need `BitpackedForNeg` to be merged first, I am // thinking about putting this sparse array in the metadata so bitpacking remain using one page buffer only. 
DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 => { - if version >= LanceFileVersion::V2_1 && arrays[0].data_type() == data_type { + if version >= LanceFileVersion::V2_1 && arrays[0].data_type() == field.data_type() { let compressed_bit_width = compute_compressed_bit_width_for_non_neg(arrays); Ok(Box::new(BitpackedForNonNegArrayEncoder::new( compressed_bit_width as usize, - data_type.clone(), + field.data_type().clone(), ))) } else { Ok(Box::new(BasicEncoder::new(Box::new( @@ -744,18 +735,18 @@ impl ArrayEncodingStrategy for CoreArrayEncodingStrategy { .iter() .map(|arr| arr.get_buffer_memory_size() as u64) .sum::(); - let data_type = arrays[0].data_type(); - let use_dict_encoding = data_type == &DataType::Utf8 + let use_dict_encoding = field.data_type() == DataType::Utf8 && check_dict_encoding(arrays, get_dict_encoding_threshold()); + let arrow_field = ArrowField::from(field); + Self::choose_array_encoder( arrays, - data_type, + &arrow_field, data_size, use_dict_encoding, self.version, - Some(&field.metadata), ) } } diff --git a/rust/lance-encoding/src/encodings/logical/primitive.rs b/rust/lance-encoding/src/encodings/logical/primitive.rs index d23e1dc6b8..6cfdb19201 100644 --- a/rust/lance-encoding/src/encodings/logical/primitive.rs +++ b/rust/lance-encoding/src/encodings/logical/primitive.rs @@ -508,10 +508,11 @@ impl PrimitiveFieldEncoder { .create_array_encoder(&arrays, &self.field)?; let column_idx = self.column_index; let data_type = self.field.data_type(); + let arrow_field = arrow_schema::Field::from(&self.field); Ok(tokio::task::spawn(async move { let num_values = arrays.iter().map(|arr| arr.len() as u64).sum(); - let data = DataBlock::from_arrays(&arrays, num_values); + let data = DataBlock::from_arrays(&arrays, num_values, &arrow_field); let mut buffer_index = 0; let array = encoder.encode(data, &data_type, &mut buffer_index)?; let (data, description) = array.into_buffers(); @@ -834,7 +835,8 @@ impl PrimitiveStructuralEncoder { // and potentially more decoder asymmetry. 
However, it may be worth // investigating at some point - let data = DataBlock::from_arrays(&arrays, num_values); + let arrow_field = arrow_schema::Field::from(field); + let data = DataBlock::from_arrays(&arrays, num_values, &arrow_field); let num_values = data.num_values(); // The validity is encoded in repdef so we can remove it let data = data.remove_validity(); diff --git a/rust/lance-encoding/src/encodings/physical/binary.rs b/rust/lance-encoding/src/encodings/physical/binary.rs index 91dc5f133d..7ab01000eb 100644 --- a/rust/lance-encoding/src/encodings/physical/binary.rs +++ b/rust/lance-encoding/src/encodings/physical/binary.rs @@ -560,7 +560,10 @@ pub mod tests { None, None, ])) as ArrayRef; - let string_data = DataBlock::from(string_array).as_nullable().unwrap(); + let field = Field::new("", DataType::Utf8, true); + let string_data = DataBlock::from_array(string_array, &field) + .as_nullable() + .unwrap(); let nulls = string_data.nulls; let string_data = string_data.data.as_variable_width().unwrap(); diff --git a/rust/lance-encoding/src/encodings/physical/bitpack.rs b/rust/lance-encoding/src/encodings/physical/bitpack.rs index e93ff1dc54..e16cf82c7b 100644 --- a/rust/lance-encoding/src/encodings/physical/bitpack.rs +++ b/rust/lance-encoding/src/encodings/physical/bitpack.rs @@ -812,7 +812,8 @@ pub mod test { num_bits: params.num_bits, signed_type: params.signed, }; - let data = DataBlock::from_array(arr); + let field = arrow_schema::Field::new("", data_type.clone(), false); + let data = DataBlock::from_array(arr, &field); let result = encoder.encode(data, &data_type, &mut buffed_index).unwrap(); let data = result.data.as_fixed_width().unwrap(); diff --git a/rust/lance-encoding/src/encodings/physical/dictionary.rs b/rust/lance-encoding/src/encodings/physical/dictionary.rs index 133eeb5857..7cfe66a6c0 100644 --- a/rust/lance-encoding/src/encodings/physical/dictionary.rs +++ b/rust/lance-encoding/src/encodings/physical/dictionary.rs @@ -10,7 +10,7 @@ use arrow_array::types::UInt8Type; use arrow_array::{ make_array, new_null_array, Array, ArrayRef, DictionaryArray, StringArray, UInt8Array, }; -use arrow_schema::DataType; +use arrow_schema::{DataType, Field as ArrowField}; use futures::{future::BoxFuture, FutureExt}; use lance_arrow::DataTypeExt; use lance_core::{Error, Result}; @@ -217,16 +217,19 @@ impl PrimitivePageDecoder for DictionaryPageDecoder { pub struct AlreadyDictionaryEncoder { indices_encoder: Box, items_encoder: Box, + field: ArrowField, } impl AlreadyDictionaryEncoder { pub fn new( indices_encoder: Box, items_encoder: Box, + field: ArrowField, ) -> Self { Self { indices_encoder, items_encoder, + field, } } } @@ -242,7 +245,6 @@ impl ArrayEncoder for AlreadyDictionaryEncoder { panic!("Expected dictionary type"); }; - println!("Before"); let dict_data = match data { DataBlock::Dictionary(dict_data) => dict_data, DataBlock::AllNull(all_null) => { @@ -251,6 +253,8 @@ impl ArrayEncoder for AlreadyDictionaryEncoder { let indices = arrow_cast::cast(&indices, key_type.as_ref()).unwrap(); let indices = indices.into_data(); let values = new_null_array(value_type, 1); + let values_field = ArrowField::new("", value_type.as_ref().clone(), true) + .with_metadata(self.field.metadata().clone()); DictionaryDataBlock { indices: FixedWidthDataBlock { bits_per_value: key_type.byte_width() as u64 * 8, @@ -259,12 +263,11 @@ impl ArrayEncoder for AlreadyDictionaryEncoder { block_info: BlockInfo::new(), used_encoding: UsedEncoding::new(), }, - dictionary: 
Box::new(DataBlock::from_array(values)), + dictionary: Box::new(DataBlock::from_array(values, &values_field)), } } _ => panic!("Expected dictionary data"), }; - println!("After"); let num_dictionary_items = dict_data.dictionary.num_values() as u32; let encoded_indices = self.indices_encoder.encode( @@ -380,8 +383,10 @@ impl ArrayEncoder for DictionaryEncoder { let (index_array, items_array) = encode_dict_indices_and_items(str_data.as_string()); let dict_size = items_array.len() as u32; - let index_data = DataBlock::from(index_array); - let items_data = DataBlock::from(items_array); + let indices_field = ArrowField::new("", index_array.data_type().clone(), true); + let items_field = ArrowField::new("", items_array.data_type().clone(), true); + let index_data = DataBlock::from_array(index_array, &indices_field); + let items_data = DataBlock::from_array(items_array, &items_field); let encoded_indices = self.indices_encoder diff --git a/rust/lance-encoding/src/statistics.rs b/rust/lance-encoding/src/statistics.rs index 91b8bc0a3b..499153c3a5 100644 --- a/rust/lance-encoding/src/statistics.rs +++ b/rust/lance-encoding/src/statistics.rs @@ -312,7 +312,9 @@ mod tests { let arr1 = gen.generate(RowCount::from(3), &mut rng).unwrap(); let arr2 = gen.generate(RowCount::from(3), &mut rng).unwrap(); let arr3 = gen.generate(RowCount::from(3), &mut rng).unwrap(); - let mut block = DataBlock::from_arrays(&[arr1.clone(), arr2.clone(), arr3.clone()], 9); + let field = Field::new("", DataType::Int32, false); + let mut block = + DataBlock::from_arrays(&[arr1.clone(), arr2.clone(), arr3.clone()], 9, &field); let concatenated_array = concat(&[ &*Arc::new(arr1.clone()) as &dyn Array, @@ -345,7 +347,8 @@ mod tests { // test DataType::Binary let mut gen = lance_datagen::array::rand_type(&DataType::Binary); let arr = gen.generate(RowCount::from(3), &mut rng).unwrap(); - let mut block = DataBlock::from_array(arr.clone()); + let field = Field::new("", DataType::Binary, false); + let mut block = DataBlock::from_array(arr.clone(), &field); let data_size_array = block.get_stat(Stat::DataSize).unwrap_or_else(|| { panic!( "A data block of type: {} should have valid {} statistics", @@ -379,9 +382,11 @@ mod tests { ] .into(); - let mut gen = lance_datagen::array::rand_type(&DataType::Struct(fields)); + let struct_type = DataType::Struct(fields); + let mut gen = lance_datagen::array::rand_type(&struct_type); + let struct_field = Field::new("", struct_type, false); let arr = gen.generate(RowCount::from(3), &mut rng).unwrap(); - let mut block = DataBlock::from_array(arr.clone()); + let mut block = DataBlock::from_array(arr.clone(), &struct_field); assert!( block.get_stat(Stat::DataSize).is_none(), "Expected Stat::DataSize to be None for data block of type: {}", @@ -389,12 +394,11 @@ mod tests { ); // test DataType::Dictionary - let mut gen = array::rand_type(&DataType::Dictionary( - Box::new(DataType::Int32), - Box::new(DataType::Utf8), - )); + let dict_type = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)); + let mut gen = array::rand_type(&dict_type); + let dict_field = Field::new("", dict_type, false); let arr = gen.generate(RowCount::from(3), &mut rng).unwrap(); - let mut block = DataBlock::from_array(arr.clone()); + let mut block = DataBlock::from_array(arr.clone(), &dict_field); assert!( block.get_stat(Stat::DataSize).is_none(), "Expected Stat::DataSize to be None for data block of type: {}", @@ -403,7 +407,8 @@ mod tests { let mut gen = array::rand::().with_nulls(&[false, true, false]); let arr 
= gen.generate(RowCount::from(3), &mut rng).unwrap(); - let mut block = DataBlock::from_array(arr.clone()); + let field = Field::new("", DataType::Int32, true); + let mut block = DataBlock::from_array(arr.clone(), &field); assert!( block.get_stat(Stat::DataSize).is_none(), "Expected Stat::DataSize to be None for data block of type: {}", @@ -415,7 +420,8 @@ mod tests { fn test_bit_width_stat_for_integers() { let int8_array = Int8Array::from(vec![1, 2, 3]); let array_ref: ArrayRef = Arc::new(int8_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let int8_field = Field::new("", DataType::Int8, false); + let mut block = DataBlock::from_array(array_ref, &int8_field); let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -430,7 +436,7 @@ mod tests { let int8_array = Int8Array::from(vec![0x1, 0x2, 0x3, 0x7F]); let array_ref: ArrayRef = Arc::new(int8_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let mut block = DataBlock::from_array(array_ref, &int8_field); let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -445,7 +451,7 @@ mod tests { let int8_array = Int8Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]); let array_ref: ArrayRef = Arc::new(int8_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let mut block = DataBlock::from_array(array_ref, &int8_field); let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -460,7 +466,7 @@ mod tests { let int8_array = Int8Array::from(vec![-1, 2, 3]); let array_ref: ArrayRef = Arc::new(int8_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let mut block = DataBlock::from_array(array_ref, &int8_field); let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -475,7 +481,8 @@ mod tests { let int16_array = Int16Array::from(vec![1, 2, 3]); let array_ref: ArrayRef = Arc::new(int16_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let int16_field = Field::new("", DataType::Int16, false); + let mut block = DataBlock::from_array(array_ref, &int16_field); let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -490,7 +497,7 @@ mod tests { let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0x7F]); let array_ref: ArrayRef = Arc::new(int16_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let mut block = DataBlock::from_array(array_ref, &int16_field); let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -505,7 +512,7 @@ mod tests { let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0xFF]); let array_ref: ArrayRef = Arc::new(int16_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let mut block = DataBlock::from_array(array_ref, &int16_field); let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -520,7 +527,7 @@ mod tests { let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0x1FF]); let array_ref: ArrayRef = Arc::new(int16_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let mut block = DataBlock::from_array(array_ref, &int16_field); let expected_bit_width = 
Arc::new(UInt64Array::from(vec![9])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -534,7 +541,7 @@ mod tests { ); let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]); let array_ref: ArrayRef = Arc::new(int16_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let mut block = DataBlock::from_array(array_ref, &int16_field); let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -549,7 +556,7 @@ mod tests { let int16_array = Int16Array::from(vec![-1, 2, 3]); let array_ref: ArrayRef = Arc::new(int16_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let mut block = DataBlock::from_array(array_ref, &int16_field); let expected_bit_width = Arc::new(UInt64Array::from(vec![16])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -563,8 +570,9 @@ mod tests { ); let int32_array = Int32Array::from(vec![1, 2, 3]); + let int32_field = Field::new("", DataType::Int32, false); let array_ref: ArrayRef = Arc::new(int32_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let mut block = DataBlock::from_array(array_ref, &int32_field); let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -579,7 +587,7 @@ mod tests { let int32_array = Int32Array::from(vec![0x1, 0x2, 0x3, 0xFF]); let array_ref: ArrayRef = Arc::new(int32_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let mut block = DataBlock::from_array(array_ref, &int32_field); let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -594,7 +602,7 @@ mod tests { let int32_array = Int32Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]); let array_ref: ArrayRef = Arc::new(int32_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let mut block = DataBlock::from_array(array_ref, &int32_field); let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -609,7 +617,7 @@ mod tests { let int32_array = Int32Array::from(vec![-1, 2, 3]); let array_ref: ArrayRef = Arc::new(int32_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let mut block = DataBlock::from_array(array_ref, &int32_field); let expected_bit_width = Arc::new(UInt64Array::from(vec![32])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -624,7 +632,7 @@ mod tests { let int32_array = Int32Array::from(vec![-1, 2, 3, -88]); let array_ref: ArrayRef = Arc::new(int32_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let mut block = DataBlock::from_array(array_ref, &int32_field); let expected_bit_width = Arc::new(UInt64Array::from(vec![32])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -639,7 +647,8 @@ mod tests { let int64_array = Int64Array::from(vec![1, 2, 3]); let array_ref: ArrayRef = Arc::new(int64_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let int64_field = Field::new("", DataType::Int64, false); + let mut block = DataBlock::from_array(array_ref, &int64_field); let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -654,7 +663,7 @@ mod tests { let int64_array = Int64Array::from(vec![0x1, 0x2, 0x3, 0xFF]); let array_ref: ArrayRef = Arc::new(int64_array.clone()); - let 
mut block = DataBlock::from_array(array_ref); + let mut block = DataBlock::from_array(array_ref, &int64_field); let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -669,7 +678,7 @@ mod tests { let int64_array = Int64Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]); let array_ref: ArrayRef = Arc::new(int64_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let mut block = DataBlock::from_array(array_ref, &int64_field); let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -684,7 +693,7 @@ mod tests { let int64_array = Int64Array::from(vec![-1, 2, 3]); let array_ref: ArrayRef = Arc::new(int64_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let mut block = DataBlock::from_array(array_ref, &int64_field); let expected_bit_width = Arc::new(UInt64Array::from(vec![64])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -699,7 +708,7 @@ mod tests { let int64_array = Int64Array::from(vec![-1, 2, 3, -88]); let array_ref: ArrayRef = Arc::new(int64_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let mut block = DataBlock::from_array(array_ref, &int64_field); let expected_bit_width = Arc::new(UInt64Array::from(vec![64])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -714,7 +723,8 @@ mod tests { let uint8_array = UInt8Array::from(vec![1, 2, 3]); let array_ref: ArrayRef = Arc::new(uint8_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let uint8_field = Field::new("", DataType::UInt8, false); + let mut block = DataBlock::from_array(array_ref, &uint8_field); let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -729,7 +739,7 @@ mod tests { let uint8_array = UInt8Array::from(vec![0x1, 0x2, 0x3, 0x7F]); let array_ref: ArrayRef = Arc::new(uint8_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let mut block = DataBlock::from_array(array_ref, &uint8_field); let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -744,7 +754,7 @@ mod tests { let uint8_array = UInt8Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]); let array_ref: ArrayRef = Arc::new(uint8_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let mut block = DataBlock::from_array(array_ref, &uint8_field); let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -759,7 +769,7 @@ mod tests { let uint8_array = UInt8Array::from(vec![1, 2, 3, 0xF]); let array_ref: ArrayRef = Arc::new(uint8_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let mut block = DataBlock::from_array(array_ref, &uint8_field); let expected_bit_width = Arc::new(UInt64Array::from(vec![4])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -774,7 +784,8 @@ mod tests { let uint16_array = UInt16Array::from(vec![1, 2, 3]); let array_ref: ArrayRef = Arc::new(uint16_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let uint16_field = Field::new("", DataType::UInt16, false); + let mut block = DataBlock::from_array(array_ref, &uint16_field); let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -789,7 +800,7 @@ 
mod tests { let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0x7F]); let array_ref: ArrayRef = Arc::new(uint16_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let mut block = DataBlock::from_array(array_ref, &uint16_field); let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -804,7 +815,7 @@ mod tests { let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0xFF]); let array_ref: ArrayRef = Arc::new(uint16_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let mut block = DataBlock::from_array(array_ref, &uint16_field); let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -819,7 +830,7 @@ mod tests { let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0x1FF]); let array_ref: ArrayRef = Arc::new(uint16_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let mut block = DataBlock::from_array(array_ref, &uint16_field); let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -833,7 +844,7 @@ mod tests { ); let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]); let array_ref: ArrayRef = Arc::new(uint16_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let mut block = DataBlock::from_array(array_ref, &uint16_field); let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -848,7 +859,7 @@ mod tests { let uint16_array = UInt16Array::from(vec![1, 2, 3, 0xFFFF]); let array_ref: ArrayRef = Arc::new(uint16_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let mut block = DataBlock::from_array(array_ref, &uint16_field); let expected_bit_width = Arc::new(UInt64Array::from(vec![16])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -863,7 +874,8 @@ mod tests { let uint32_array = UInt32Array::from(vec![1, 2, 3]); let array_ref: ArrayRef = Arc::new(uint32_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let uint32_field = Field::new("", DataType::UInt32, false); + let mut block = DataBlock::from_array(array_ref, &uint32_field); let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -878,7 +890,7 @@ mod tests { let uint32_array = UInt32Array::from(vec![0x1, 0x2, 0x3, 0xFF]); let array_ref: ArrayRef = Arc::new(uint32_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let mut block = DataBlock::from_array(array_ref, &uint32_field); let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -893,7 +905,7 @@ mod tests { let uint32_array = UInt32Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]); let array_ref: ArrayRef = Arc::new(uint32_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let mut block = DataBlock::from_array(array_ref, &uint32_field); let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -908,7 +920,7 @@ mod tests { let uint32_array = UInt32Array::from(vec![1, 2, 3, 0xF]); let array_ref: ArrayRef = Arc::new(uint32_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let mut block = DataBlock::from_array(array_ref, &uint32_field); let 
expected_bit_width = Arc::new(UInt64Array::from(vec![4])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -923,7 +935,7 @@ mod tests { let uint32_array = UInt32Array::from(vec![1, 2, 3, 0x77]); let array_ref: ArrayRef = Arc::new(uint32_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let mut block = DataBlock::from_array(array_ref, &uint32_field); let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -938,7 +950,8 @@ mod tests { let uint64_array = UInt64Array::from(vec![1, 2, 3]); let array_ref: ArrayRef = Arc::new(uint64_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let uint64_field = Field::new("", DataType::UInt64, false); + let mut block = DataBlock::from_array(array_ref, &uint64_field); let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -953,7 +966,7 @@ mod tests { let uint64_array = UInt64Array::from(vec![0x1, 0x2, 0x3, 0xFF]); let array_ref: ArrayRef = Arc::new(uint64_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let mut block = DataBlock::from_array(array_ref, &uint64_field); let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -968,7 +981,7 @@ mod tests { let uint64_array = UInt64Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]); let array_ref: ArrayRef = Arc::new(uint64_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let mut block = DataBlock::from_array(array_ref, &uint64_field); let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -983,7 +996,7 @@ mod tests { let uint64_array = UInt64Array::from(vec![0, 2, 3, 0xFFFF]); let array_ref: ArrayRef = Arc::new(uint64_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let mut block = DataBlock::from_array(array_ref, &uint64_field); let expected_bit_width = Arc::new(UInt64Array::from(vec![16])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -998,7 +1011,7 @@ mod tests { let uint64_array = UInt64Array::from(vec![1, 2, 3, 0xFFFF_FFFF_FFFF_FFFF]); let array_ref: ArrayRef = Arc::new(uint64_array.clone()); - let mut block = DataBlock::from_array(array_ref); + let mut block = DataBlock::from_array(array_ref, &uint64_field); let expected_bit_width = Arc::new(UInt64Array::from(vec![64])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::BitWidth); @@ -1017,7 +1030,8 @@ mod tests { let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0); let mut gen = lance_datagen::array::rand_type(&DataType::Binary); let arr = gen.generate(RowCount::from(3), &mut rng).unwrap(); - let mut block = DataBlock::from_array(arr.clone()); + let field = Field::new("", DataType::Binary, false); + let mut block = DataBlock::from_array(arr.clone(), &field); assert_eq!( block.get_stat(Stat::BitWidth), None, @@ -1030,11 +1044,8 @@ mod tests { fn test_cardinality_fixed_width_datablock() { let int8_array = Int8Array::from(vec![1, 2, 3]); let array_ref: ArrayRef = Arc::new(int8_array.clone()); - let mut block = DataBlock::from_array(array_ref); - println!( - "block.get_stat(Stat::Cardinality): {:?}", - block.get_stat(Stat::Cardinality) - ); + let field = Field::new("", DataType::Int8, false); + let mut block = DataBlock::from_array(array_ref, &field); let expected_bit_width = 
Arc::new(UInt64Array::from(vec![3])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::Cardinality); @@ -1049,11 +1060,7 @@ mod tests { let int8_array = Int8Array::from(vec![1, 1, 1]); let array_ref: ArrayRef = Arc::new(int8_array.clone()); - let mut block = DataBlock::from_array(array_ref); - println!( - "block.get_stat(Stat::Cardinality): {:?}", - block.get_stat(Stat::Cardinality) - ); + let mut block = DataBlock::from_array(array_ref, &field); let expected_bit_width = Arc::new(UInt64Array::from(vec![1])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::Cardinality); @@ -1068,11 +1075,7 @@ mod tests { let int8_array = Int8Array::from_iter(0..10); let array_ref: ArrayRef = Arc::new(int8_array.clone()); - let mut block = DataBlock::from_array(array_ref); - println!( - "block.get_stat(Stat::Cardinality): {:?}", - block.get_stat(Stat::Cardinality) - ); + let mut block = DataBlock::from_array(array_ref, &field); let expected_bit_width = Arc::new(UInt64Array::from(vec![10])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::Cardinality); @@ -1087,11 +1090,7 @@ mod tests { let int8_array = Int8Array::from_iter(-10..10); let array_ref: ArrayRef = Arc::new(int8_array.clone()); - let mut block = DataBlock::from_array(array_ref); - println!( - "block.get_stat(Stat::Cardinality): {:?}", - block.get_stat(Stat::Cardinality) - ); + let mut block = DataBlock::from_array(array_ref, &field); let expected_bit_width = Arc::new(UInt64Array::from(vec![20])) as ArrayRef; let actual_bit_width = block.get_stat(Stat::Cardinality); @@ -1111,10 +1110,7 @@ mod tests { let mut block = DataBlock::from_arrays( &[array_ref1, array_ref2], (int8_array.len() + int8_array2.len()) as u64, - ); - println!( - "block.get_stat(Stat::Cardinality): {:?}", - block.get_stat(Stat::Cardinality) + &field, ); let expected_bit_width = Arc::new(UInt64Array::from(vec![200])) as ArrayRef; @@ -1132,13 +1128,11 @@ mod tests { let int16_array2 = Int16Array::from_iter(-10..10); let array_ref1: ArrayRef = Arc::new(int16_array.clone()); let array_ref2: ArrayRef = Arc::new(int16_array2.clone()); + let field = Field::new("", DataType::Int16, false); let mut block = DataBlock::from_arrays( &[array_ref1, array_ref2], (int16_array.len() + int8_array2.len()) as u64, - ); - println!( - "block.get_stat(Stat::Cardinality): {:?}", - block.get_stat(Stat::Cardinality) + &field, ); let expected_bit_width = Arc::new(UInt64Array::from(vec![200])) as ArrayRef; @@ -1156,7 +1150,8 @@ mod tests { #[test] fn test_cardinality_variable_width_datablock() { let string_array = StringArray::from(vec![Some("hello"), Some("world")]); - let mut block = DataBlock::from_array(string_array.clone()); + let field = Field::new("", DataType::Utf8, false); + let mut block = DataBlock::from_array(string_array.clone(), &field); let expected_cardinality = Arc::new(UInt64Array::from(vec![2])) as ArrayRef; let actual_cardinality = block.get_stat(Stat::Cardinality); @@ -1173,7 +1168,7 @@ mod tests { Some("to be passed as arguments to procedures"), Some("to be returned as values of procedures"), ]); - let mut block = DataBlock::from_array(string_array.clone()); + let mut block = DataBlock::from_array(string_array.clone(), &field); let expected_cardinality = Arc::new(UInt64Array::from(vec![3])) as ArrayRef; let actual_cardinality = block.get_stat(Stat::Cardinality); @@ -1190,7 +1185,7 @@ mod tests { Some("Saunders Mac Lane"), Some("Samuel Eilenberg"), ]); - let mut block = DataBlock::from_array(string_array.clone()); + let mut block = 
DataBlock::from_array(string_array.clone(), &field); let expected_cardinality = Arc::new(UInt64Array::from(vec![2])) as ArrayRef; let actual_cardinality = block.get_stat(Stat::Cardinality); @@ -1203,7 +1198,8 @@ mod tests { ); let string_array = LargeStringArray::from(vec![Some("hello"), Some("world")]); - let mut block = DataBlock::from_array(string_array.clone()); + let field = Field::new("", DataType::LargeUtf8, false); + let mut block = DataBlock::from_array(string_array.clone(), &field); let expected_cardinality = Arc::new(UInt64Array::from(vec![2])) as ArrayRef; let actual_cardinality = block.get_stat(Stat::Cardinality); @@ -1220,7 +1216,7 @@ mod tests { Some("to be passed as arguments to procedures"), Some("to be returned as values of procedures"), ]); - let mut block = DataBlock::from_array(string_array.clone()); + let mut block = DataBlock::from_array(string_array.clone(), &field); let expected_cardinality = Arc::new(UInt64Array::from(vec![3])) as ArrayRef; let actual_cardinality = block.get_stat(Stat::Cardinality); @@ -1237,7 +1233,7 @@ mod tests { Some("Saunders Mac Lane"), Some("Samuel Eilenberg"), ]); - let mut block = DataBlock::from_array(string_array.clone()); + let mut block = DataBlock::from_array(string_array.clone(), &field); let expected_cardinality = Arc::new(UInt64Array::from(vec![2])) as ArrayRef; let actual_cardinality = block.get_stat(Stat::Cardinality);
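
A minimal sketch of how the pieces above fit together end to end, written as if it lived next to the unit tests in rust/lance-encoding/src/data.rs; the column name and category values are illustrative and not part of this change:

```rust
use std::sync::Arc;

use arrow_array::{Array, DictionaryArray, StringArray, UInt16Array};
use arrow_schema::{DataType, Field};
use lance_arrow::dict_enum::DictionaryEnumType;

use crate::data::DataBlock;

#[test]
fn test_enum_dictionary_sketch() {
    // The declared category set; note that "green" is never referenced by the indices.
    let categories = Arc::new(StringArray::from_iter_values(["red", "green", "blue"]));
    let indices = UInt16Array::from_iter_values([0u16, 2, 2]);
    let dict = Arc::new(DictionaryArray::new(indices, categories));

    // A plain dictionary field for the column (the "color" name is hypothetical) ...
    let field = Field::new(
        "color",
        DataType::Dictionary(Box::new(DataType::UInt16), Box::new(DataType::Utf8)),
        true,
    );
    // ... tagged with the polars.enum extension metadata derived from the array.
    let enum_type = DictionaryEnumType::from_dict_array(dict.as_ref()).unwrap();
    let enum_field = enum_type.wrap_field(&field).unwrap();

    // Because the field carries the enum metadata, the dictionary is passed through
    // unchanged instead of being re-concatenated, so the unused "green" category
    // is still present in the encoded dictionary.
    let block = DataBlock::from_arrays(&[dict.clone()], dict.len() as u64, &enum_field);
    let dict_block = block.as_dictionary().unwrap();
    assert_eq!(dict_block.dictionary.num_values(), 3);
}
```

This mirrors what test_enum_dictionary above asserts at a larger scale: with the enum metadata the written dictionary is exactly the declared category set, whereas the plain categorical path is free to merge and prune dictionaries during concatenation.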